-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_deterministic.py
More file actions
145 lines (112 loc) · 5.22 KB
/
test_deterministic.py
File metadata and controls
145 lines (112 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
"""
Test script to verify deterministic behavior of the stylometric analysis.
"""
import os
import pandas as pd
import numpy as np
from stylometric_analyzer import StylometricAnalyzer
def test_deterministic_analysis():
    """Check that repeated pairwise analyses of the same articles agree.

    Runs ``StylometricAnalyzer.pairwise_analysis`` three times on one fixed
    set of articles, using a fresh analyzer for each run, and compares every
    later result with the first via ``DataFrame.equals``. Progress and the
    verdict are printed; on a mismatch the differing cells are listed.

    Returns:
        bool: True if every run produced a DataFrame equal to the first
        run's, False otherwise.
    """
    # Create test articles
    test_articles = {
        "article1": "This is a test article about technology and innovation. It discusses various aspects of modern computing and artificial intelligence.",
        "article2": "Another test article focusing on science and research. It covers topics related to machine learning and data analysis.",
        "article3": "A third article about business and economics. It examines market trends and financial analysis."
    }

    print("Testing deterministic behavior...")
    print("Running analysis 3 times with the same input...")

    results = []
    for i in range(3):
        print(f"\n--- Run {i+1} ---")

        # A fresh analyzer per run, so no state is carried between runs.
        analyzer = StylometricAnalyzer()
        df = analyzer.pairwise_analysis(test_articles)

        # Store results for comparison
        results.append(df)

        print(f"Run {i+1} completed. Shape: {df.shape}")
        print(f"First row semantic score: {df.iloc[0]['semantic_avg_score']:.6f}")

    # Compare every later run against the first one.
    print("\n=== COMPARISON ===")
    baseline = results[0]
    all_identical = True
    for i, other in enumerate(results[1:], start=1):
        if baseline.equals(other):
            print(f"Results {i} is identical to results 0")
        else:
            all_identical = False
            print(f"Results {i} differs from results 0")

    if all_identical:
        print("\n✓ SUCCESS: All runs produced identical results!")
        print("The analysis is deterministic.")
    else:
        print("\n✗ FAILURE: Results differ between runs!")
        print("The analysis is not deterministic.")

        # Show differences
        for i, other in enumerate(results[1:], start=1):
            print(f"\nDifferences between run 0 and run {i}:")

            # Element-wise `!=` raises ValueError unless both frames carry
            # identical row and column labels, so report that case directly
            # instead of crashing in the middle of the failure report.
            same_labels = (
                baseline.index.equals(other.index)
                and baseline.columns.equals(other.columns)
            )
            if not same_labels:
                print(f" Shape or labels differ: {baseline.shape} vs {other.shape}")
                continue

            # NaN != NaN is True element-wise, whereas `equals` treats NaNs
            # in the same position as equal; mask those cells out so only
            # genuine differences are listed.
            both_missing = baseline.isna() & other.isna()
            diff_mask = (baseline != other) & ~both_missing

            for col in baseline.columns:
                # `equals` also fails on a dtype mismatch even when every
                # value compares equal, so surface that explicitly.
                if baseline[col].dtype != other[col].dtype:
                    print(
                        f" Column '{col}' dtype differs: "
                        f"{baseline[col].dtype} vs {other[col].dtype}"
                    )
                if diff_mask[col].any():
                    print(f" Column '{col}' has differences")
                    for idx in diff_mask[diff_mask[col]].index:
                        print(f" Row {idx}: {baseline.loc[idx, col]} vs {other.loc[idx, col]}")

    return all_identical
def test_clustering_deterministic():
    """Verify that building the similarity heatmap repeatedly yields the same matrix.

    Calls ``create_similarity_heatmap`` three times on one fixed table of
    pairwise scores, reads ``fig.data[0].z`` from each returned figure, and
    compares every later matrix with the first using ``np.array_equal``.
    Progress and the verdict are printed.

    Returns:
        bool: True when all extracted matrices equal the first one,
        False otherwise.
    """
    from heatmap_visualizer import create_similarity_heatmap

    # Fixed input table: one row per article pair.
    score_table = pd.DataFrame({
        'pair': ['article1 vs article2', 'article1 vs article3', 'article2 vs article3'],
        'semantic_avg_score': [0.75, 0.65, 0.80],
        'semantic_median_score': [0.74, 0.64, 0.79],
        'semantic_max_score': [0.85, 0.75, 0.90],
        'semantic_z_score': [1.5, 0.5, 2.0],
        'semantic_baseline_mean': [0.6, 0.6, 0.6],
        'semantic_baseline_std': [0.1, 0.1, 0.1]
    })

    print("\nTesting clustering deterministic behavior...")

    z_matrices = []
    for run_no in (1, 2, 3):
        print(f"\n--- Clustering Run {run_no} ---")

        # Building the heatmap is expected to run the clustering step
        # (that logic lives in heatmap_visualizer, not in this file).
        heatmap = create_similarity_heatmap(score_table, metric='semantic')

        # The z data of the first trace is the matrix being compared.
        matrix = heatmap.data[0].z
        z_matrices.append(matrix)

        print(f"Clustering run {run_no} completed.")
        print(f"Matrix shape: {matrix.shape}")
        print(f"First element: {matrix[0, 0]:.6f}")

    print("\n=== CLUSTERING COMPARISON ===")

    # The first run is the reference; every other run is checked against it.
    reference, *later_runs = z_matrices
    all_identical = True
    for position, candidate in enumerate(later_runs, start=1):
        if np.array_equal(reference, candidate):
            print(f"Clustering results {position} is identical to results 0")
        else:
            all_identical = False
            print(f"Clustering results {position} differs from results 0")

    verdict_lines = (
        (
            "\n✓ SUCCESS: All clustering runs produced identical results!",
            "The clustering is deterministic.",
        )
        if all_identical
        else (
            "\n✗ FAILURE: Clustering results differ between runs!",
            "The clustering is not deterministic.",
        )
    )
    for line in verdict_lines:
        print(line)

    return all_identical
if __name__ == "__main__":
    # Script entry point: run both determinism checks, print a summary, and
    # exit with a non-zero status on failure so shells and CI can detect it.
    print("Deterministic Analysis Test")
    print("=" * 40)

    # Test main analysis
    analysis_deterministic = test_deterministic_analysis()

    # Test clustering
    clustering_deterministic = test_clustering_deterministic()

    print("\n" + "=" * 40)
    print("FINAL RESULTS:")
    print(f"Analysis deterministic: {'✓' if analysis_deterministic else '✗'}")
    print(f"Clustering deterministic: {'✓' if clustering_deterministic else '✗'}")

    if analysis_deterministic and clustering_deterministic:
        print("\n🎉 ALL TESTS PASSED! The analysis is fully deterministic.")
    else:
        print("\n⚠️ Some tests failed. The analysis may not be fully deterministic.")
        # Previously the script always exited with status 0, hiding failures
        # from any caller that only inspects the exit code.
        raise SystemExit(1)