-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathplot_hist.py
More file actions
99 lines (72 loc) · 3.43 KB
/
plot_hist.py
File metadata and controls
99 lines (72 loc) · 3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import csv
import matplotlib.pyplot as plt # type: ignore
from collections import defaultdict
import numpy as np #type: ignore
from scipy.stats import norm #type: ignore
def plot_histograms_by_m(csv_file: str):
    """Save one histogram of estimates per register count found in a CSV file.

    The CSV must have a header row with the columns ``register_count``
    (parsed as ``int``) and ``estimate`` (parsed as ``float``). Rows are
    grouped by register count ``m``. For every group a 30-bin,
    density-normalised histogram is drawn, overlaid with a normal pdf that
    uses the group's mean and population standard deviation, and the figure
    is written to ``hist_m<m>.jpg`` in the current working directory.

    Args:
        csv_file: Path of the CSV file to read.
    """
    estimates_by_m = defaultdict(list)
    # newline='' lets the csv module do its own newline handling.
    with open(csv_file, mode='r', newline='') as file:
        for row in csv.DictReader(file):
            reg_count = int(row['register_count'])
            estimates_by_m[reg_count].append(float(row['estimate']))

    for reg_count, estimates in estimates_by_m.items():
        mean_est = np.mean(estimates)
        std_est = np.std(estimates)

        # A dedicated figure per group: nothing leaks between plots and any
        # figure the caller already has open is left untouched.
        fig, ax = plt.subplots()
        ax.hist(estimates, bins=30, alpha=0.75, density=True,
                color='cornflowerblue', edgecolor='black')

        # With a single (or all-identical) estimate the standard deviation is
        # zero and a normal pdf is undefined, so the overlay is skipped.
        if std_est > 0:
            x = np.linspace(min(estimates), max(estimates), 300)
            normal_dist = norm.pdf(x, mean_est, std_est)
            ax.plot(x, normal_dist, color='red', linewidth=2,
                    label='Gaussian curve')

        ax.set_title(f"Combined Estimation Distribution for m={reg_count}")
        ax.set_xlabel("Estimated Cardinality")
        # density=True normalises the bars to integrate to 1, so the axis
        # shows a probability density rather than raw counts.
        ax.set_ylabel("Density")

        fig.savefig(f"hist_m{reg_count}.jpg")
        plt.close(fig)  # release the figure so memory does not grow per group
# Function to compute sigma and the fraction of estimates within the bounds
def compute_sigma_fractions(csv_file: str, n: int = 1000000) -> np.ndarray:
    """Compute, per register count, the share of estimates near ``n``.

    The CSV must have a header row with the columns ``register_count``
    (parsed as ``int``) and ``estimate`` (parsed as ``float``). For each
    register count ``m`` the function uses ``sigma = 1.04 / sqrt(m)`` and
    measures which fraction of that group's estimates lies inside the closed
    intervals ``[n(1 - sigma), n(1 + sigma)]`` and
    ``[n(1 - 2*sigma), n(1 + 2*sigma)]``.

    Args:
        csv_file: Path of the CSV file to read.
        n: Centre of the intervals (presumably the true cardinality the
            estimates are compared against -- confirm with the data source).

    Returns:
        A float array of shape ``(k, 3)`` with one row
        ``[m, fraction_within_sigma, fraction_within_2sigma]`` per distinct
        register count, in order of first appearance in the file. ``k`` is 0
        when the file contains no data rows.
    """
    estimates_by_m = defaultdict(list)
    # newline='' lets the csv module do its own newline handling.
    with open(csv_file, mode='r', newline='') as file:
        for row in csv.DictReader(file):
            reg_count = int(row['register_count'])
            estimates_by_m[reg_count].append(float(row['estimate']))

    results = []
    for m, estimates in estimates_by_m.items():
        values = np.asarray(estimates, dtype=float)
        sigma = 1.04 / np.sqrt(m)

        fractions = []
        for width in (1, 2):
            lower_bound = n * (1 - width * sigma)
            upper_bound = n * (1 + width * sigma)
            inside = (lower_bound <= values) & (values <= upper_bound)
            # Every group holds at least one estimate, so size is never 0.
            fractions.append(np.count_nonzero(inside) / values.size)

        results.append([m, fractions[0], fractions[1]])

    # reshape keeps the (k, 3) layout even when there are no groups at all.
    return np.array(results, dtype=float).reshape(-1, 3)
# Function to write the results to a LaTeX table
def write_latex_tabular(res: np.ndarray, filename: str):
    """Write the sigma-fraction results as a LaTeX ``tabular`` environment.

    The output has three right-aligned columns: the register count ``m``
    (written as an integer) and the two fractions (written with six decimal
    places), preceded by a header row and an ``\\hline``.

    Args:
        res: Rows of ``[m, fraction_sigma, fraction_2sigma]``. Any array-like
            is accepted (NumPy array or nested sequence); only the first
            three columns of each row are used. An empty input produces a
            table with a header and no data rows.
        filename: Path of the ``.tex`` file to create or overwrite.
    """
    rows = np.asarray(res)

    lines = [
        r'\begin{tabular}{rrr}',
        r'$m$ & $n(1\pm\sigma)$ & $n(1\pm 2\sigma)$',
        r'\\\hline',
    ]
    for row in rows:
        fields = [str(int(row[0])), f'{row[1]:.6f}', f'{row[2]:.6f}']
        lines.append(' & '.join(fields) + r'\\')
    lines.append(r'\end{tabular}')

    # Assemble the whole table first so the file is written in one call.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines) + '\n')
# Run the full process
if __name__ == '__main__':
    # Both steps read the same results file: first the per-m histograms,
    # then the sigma-fraction table rendered as LaTeX.
    source_csv = 'results_for_hist.csv'
    plot_histograms_by_m(source_csv)
    write_latex_tabular(compute_sigma_fractions(source_csv), 'sigma_table.tex')