HyperLogLog/plot_hist.py at main · lens161/HyperLogLog · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import csv
import matplotlib.pyplot as plt # type: ignore
from collections import defaultdict
import numpy as np #type: ignore
from scipy.stats import norm #type: ignore


def plot_histograms_by_m(csv_file: str):
    """Save one histogram image per register count found in ``csv_file``.

    The CSV must have a header row containing ``register_count`` and
    ``estimate`` columns. Rows are grouped by ``register_count``; for every
    group a density-normalised 30-bin histogram of the estimates is drawn,
    overlaid with a normal PDF built from the group's mean and standard
    deviation, and written to ``hist_m<register_count>.jpg`` in the current
    working directory.

    Args:
        csv_file: Path to the CSV file holding the estimation runs.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If a cell cannot be parsed as a number.
    """
    estimates_by_m = defaultdict(list)

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to a reader.
    with open(csv_file, mode='r', newline='') as file:
        for row in csv.DictReader(file):
            reg_count = int(row['register_count'])
            estimates_by_m[reg_count].append(float(row['estimate']))

    for reg_count, estimates in estimates_by_m.items():
        mean_est = np.mean(estimates)
        std_est = np.std(estimates)

        # Use a dedicated figure per register count so nothing is drawn on
        # (or cleared from) a figure the caller may already have open, and
        # close it afterwards so figures do not pile up in memory.
        fig, ax = plt.subplots()
        try:
            ax.hist(estimates, bins=30, alpha=0.75, density=True,
                    color='cornflowerblue', edgecolor='black')

            # A group with no spread (e.g. a single run) has no meaningful
            # Gaussian: norm.pdf with scale=0 only yields NaNs, so skip it.
            if std_est > 0:
                x = np.linspace(min(estimates), max(estimates), 300)
                ax.plot(x, norm.pdf(x, mean_est, std_est),
                        color='red', linewidth=2, label='Gaussian curve')

            ax.set_title(f"Combined Estimation Distribution for m={reg_count}")
            ax.set_xlabel("Estimated Cardinality")
            # NOTE(review): with density=True the y-axis is a probability
            # density rather than a raw count; label kept as-is for now.
            ax.set_ylabel("Frequency")

            fig.savefig(f"hist_m{reg_count}.jpg")
        finally:
            plt.close(fig)


# Function to compute sigma and the fraction of estimates within the bounds
def compute_sigma_fractions(csv_file: str, n: int = 1000000) -> np.ndarray:
    """Measure how often estimates fall within one and two sigma of ``n``.

    The CSV must have a header row containing ``register_count`` and
    ``estimate`` columns. Rows are grouped by ``register_count`` (``m``).
    For each group the relative standard error is taken as
    ``sigma = 1.04 / sqrt(m)`` and the function computes the fraction of
    estimates lying in ``[n(1 - sigma), n(1 + sigma)]`` and in
    ``[n(1 - 2 sigma), n(1 + 2 sigma)]`` (bounds inclusive).

    Args:
        csv_file: Path to the CSV file holding the estimation runs.
        n: The reference cardinality the estimates are compared against.

    Returns:
        A float array of shape ``(k, 3)`` with one row
        ``[m, fraction_within_sigma, fraction_within_2sigma]`` per distinct
        register count, in order of first appearance in the file. When the
        file contains no data rows the result has shape ``(0, 3)``.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If a cell cannot be parsed as a number.
    """
    estimates_by_m = defaultdict(list)

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to a reader.
    with open(csv_file, mode='r', newline='') as file:
        for row in csv.DictReader(file):
            reg_count = int(row['register_count'])
            estimates_by_m[reg_count].append(float(row['estimate']))

    results = []
    for m, estimates in estimates_by_m.items():
        sigma = 1.04 / np.sqrt(m)
        values = np.asarray(estimates, dtype=float)

        fractions = []
        for width in (1, 2):
            lower_bound = n * (1 - width * sigma)
            upper_bound = n * (1 + width * sigma)
            inside = np.count_nonzero((values >= lower_bound) & (values <= upper_bound))
            # Every group holds at least one estimate, so size is never 0.
            fractions.append(inside / values.size)

        results.append([m, *fractions])

    if not results:
        # np.array([]) would be 1-D; keep the documented three-column shape.
        return np.empty((0, 3))
    return np.array(results, dtype=float)

# Function to write the results to a LaTeX table
def write_latex_tabular(res: np.ndarray, filename: str):
    """Write ``res`` to ``filename`` as a three-column LaTeX ``tabular``.

    Each row of ``res`` becomes one table row: column 0 is truncated to an
    integer, columns 1 and 2 are printed with six decimal places. The target
    file is opened in write mode, so existing content is replaced.

    Args:
        res: 2-D array whose rows hold ``[m, fraction, fraction]``.
        filename: Path of the ``.tex`` file to write.
    """
    header_lines = (
        r'\begin{tabular}{rrr}',
        r'$m$ & $n(1\pm\sigma)$ & $n(1\pm 2\sigma)$',
        r'\\\hline',
    )

    with open(filename, 'w') as out:
        for header in header_lines:
            out.write(header + '\n')

        for row in res:
            cells = (
                str(int(row[0])),
                format(row[1], '.6f'),
                format(row[2], '.6f'),
            )
            # A LaTeX row ends with a double backslash.
            out.write(' & '.join(cells) + r'\\' + '\n')

        out.write(r'\end{tabular}' + '\n')

# Run the full process
if __name__ == '__main__':
    # Both the plots and the sigma table are derived from the same results file.
    results_csv = 'results_for_hist.csv'

    # 1) one histogram image per register count
    plot_histograms_by_m(results_csv)

    # 2) coverage fractions per register count, exported as a LaTeX table
    write_latex_tabular(compute_sigma_fractions(results_csv), 'sigma_table.tex')