"""Plot count-estimate error statistics read from ``results.csv``.

The CSV must provide the columns ``n``, ``k`` and ``relative_error``.  For
every (n, k) pair the script computes the median and the 1st, 25th, 75th and
99th percentiles of ``relative_error`` and writes these figures to the
current directory:

* ``error_k<k>.png``            one relative-error plot per k
* ``error_all_k.png``           grid of relative-error panels, one per k
* ``error_vs_k.png``            relative error against k for a fixed n
* ``absolute_error_all_k.png``  grid of absolute-error panels, one per k
* ``absolute_error_vs_k.png``   absolute error against k for a fixed n
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter

# Number of panels per row in the grid figures.
GRID_COLS = 4
# k whose panel carries the legend entries shared by a whole grid figure.
LEGEND_K = 1024
# Panel index used for the legend when LEGEND_K is not among the k values.
LEGEND_FALLBACK_IDX = 10
# n held constant in the "error vs k" figures (close to 2^20).
TARGET_N = 1280000

# Y ticks (in percent) for the relative-error figures.
GRID_PCT_TICKS = [-10000, -1000, -100, -10, -1, -0.1, 0,
                  0.1, 1, 10, 100, 1000, 10000, 100000]
VS_K_PCT_TICKS = [-10000, -1000, -100, -10, -1, -0.1, 0,
                  0.1, 1, 10, 100, 1000, 10000]


def format_pct(val, pos):
    """Format a tick value as a percentage label (``FuncFormatter`` callback).

    Args:
        val: Tick value, already expressed in percent.
        pos: Tick position supplied by matplotlib; unused.

    Returns:
        ``'0%'`` for zero, a label without decimals (e.g. ``'-100%'``) when
        ``abs(val) >= 1``, otherwise the plain value followed by ``%``
        (e.g. ``'0.1%'``).
    """
    if val == 0:
        return '0%'
    if abs(val) >= 1:
        if val == int(val):
            return f'{int(val)}%'
        return f'{val:.0f}%'
    return f'{val}%'


def _pow2_labels(values):
    """Return mathtext labels ``$2^{e}$`` with ``e = int(log2(v))`` per value.

    The exponent is truncated, so a label is exact only when the value is a
    power of two.
    """
    return [f'$2^{{{int(np.log2(v))}}}$' for v in values]


def _scaled_bands(rows, absolute):
    """Return the five plotted series for ``rows`` keyed by quantile name.

    Relative errors are multiplied by 100 (percent); absolute errors are the
    relative-error quantiles multiplied row-wise by ``n``.
    """
    factor = rows['n'] if absolute else 100
    return {
        'q01': rows['q01_rel_err'] * factor,
        'q25': rows['q25_rel_err'] * factor,
        'median': rows['median_rel_err'] * factor,
        'q75': rows['q75_rel_err'] * factor,
        'q99': rows['q99_rel_err'] * factor,
    }


def _draw_bands(ax, x, bands, *, color, markersize, linewidth, labels=None):
    """Draw 1st/99th percentile lines, the IQR band and the median on ``ax``.

    Args:
        ax: Axes to draw on.
        x: X coordinates shared by all series.
        bands: Mapping as returned by ``_scaled_bands``.
        color: Colour used for every artist.
        markersize: Marker size of the median line.
        linewidth: Line width of the median line.
        labels: ``(percentile_label, iqr_label, median_label)`` or ``None``
            to leave every artist out of the legend.
    """
    outer_label, iqr_label, median_label = labels or (None, None, None)
    # Only the 1st-percentile line is labelled so the legend shows the
    # dashed style once for both outer lines.
    ax.plot(x, bands['q01'], '--', color=color, alpha=0.4, linewidth=1,
            label=outer_label)
    ax.plot(x, bands['q99'], '--', color=color, alpha=0.4, linewidth=1)
    ax.fill_between(x, bands['q25'], bands['q75'], alpha=0.3, color=color,
                    label=iqr_label)
    ax.plot(x, bands['median'], 'o-', color=color, markersize=markersize,
            linewidth=linewidth, label=median_label)


def _apply_pct_ticks(ax, ticks):
    """Place y ticks at ``ticks`` and label them as percentages."""
    ax.set_yticks(ticks)
    ax.yaxis.set_major_formatter(FuncFormatter(format_pct))


def _legend_panel_index(k_values):
    """Return the index of the panel that supplies the grid legend.

    Prefers the panel of ``LEGEND_K``.  Otherwise falls back to
    ``LEGEND_FALLBACK_IDX``, clamped to the last populated panel so the
    index never refers to a hidden or non-existent subplot.
    """
    if LEGEND_K in k_values:
        return k_values.index(LEGEND_K)
    return min(LEGEND_FALLBACK_IDX, len(k_values) - 1)


def _plot_individual(stats, k_values):
    """Write one ``error_k<k>.png`` relative-error figure per k."""
    for k in k_values:
        subset = stats[stats['k'] == k].sort_values('n')
        if subset.empty:
            continue

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.fill_between(subset['n'],
                        subset['q25_rel_err'] * 100,
                        subset['q75_rel_err'] * 100,
                        alpha=0.3, color='steelblue',
                        label='IQR (25th-75th percentile)')
        ax.plot(subset['n'], subset['median_rel_err'] * 100, 'o-',
                color='steelblue', markersize=6, linewidth=2, label='Median')

        ax.set_xscale('log', base=2)
        ax.set_yscale('symlog', linthresh=0.01)
        ax.set_ylim(-1000, 1000)
        ax.set_xlabel('True Number of Records', fontsize=12)
        ax.set_ylabel('Error in Estimate (%)', fontsize=12)
        ax.set_title(
            f'Count Estimate Error (sampling {k} record{"s" if k > 1 else ""})',
            fontsize=14)
        ax.legend(loc='upper right', fontsize=10)
        ax.grid(True, alpha=0.3)
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

        # Format x-axis ticks as powers of 2
        n_values = sorted(subset['n'].unique())
        ax.set_xticks(n_values)
        ax.set_xticklabels(_pow2_labels(n_values), fontsize=9)

        fig.tight_layout()
        filename = f'error_k{k}.png'
        fig.savefig(filename, dpi=150, bbox_inches='tight')
        print(f"Saved {filename}")
        plt.close(fig)

    print(f"Generated {len(k_values)} individual plots")


def _plot_grid(stats, k_values, colors, legend_idx, *, absolute, linthresh,
               ylim, ylabel, title, filename, pct_ticks=None):
    """Write a grid figure with one error-vs-n panel per k.

    Args:
        stats: Per-(n, k) quantile table.
        k_values: Sorted k values; panel ``i`` shows ``k_values[i]``.
        colors: One colour per entry of ``k_values``.
        legend_idx: Index of the panel whose artists feed the figure legend.
        absolute: Plot absolute errors (relative error times n) instead of
            relative errors in percent.
        linthresh: Linear threshold of the symlog y axis.
        ylim: ``(bottom, top)`` y limits applied to every panel.
        ylabel: Shared y-axis label.
        title: Figure title.
        filename: Output image path.
        pct_ticks: Optional y tick positions, labelled as percentages.
    """
    n_rows = (len(k_values) + GRID_COLS - 1) // GRID_COLS
    # squeeze=False keeps ``axes`` two-dimensional for any grid shape.
    fig, axes = plt.subplots(n_rows, GRID_COLS,
                             figsize=(16, 3.5 * n_rows), squeeze=False)
    axes = axes.flatten()

    for idx, k in enumerate(k_values):
        ax = axes[idx]
        subset = stats[stats['k'] == k].sort_values('n')
        if subset.empty:
            continue

        labels = ('1st-99th pct', 'IQR', 'Median') if idx == legend_idx else None
        _draw_bands(ax, subset['n'], _scaled_bands(subset, absolute),
                    color=colors[idx], markersize=3, linewidth=1.5,
                    labels=labels)

        ax.set_xscale('log', base=2)
        ax.set_yscale('symlog', linthresh=linthresh)
        ax.set_ylim(*ylim)
        ax.set_title(f'k={k}', fontsize=11)
        ax.grid(True, alpha=0.3)
        ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

        if pct_ticks is not None:
            _apply_pct_ticks(ax, pct_ticks)

        # Simplified x-axis ticks: every other n value
        n_values = sorted(subset['n'].unique())
        ax.set_xticks(n_values[::2])
        ax.set_xticklabels(_pow2_labels(n_values[::2]), fontsize=7)

    # Hide unused subplots
    for ax in axes[len(k_values):]:
        ax.set_visible(False)

    # Shared legend built from the single labelled panel
    handles, labels = axes[legend_idx].get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower right', fontsize=10,
               bbox_to_anchor=(0.99, 0.01))

    # Common labels
    fig.supxlabel('True data size (n)', fontsize=12)
    fig.supylabel(ylabel, fontsize=12)
    fig.suptitle(title, fontsize=14, y=1.01)

    fig.tight_layout()
    fig.savefig(filename, dpi=150, bbox_inches='tight')
    print(f"Saved {filename}")
    plt.close(fig)


def _plot_vs_k(subset_k, *, absolute, linthresh, ylabel, title, filename,
               ylim=None, pct_ticks=None):
    """Write a single figure of error against k for one fixed n.

    Args:
        subset_k: Rows of the quantile table for the fixed n, sorted by k.
        absolute: Plot absolute errors instead of relative errors in percent.
        linthresh: Linear threshold of the symlog y axis.
        ylabel: Y-axis label.
        title: Axes title.
        filename: Output image path.
        ylim: Optional ``(bottom, top)`` y limits; autoscaled when ``None``.
        pct_ticks: Optional y tick positions, labelled as percentages.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    _draw_bands(ax, subset_k['k'], _scaled_bands(subset_k, absolute),
                color='steelblue', markersize=6, linewidth=2,
                labels=('1st-99th percentile',
                        'IQR (25th-75th percentile)',
                        'Median'))

    ax.set_xscale('log', base=2)
    ax.set_yscale('symlog', linthresh=linthresh)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel('Offset (k)', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(loc='upper right', fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    # Format x-axis ticks
    k_vals = sorted(subset_k['k'].unique())
    ax.set_xticks(k_vals)
    ax.set_xticklabels([str(k) for k in k_vals], fontsize=9, rotation=45,
                       ha='right')

    if pct_ticks is not None:
        _apply_pct_ticks(ax, pct_ticks)

    fig.tight_layout()
    fig.savefig(filename, dpi=150, bbox_inches='tight')
    print(f"Saved {filename}")
    plt.close(fig)


# Read the data
df = pd.read_csv('results.csv')
if df.empty:
    # Without rows there is no k to plot and the grid would have zero rows.
    raise SystemExit('results.csv contains no rows; nothing to plot')

# Largest number of rows recorded for any (n, k) pair; shown in the titles
trials_per_point = df.groupby(['n', 'k']).size().max()

# Compute statistics per (n, k) pair
stats = df.groupby(['n', 'k']).agg(
    median_rel_err=('relative_error', 'median'),
    q25_rel_err=('relative_error', lambda x: x.quantile(0.25)),
    q75_rel_err=('relative_error', lambda x: x.quantile(0.75)),
    q01_rel_err=('relative_error', lambda x: x.quantile(0.01)),
    q99_rel_err=('relative_error', lambda x: x.quantile(0.99)),
).reset_index()

k_values = sorted(stats['k'].unique())

# Different colors for each grid panel
colors = plt.cm.tab20(np.linspace(0, 1, len(k_values)))
legend_idx = _legend_panel_index(k_values)

# Individual plots
_plot_individual(stats, k_values)

# Combined grid plot (relative error)
_plot_grid(
    stats, k_values, colors, legend_idx,
    absolute=False,
    linthresh=0.01,
    ylim=(-10000, 100000),
    ylabel=r'Relative Error of $\hat{n}$ (%)',
    title=(f'Percentage Error in Estimate as Data Size Grows (n) by Different '
           f'Offsets (k) [{trials_per_point} trials per point]'),
    filename='error_all_k.png',
    pct_ticks=GRID_PCT_TICKS,
)

# Error vs k for fixed n
subset_k = stats[stats['n'] == TARGET_N].sort_values('k')
if subset_k.empty:
    print(f"Warning: no rows with n = {TARGET_N:,}; "
          f"the error-vs-k plots will be empty")

_plot_vs_k(
    subset_k,
    absolute=False,
    linthresh=0.01,
    ylim=(-10000, 10000),
    ylabel=r'Relative Error of $\hat{n}$ (%)',
    title=(f'Percentage Error in Estimate as k Changes for Constant n '
           f'(n = {TARGET_N:,}) [{trials_per_point} trials per point]'),
    filename='error_vs_k.png',
    pct_ticks=VS_K_PCT_TICKS,
)

# Absolute error grid plot
_plot_grid(
    stats, k_values, colors, legend_idx,
    absolute=True,
    linthresh=1000,
    ylim=(-1e8, 1e10),
    ylabel=r'Absolute Error of $\hat{n}$',
    title=(f'Absolute Error in Estimate as Data Size Grows (n) by Different '
           f'Offsets (k) [{trials_per_point} trials per point]'),
    filename='absolute_error_all_k.png',
)

# Absolute error vs k for fixed n (y limits left to autoscale)
_plot_vs_k(
    subset_k,
    absolute=True,
    linthresh=1000,
    ylabel=r'Absolute Error of $\hat{n}$',
    title=(f'Absolute Error in Estimate as k Changes for Constant n '
           f'(n = {TARGET_N:,}) [{trials_per_point} trials per point]'),
    filename='absolute_error_vs_k.png',
)