import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the per-trial results table.
df = pd.read_csv('results.csv')

# Group once by (n, k); the same grouping feeds the trial count and the summary table.
by_point = df.groupby(['n', 'k'])

# Largest row count found for any single (n, k) pair; quoted in the figure titles.
trials_per_point = by_point.size().max()

# One summary row per (n, k): the median of relative_error plus its
# 25th/75th and 1st/99th percentiles.
stats = by_point.agg(
    median_rel_err=pd.NamedAgg(column='relative_error', aggfunc='median'),
    q25_rel_err=pd.NamedAgg(column='relative_error', aggfunc=lambda values: values.quantile(0.25)),
    q75_rel_err=pd.NamedAgg(column='relative_error', aggfunc=lambda values: values.quantile(0.75)),
    q01_rel_err=pd.NamedAgg(column='relative_error', aggfunc=lambda values: values.quantile(0.01)),
    q99_rel_err=pd.NamedAgg(column='relative_error', aggfunc=lambda values: values.quantile(0.99)),
).reset_index()

# Distinct k values in ascending order; each one gets its own figure/panel below.
k_values = sorted(stats['k'].unique())

# One standalone figure per k: median relative error with its interquartile band,
# plotted against n and saved as error_k<k>.png.
for k in k_values:
    per_k = stats[stats['k'] == k].sort_values('n')
    if per_k.empty:
        continue

    n_axis = per_k['n']
    # Scale by 100 so the y-axis reads in percent.
    lower_pct = per_k['q25_rel_err'] * 100
    upper_pct = per_k['q75_rel_err'] * 100
    median_pct = per_k['median_rel_err'] * 100

    fig, ax = plt.subplots(figsize=(10, 6))

    ax.fill_between(n_axis, lower_pct, upper_pct, alpha=0.3, color='steelblue',
                    label='IQR (25th-75th percentile)')
    ax.plot(n_axis, median_pct, 'o-', color='steelblue', markersize=6, linewidth=2,
            label='Median')

    # symlog keeps both signs visible and is linear within +/-0.01 of zero.
    ax.set_xscale('log', base=2)
    ax.set_yscale('symlog', linthresh=0.01)
    ax.set_ylim(-1000, 1000)

    plural = "s" if k > 1 else ""
    ax.set_xlabel('True Number of Records', fontsize=12)
    ax.set_ylabel('Error in Estimate (%)', fontsize=12)
    ax.set_title(f'Count Estimate Error (sampling {k} record{plural})', fontsize=14)
    ax.legend(loc='upper right', fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    # Label each n as a power of two.
    # NOTE(review): int() truncates log2(n), so a label is exact only when n is a power of 2.
    tick_positions = sorted(n_axis.unique())
    tick_labels = [f'$2^{{{int(np.log2(n))}}}$' for n in tick_positions]
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, fontsize=9)

    fig.tight_layout()
    filename = f'error_k{k}.png'
    fig.savefig(filename, dpi=150, bbox_inches='tight')
    print(f"Saved {filename}")
    plt.close(fig)

print(f"Generated {len(k_values)} individual plots")

# Layout for the combined figure: a fixed four-column grid with as many rows
# as are needed to give every k its own panel.
n_plots = len(k_values)
n_cols = 4
n_rows, leftover = divmod(n_plots, n_cols)
if leftover:
    # A partially filled final row still needs a full row of axes.
    n_rows += 1

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3.5 * n_rows))
# Flatten to 1-D so panels can be addressed by their position in k_values.
axes = axes.flatten()

# One colour per k, sampled at evenly spaced points of the tab20 colormap.
colors = plt.cm.tab20(np.linspace(0, 1, n_plots))

# Custom y-axis tick formatter for percentages
def format_pct(val, pos):
    """Format a tick value as a percentage label.

    The ``(val, pos)`` signature matches what ``matplotlib.ticker.FuncFormatter``
    passes to its callback.

    Args:
        val: Tick value, already expressed in percent.
        pos: Tick position supplied by the formatter; unused.

    Returns:
        ``'0%'`` for zero; a whole-number label when ``abs(val) >= 1``; otherwise
        a compact general-format label with at most six significant digits.
    """
    if val == 0:
        return '0%'
    if abs(val) >= 1:
        # ``.0f`` prints integral floats without a decimal point, so a separate
        # integer branch is unnecessary (and int() would raise on infinity).
        return f'{val:.0f}%'
    # ``g`` trims binary floating-point noise such as 0.30000000000000004.
    return f'{val:g}%'

from matplotlib.ticker import FuncFormatter
# Shared y-axis formatter that renders tick values through format_pct.
pct_formatter = FuncFormatter(format_pct)

# Index of the panel whose labelled artists are reused for the figure-level legend.
# Prefer the k=1024 panel. Otherwise fall back to position 10, clamped to the last
# plotted panel so axes[k1024_idx] can never point past the panels that were drawn
# (an unclamped 10 raises IndexError or selects a hidden, empty panel when there
# are fewer than 11 k values).
if 1024 in k_values:
    k1024_idx = k_values.index(1024)
else:
    k1024_idx = max(0, min(10, len(k_values) - 1))

# Fill the grid: one panel per k showing the median, the IQR band and the
# 1st/99th percentile envelope of the relative error (in percent) against n.
for idx, k in enumerate(k_values):
    panel = axes[idx]
    per_k = stats[stats['k'] == k].sort_values('n')
    if per_k.empty:
        continue

    panel_color = colors[idx]
    # Only the legend-source panel labels its artists; the figure legend below
    # collects its handles from that panel alone.
    labelled = idx == k1024_idx
    n_axis = per_k['n']
    envelope_style = dict(color=panel_color, alpha=0.4, linewidth=1)

    # 1st and 99th percentiles drawn as faint dashed lines.
    panel.plot(n_axis, per_k['q01_rel_err'] * 100, '--',
               label='1st-99th pct' if labelled else None, **envelope_style)
    panel.plot(n_axis, per_k['q99_rel_err'] * 100, '--', **envelope_style)
    # Interquartile range drawn as a filled band.
    panel.fill_between(n_axis, per_k['q25_rel_err'] * 100, per_k['q75_rel_err'] * 100,
                       alpha=0.3, color=panel_color, label='IQR' if labelled else None)
    # Median drawn as a marked solid line.
    panel.plot(n_axis, per_k['median_rel_err'] * 100, 'o-', color=panel_color,
               markersize=3, linewidth=1.5, label='Median' if labelled else None)

    panel.set_xscale('log', base=2)
    panel.set_yscale('symlog', linthresh=0.01)
    panel.set_ylim(-10000, 100000)
    panel.set_title(f'k={k}', fontsize=11)
    panel.grid(True, alpha=0.3)
    panel.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    # Decade ticks on both sides of zero, rendered as percentages.
    decade_ticks = [-10000, -1000, -100, -10, -1, -0.1, 0, 0.1, 1, 10, 100, 1000, 10000, 100000]
    panel.set_yticks(decade_ticks)
    panel.yaxis.set_major_formatter(pct_formatter)

    # Keep every other n as an x tick to avoid crowding the small panels.
    # NOTE(review): int() truncates log2(n), so a label is exact only when n is a power of 2.
    sparse_n = sorted(n_axis.unique())[::2]
    panel.set_xticks(sparse_n)
    panel.set_xticklabels([f'$2^{{{int(np.log2(n))}}}$' for n in sparse_n], fontsize=7)

# Hide the grid cells that have no k assigned to them.
for spare_panel in axes[len(k_values):]:
    spare_panel.set_visible(False)

# Build a single figure-level legend from the legend-source panel.
legend_handles, legend_labels = axes[k1024_idx].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='lower right', fontsize=10,
           bbox_to_anchor=(0.99, 0.01))

# Shared axis labels and overall title.
fig.supxlabel('True data size (n)', fontsize=12)
fig.supylabel(r'Relative Error of $\hat{n}$ (%)', fontsize=12)
fig.suptitle(
    f'Percentage Error in Estimate as Data Size Grows (n) by Different Offsets (k)  [{trials_per_point} trials per point]',
    fontsize=14,
    y=1.01,
)

fig.tight_layout()
fig.savefig('error_all_k.png', dpi=150, bbox_inches='tight')
print("Saved error_all_k.png")
plt.close(fig)

# Relative error as a function of k at one fixed n = 1,280,000 (close to 2^20).
# Rows are selected by exact equality, so the figure is empty if that n is absent.
target_n = 1280000
subset_k = stats[stats['n'] == target_n].sort_values('k')

k_axis = subset_k['k']
# Scale by 100 so the y-axis reads in percent.
q01_pct = subset_k['q01_rel_err'] * 100
q99_pct = subset_k['q99_rel_err'] * 100
q25_pct = subset_k['q25_rel_err'] * 100
q75_pct = subset_k['q75_rel_err'] * 100
median_pct = subset_k['median_rel_err'] * 100

fig, ax = plt.subplots(figsize=(10, 6))

# 1st and 99th percentiles drawn as faint dashed lines.
envelope_style = dict(color='steelblue', alpha=0.4, linewidth=1)
ax.plot(k_axis, q01_pct, '--', label='1st-99th percentile', **envelope_style)
ax.plot(k_axis, q99_pct, '--', **envelope_style)
# Interquartile range drawn as a filled band.
ax.fill_between(k_axis, q25_pct, q75_pct, alpha=0.3, color='steelblue',
                label='IQR (25th-75th percentile)')
# Median drawn as a marked solid line.
ax.plot(k_axis, median_pct, 'o-', color='steelblue', markersize=6, linewidth=2,
        label='Median')

ax.set_xscale('log', base=2)
ax.set_yscale('symlog', linthresh=0.01)
ax.set_ylim(-10000, 10000)
ax.set_xlabel('Offset (k)', fontsize=12)
ax.set_ylabel(r'Relative Error of $\hat{n}$ (%)', fontsize=12)
ax.set_title(
    f'Percentage Error in Estimate as k Changes for Constant n (n = {target_n:,})  [{trials_per_point} trials per point]',
    fontsize=14,
)
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# One x tick per k present at this n, labelled with the plain k value.
k_vals = sorted(k_axis.unique())
ax.set_xticks(k_vals)
ax.set_xticklabels([str(value) for value in k_vals], fontsize=9, rotation=45, ha='right')

# Decade ticks on both sides of zero, rendered as percentages.
ax.set_yticks([-10000, -1000, -100, -10, -1, -0.1, 0, 0.1, 1, 10, 100, 1000, 10000])
ax.yaxis.set_major_formatter(pct_formatter)

fig.tight_layout()
fig.savefig('error_vs_k.png', dpi=150, bbox_inches='tight')
print("Saved error_vs_k.png")
plt.close(fig)

# Absolute-error counterpart of the combined grid: same layout and colours,
# with every relative-error statistic multiplied by n.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 3.5 * n_rows))
axes = axes.flatten()

for idx, k in enumerate(k_values):
    panel = axes[idx]
    per_k = stats[stats['k'] == k].sort_values('n')
    if per_k.empty:
        continue

    panel_color = colors[idx]
    # Only the legend-source panel labels its artists; the figure legend below
    # collects its handles from that panel alone.
    labelled = idx == k1024_idx
    n_axis = per_k['n']

    # Absolute error = relative error * n, applied to each summary statistic.
    median_abs = per_k['median_rel_err'] * n_axis
    q25_abs = per_k['q25_rel_err'] * n_axis
    q75_abs = per_k['q75_rel_err'] * n_axis
    q01_abs = per_k['q01_rel_err'] * n_axis
    q99_abs = per_k['q99_rel_err'] * n_axis

    # 1st and 99th percentiles drawn as faint dashed lines.
    envelope_style = dict(color=panel_color, alpha=0.4, linewidth=1)
    panel.plot(n_axis, q01_abs, '--', label='1st-99th pct' if labelled else None,
               **envelope_style)
    panel.plot(n_axis, q99_abs, '--', **envelope_style)
    # Interquartile range drawn as a filled band.
    panel.fill_between(n_axis, q25_abs, q75_abs, alpha=0.3, color=panel_color,
                       label='IQR' if labelled else None)
    # Median drawn as a marked solid line.
    panel.plot(n_axis, median_abs, 'o-', color=panel_color, markersize=3, linewidth=1.5,
               label='Median' if labelled else None)

    panel.set_xscale('log', base=2)
    # Linear within +/-1000 of zero, logarithmic beyond.
    panel.set_yscale('symlog', linthresh=1000)
    panel.set_ylim(-1e8, 1e10)
    panel.set_title(f'k={k}', fontsize=11)
    panel.grid(True, alpha=0.3)
    panel.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

    # Keep every other n as an x tick to avoid crowding the small panels.
    # NOTE(review): int() truncates log2(n), so a label is exact only when n is a power of 2.
    sparse_n = sorted(n_axis.unique())[::2]
    panel.set_xticks(sparse_n)
    panel.set_xticklabels([f'$2^{{{int(np.log2(n))}}}$' for n in sparse_n], fontsize=7)

# Hide the grid cells that have no k assigned to them.
for spare_panel in axes[len(k_values):]:
    spare_panel.set_visible(False)

# Build a single figure-level legend from the legend-source panel.
legend_handles, legend_labels = axes[k1024_idx].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='lower right', fontsize=10,
           bbox_to_anchor=(0.99, 0.01))

# Shared axis labels and overall title.
fig.supxlabel('True data size (n)', fontsize=12)
fig.supylabel(r'Absolute Error of $\hat{n}$', fontsize=12)
fig.suptitle(
    f'Absolute Error in Estimate as Data Size Grows (n) by Different Offsets (k)  [{trials_per_point} trials per point]',
    fontsize=14,
    y=1.01,
)

fig.tight_layout()
fig.savefig('absolute_error_all_k.png', dpi=150, bbox_inches='tight')
print("Saved absolute_error_all_k.png")
plt.close(fig)

# Absolute error as a function of k for the fixed-n slice held in subset_k.
fig, ax = plt.subplots(figsize=(10, 6))

k_axis = subset_k['k']
n_column = subset_k['n']

# Absolute error = relative error * n, applied to each summary statistic.
abs_median_k = subset_k['median_rel_err'] * n_column
abs_q25_k = subset_k['q25_rel_err'] * n_column
abs_q75_k = subset_k['q75_rel_err'] * n_column
abs_q01_k = subset_k['q01_rel_err'] * n_column
abs_q99_k = subset_k['q99_rel_err'] * n_column

# 1st and 99th percentiles drawn as faint dashed lines.
envelope_style = dict(color='steelblue', alpha=0.4, linewidth=1)
ax.plot(k_axis, abs_q01_k, '--', label='1st-99th percentile', **envelope_style)
ax.plot(k_axis, abs_q99_k, '--', **envelope_style)
# Interquartile range drawn as a filled band.
ax.fill_between(k_axis, abs_q25_k, abs_q75_k, alpha=0.3, color='steelblue',
                label='IQR (25th-75th percentile)')
# Median drawn as a marked solid line.
ax.plot(k_axis, abs_median_k, 'o-', color='steelblue', markersize=6, linewidth=2,
        label='Median')

ax.set_xscale('log', base=2)
# Linear within +/-1000 of zero, logarithmic beyond; y limits are left to autoscale.
ax.set_yscale('symlog', linthresh=1000)
ax.set_xlabel('Offset (k)', fontsize=12)
ax.set_ylabel(r'Absolute Error of $\hat{n}$', fontsize=12)
ax.set_title(
    f'Absolute Error in Estimate as k Changes for Constant n (n = {target_n:,})  [{trials_per_point} trials per point]',
    fontsize=14,
)
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# Reuse the k tick positions computed for the relative-error figure.
ax.set_xticks(k_vals)
ax.set_xticklabels([str(value) for value in k_vals], fontsize=9, rotation=45, ha='right')

fig.tight_layout()
fig.savefig('absolute_error_vs_k.png', dpi=150, bbox_inches='tight')
print("Saved absolute_error_vs_k.png")
plt.close(fig)