import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime
# Load the per-month block statistics and put them in chronological order.
csv_path = "quarry-97983-llm-editor-blocks-on-enwiki-with-edit-counts-by-month.csv"
df = pd.read_csv(csv_path)
df['log_month'] = pd.to_datetime(df['log_month'])
df = df.sort_values('log_month')
months = df['log_month']
block_counts = df['block_count'].astype(float)
total_edits = df['total_edits_of_blocked_users'].astype(float)

# Axes alignment.
# The left axis is linear on [0, Lmax]; the right axis is logarithmic on
# [Rmin, Rmax].  Choosing Rmin = Rmax ** (-1 / (Lmax - 1)) places 10**0 on the
# right axis at the same height as 1 on the left axis (both at 1 / Lmax of the
# axis height).
# The max(2, ...) floor guarantees Lmax - 1 >= 1, so the division below is
# always safe; a separate `Lmax <= 1` guard could never trigger and is omitted.
Lmax = max(2, int(math.ceil(block_counts.max() * 1.1)))
Rmax = 5 * 10**4
a = np.log10(Rmax) / (Lmax - 1)
Rmin = 10 ** (-a)

# Fits with covariance.
# x is expressed as matplotlib date numbers, so fitted slopes are per unit of
# that axis (days).
x_num = mdates.date2num(months)

# Linear trend for the block counts; the slope's standard error is the square
# root of the first diagonal entry of the coefficient covariance matrix.
coef_b, cov_b = np.polyfit(x_num, block_counts, 1, cov=True)
slope_b, intercept_b = coef_b
slope_b_se = float(np.sqrt(cov_b[0, 0]))

# Exponential trend for the edit totals: fit a straight line to log(edits).
# Months with no edits are excluded because log(0) is undefined.
valid = total_edits > 0
coef_e, cov_e = np.polyfit(x_num[valid], np.log(total_edits[valid]), 1, cov=True)
slope_e, intercept_e = coef_e
slope_e_se = float(np.sqrt(cov_e[0, 0]))

# Callable polynomials used later to draw the trend lines.
p_blocks = np.poly1d([slope_b, intercept_b])
p_edits = np.poly1d([slope_e, intercept_e])

# Normal quantile for a two-sided 95% confidence interval.
Z = 1.96
def dt_and_ci_from_slope_lin(mean_level, slope, slope_se, z=None):
    """Estimate how long a linear trend takes to rise by ``mean_level``.

    For a straight-line fit the time needed to gain ``mean_level`` is
    ``mean_level / slope``; when ``mean_level`` is the series mean, this is
    the time for the trend to double relative to that mean.  The standard
    error is obtained by first-order propagation of the slope's standard
    error: ``|mean_level| * slope_se / slope**2``.

    Args:
        mean_level: Amount the trend has to gain.
        slope: Fitted slope (level per unit time).
        slope_se: Standard error of ``slope``.
        z: Multiplier applied to the standard error to get the interval
            half-width.  Defaults to the module-level ``Z``.

    Returns:
        A tuple ``(dt, (lower, upper))`` in the slope's time unit.  The lower
        bound is clamped at 0.  If ``slope`` is not positive the trend never
        doubles and ``(inf, (inf, inf))`` is returned.
    """
    if slope <= 0:
        return np.inf, (np.inf, np.inf)
    if z is None:
        z = Z
    dt = mean_level / slope
    dt_se = abs(mean_level) * slope_se / (slope**2)
    half_width = z * dt_se
    # A duration cannot be negative, so the lower bound is floored at zero.
    return dt, (max(dt - half_width, 0), dt + half_width)
def dt_and_ci_from_slope_exp(slope, slope_se, z=None):
    """Estimate the doubling time of an exponential trend.

    ``slope`` is the slope of a straight-line fit to the natural log of the
    series, so the doubling time is ``ln(2) / slope``.  The standard error is
    obtained by first-order propagation of the slope's standard error:
    ``ln(2) * slope_se / slope**2``.

    Args:
        slope: Fitted slope of log(value) per unit time.
        slope_se: Standard error of ``slope``.
        z: Multiplier applied to the standard error to get the interval
            half-width.  Defaults to the module-level ``Z``.

    Returns:
        A tuple ``(dt, (lower, upper))`` in the slope's time unit.  The lower
        bound is clamped at 0.  If ``slope`` is not positive the series is
        not growing and ``(inf, (inf, inf))`` is returned.
    """
    if slope <= 0:
        return np.inf, (np.inf, np.inf)
    if z is None:
        z = Z
    ln2 = np.log(2)
    dt = ln2 / slope
    dt_se = ln2 * slope_se / (slope**2)
    half_width = z * dt_se
    # A duration cannot be negative, so the lower bound is floored at zero.
    return dt, (max(dt - half_width, 0), dt + half_width)
# Doubling-time estimates with their confidence intervals for both trends:
# blocks use the linear fit measured against the mean block count, edits use
# the fit to log(edits).
dt_b, ci_dt_b = dt_and_ci_from_slope_lin(
    mean_level=block_counts.mean(),
    slope=slope_b,
    slope_se=slope_b_se,
)
dt_e, ci_dt_e = dt_and_ci_from_slope_exp(slope=slope_e, slope_se=slope_e_se)
def fmt_ci(dt, ci):
    """Format a doubling time and its 95% interval as legend text.

    Finite values are rounded to whole days.  Non-finite values are rendered
    as symbols instead of being passed to ``int(round(...))``, which raises
    ``OverflowError`` for infinity and ``ValueError`` for NaN; infinity is
    what the doubling-time helpers return for a non-positive slope.

    Args:
        dt: Point estimate in days.
        ci: Two-item sequence ``(lower, upper)`` in days.

    Returns:
        A string of the form ``"<dt> days, 95% CI: [<lower>, <upper>]"``.
    """
    def _days(value):
        # Render one duration; infinity and NaN cannot be converted to int.
        value = float(value)
        if math.isnan(value):
            return "n/a"
        if math.isinf(value):
            return "\u221e" if value > 0 else "-\u221e"
        return str(int(round(value)))

    return f"{_days(dt)} days, 95% CI: [{_days(ci[0])}, {_days(ci[1])}]"
# Plot: two bars per month (block count on the left linear axis, total edits
# on the right log axis), each with its fitted trend line, saved as SVG.
magenta_dark = '#8B008B'
teal_dark = '#008080'
width_days = 10
fig, ax1 = plt.subplots(figsize=(8, 6))

# Shift each series by half a bar width so the pair sits side by side
# around the month's date.
half_bar = pd.Timedelta(days=width_days / 2)

# Left axis: block counts.
ax1.bar(
    months - half_bar,
    block_counts,
    width=width_days,
    color=magenta_dark,
)
ax1.set_ylabel('Block Count', color=magenta_dark)
ax1.tick_params(axis='y', labelcolor=magenta_dark)
ax1.set_ylim(0, Lmax)

# Right axis: total edits on a log scale with one tick per decade.
ax2 = ax1.twinx()
ax2.bar(
    months + half_bar,
    total_edits,
    width=width_days,
    color=teal_dark,
)
ax2.set_ylabel('Total Edits (Log Scale)', color=teal_dark)
ax2.tick_params(axis='y', labelcolor=teal_dark)
ax2.set_yscale('log')
ax2.set_ylim(Rmin, Rmax)
decades = range(0, 5)
ax2.set_yticks([10**k for k in decades])
ax2.set_yticklabels([rf'$10^{k}$' for k in decades])

# Shared x axis: from March 2022 to one month past the last data point.
xmin = mdates.date2num(datetime.datetime(2022, 3, 1))
month_after_last = months.max() + pd.DateOffset(months=1)
xmax = mdates.date2num(month_after_last.to_pydatetime())
ax1.set_xlim(xmin, xmax)
ax1.set_xlabel('Month')
ax1.set_title('English Wikipedia non-IP Accounts Blocked as LLMs')
ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
for tick_label in ax1.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_ha("right")
fig.tight_layout()

# Trend lines over the full visible range; the edits fit was made in log
# space, so it is exponentiated before plotting.
x_line = np.linspace(xmin, xmax, 1000)
line_blocks = ax1.plot(
    x_line, p_blocks(x_line), "--", color=magenta_dark, linewidth=2.5
)[0]
line_edits = ax2.plot(
    x_line, np.exp(p_edits(x_line)), "--", color=teal_dark, linewidth=2.5
)[0]

# Legend: one bar and one trend line per series, anchored to the top-left
# corner of the axes.
handles = [ax1.patches[0], line_blocks, ax2.patches[0], line_edits]
labels = [
    "Block Count",
    f"Blocks double in {fmt_ci(dt_b, ci_dt_b)}",
    "Total Edits",
    f"Edits double in {fmt_ci(dt_e, ci_dt_e)}",
]
fig.legend(
    handles,
    labels,
    loc="upper left",
    bbox_to_anchor=(0, 1),
    bbox_transform=ax1.transAxes,
)
fig.savefig('wikipedia_block_stats_latest_ci.svg', format='svg')