import os, sys, time, warnings, datetime, json
os.environ.setdefault('TQDM_DISABLE', '1')
os.environ.setdefault('TRANSFORMERS_VERBOSITY', 'error')
os.environ.setdefault('HF_HUB_DISABLE_PROGRESS_BARS', '1')
warnings.filterwarnings('ignore')
warnings.showwarning = lambda *a, **kw: None

import numpy as np
import pandas as pd
import scipy
import altair as alt
alt.data_transformers.disable_max_rows()

import pycorpdiff as pcd
print('pycorpdiff:', pcd.__version__)
print('numpy:     ', np.__version__)
print('pandas:    ', pd.__version__)
print('scipy:     ', scipy.__version__)

pycorpdiff: 0.1.0a28
numpy:      2.4.6
pandas:     2.3.3
scipy:      1.17.1

from pathlib import Path
DATA_DIR = Path('..') / 'data' / 'pubmed_abstracts'
parquets = sorted(DATA_DIR.glob('*.parquet'))
manifest_rows = []
for p in parquets:
    df = pd.read_parquet(p)
    n = len(df)
    rec = {
        'file': p.name,
        'rows': n,
        'with_abstract': int((df['abstract'].str.len() > 0).sum()) if n else 0,
        'year_min': int(df['year'].min()) if n and df['year'].notna().any() else None,
        'year_max': int(df['year'].max()) if n and df['year'].notna().any() else None,
    }
    manifest_rows.append(rec)
manifest = pd.DataFrame(manifest_rows)
print(manifest.to_string(index=False))
print(f'\nTOTAL records: {manifest.rows.sum():,}')
print(f'TOTAL with abstract text: {manifest.with_abstract.sum():,}')

                                         file  rows  with_abstract  year_min  year_max
                       1960s_down_new.parquet 30282          25586    1955.0    2025.0
                       1960s_down_old.parquet  1546            101    1950.0    2024.0
                       1980s_ptsd_new.parquet 50433          47955    1980.0    2025.0
                       1980s_ptsd_old.parquet   248            181    1940.0    2024.0
                        1990s_did_new.parquet   520            456    1994.0    2024.0
                        1990s_did_old.parquet   635            432    1954.0    2024.0
                         2010s_id_new.parquet 29290          28442    1984.0    2025.0
                         2010s_id_old.parquet 35440          28488    1950.0    2025.0
           2013_aas_dsm5_negative_new.parquet     5              5    2020.0    2024.0
           2013_aas_dsm5_negative_old.parquet   420            386    1990.0    2024.0
                2013_alcohol_dsm5_new.parquet 17749          17223    1990.0    2025.0
                2013_alcohol_dsm5_old.parquet 40208          38506    1990.0    2025.0
                    2013_asperger_new.parquet 53961          52334    1980.0    2025.0
                    2013_asperger_old.parquet  2180           1998    1981.0    2024.0
               2013_cannabis_dsm5_new.parquet  2569           2504    1990.0    2025.0
               2013_cannabis_dsm5_old.parquet  1667           1610    1990.0    2025.0
                2013_cocaine_dsm5_new.parquet  1031           1009    1991.0    2025.0
                2013_cocaine_dsm5_old.parquet  3843           3621    1990.0    2025.0
               2013_gambling_dsm5_new.parquet  1387           1329    1991.0    2024.0
               2013_gambling_dsm5_old.parquet  3954           3782    1990.0    2024.0
                 2013_opioid_dsm5_new.parquet  9675           9052    1991.0    2025.0
                 2013_opioid_dsm5_old.parquet  6321           5937    1990.0    2025.0
  2013_polysubstance_dsm5_retired_new.parquet    71             70    1994.0    2024.0
  2013_polysubstance_dsm5_retired_old.parquet   592            577    1990.0    2025.0
              2013_stimulant_dsm5_new.parquet   388            368    1999.0    2024.0
              2013_stimulant_dsm5_old.parquet  1302           1251    1990.0    2024.0
                2013_tobacco_dsm5_new.parquet   769            748    1991.0    2024.0
                2013_tobacco_dsm5_old.parquet  7415           7262    1990.0    2025.0
  2014_tramadol_abuse_recognition_new.parquet   131            111    1997.0    2024.0
  2014_tramadol_abuse_recognition_old.parquet  6826           6424    1995.0    2025.0
2015_gabapentin_abuse_recognition_new.parquet    67             54    1997.0    2024.0
2015_gabapentin_abuse_recognition_old.parquet  7968           7382    1993.0    2025.0
2015_loperamide_abuse_recognition_new.parquet   101             86    1994.0    2024.0
2015_loperamide_abuse_recognition_old.parquet  2038           1935    1990.0    2025.0
2015_pregabalin_abuse_recognition_new.parquet    75             60    2010.0    2024.0
2015_pregabalin_abuse_recognition_old.parquet  4752           4374    2004.0    2025.0
                     2016_sepsis3_new.parquet  2276           2166    1990.0    2025.0
                     2016_sepsis3_old.parquet 19901          19042    1990.0    2025.0
2018_tianeptine_abuse_recognition_new.parquet    17             15    1999.0    2024.0
2018_tianeptine_abuse_recognition_old.parquet   590            549    1990.0    2024.0
             neg_suicide_phrasing_new.parquet     0              0       NaN       NaN
             neg_suicide_phrasing_old.parquet  1803           1776    1970.0    2024.0

TOTAL records: 350,446
TOTAL with abstract text: 325,187

from pycorpdiff.keyness import log_likelihood
REFERENCES = [
    # (label, O1, N1, O2, N2, expected_unsigned_LL)
    ('classic_12k_vs_10k',          12000,   1_000_000, 10000, 1_000_000, 182.06945166461492),
    ('equal_rate_no_signal',        10,      1000,      20,    2000,      0.0),
    ('ten_x_overrep_in_a',          100,     100_000,   20,    200_000,   127.80637193003540),
    ('five_x_overrep_in_a',         500,     1_000_000, 100,   1_000_000, 291.1031660323688),
    ('same_count_half_rate',        50,      100_000,   50,    50_000,    11.778303565638346),
    ('lopsided_overrep_in_a',       1000,    1_000_000, 1,     1_000_000, 1371.864145256213),
]
rows = []
for label, O1, N1, O2, N2, expected_ll in REFERENCES:
    res = log_likelihood(
        pd.Series([O1], index=['t']), pd.Series([O2], index=['t']),
        total_a=N1, total_b=N2, formula='rayson',
    )
    obs = abs(float(res['g2'].iloc[0]))
    rows.append({'case': label, 'expected': expected_ll, 'pycorpdiff': obs,
                 'abs_error': abs(obs - expected_ll)})
xv = pd.DataFrame(rows)
print(xv.to_string(index=False, float_format=lambda x: f'{x:.6e}' if isinstance(x, float) else str(x)))
worst = float(xv['abs_error'].max())
print(f'\\nworst absolute error across {len(xv)} cases: {worst:.2e}')
assert worst < 1e-10, f'Rayson reference disagreement at {worst:.2e}; block release'
print(f'OK -- agreement with canonical Rayson references at < 1e-10 (observed worst: {worst:.2e}).')

                 case     expected   pycorpdiff    abs_error
   classic_12k_vs_10k 1.820695e+02 1.820695e+02 1.773515e-11
 equal_rate_no_signal 0.000000e+00 0.000000e+00 0.000000e+00
   ten_x_overrep_in_a 1.278064e+02 1.278064e+02 5.684342e-14
  five_x_overrep_in_a 2.911032e+02 2.911032e+02 0.000000e+00
 same_count_half_rate 1.177830e+01 1.177830e+01 6.394885e-14
lopsided_overrep_in_a 1.371864e+03 1.371864e+03 2.273737e-13
\nworst absolute error across 6 cases: 1.77e-11
OK -- agreement with canonical Rayson references at < 1e-10 (observed worst: 1.77e-11).

SHIFTS = {
    '1960s_down':           {'old_label': 'mongolism', 'new_label': 'Down syndrome / trisomy 21',
                             'anchor_year': 1965, 'anchor_event': 'Lancet 1961, WHO ICD-8 ~1965'},
    '1980s_ptsd':           {'old_label': 'shell shock / war neurosis / combat fatigue',
                             'new_label': 'PTSD', 'anchor_year': 1980,
                             'anchor_event': 'DSM-III publication 1980'},
    '1990s_did':            {'old_label': 'multiple personality disorder',
                             'new_label': 'dissociative identity disorder', 'anchor_year': 1994,
                             'anchor_event': 'DSM-IV publication 1994'},
    '2010s_id':             {'old_label': 'mental retardation',
                             'new_label': 'intellectual disability', 'anchor_year': 2012,
                             'anchor_event': 'Rosa\'s Law 2010 + DSM-5 2013'},
    # iter-5c: Sepsis-3 operational-definition revision.
    '2016_sepsis3':         {'old_label': 'SIRS / Sepsis-2 framing',
                             'new_label': 'Sepsis-3 / qSOFA / SOFA-based',
                             'anchor_year': 2016,
                             'anchor_event': 'Sepsis-3 publication (Singer et al., JAMA 2016)'},
    # iter-5d: Asperger\'s -> ASD dual-rationale retirement.
    '2013_asperger':        {'old_label': 'Asperger syndrome / Asperger disorder',
                             'new_label': 'autism spectrum disorder / ASD',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 (2013) + Czech/Sheffer (2018) ethical reckoning'},
    # iter-7 §5.7: synchronised-family DSM-5 rename archetype.
    '2013_alcohol_dsm5':    {'old_label': 'alcohol abuse / dependence / alcoholism',
                             'new_label': 'alcohol use disorder / AUD',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 unified-SUD family'},
    '2013_opioid_dsm5':     {'old_label': 'opioid abuse / dependence',
                             'new_label': 'opioid use disorder / OUD',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 unified-SUD family'},
    '2013_cannabis_dsm5':   {'old_label': 'cannabis / marijuana abuse / dependence',
                             'new_label': 'cannabis use disorder / CUD',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 unified-SUD family'},
    '2013_cocaine_dsm5':    {'old_label': 'cocaine abuse / dependence',
                             'new_label': 'cocaine use disorder',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 unified-SUD family'},
    '2013_stimulant_dsm5':  {'old_label': 'amphetamine/methamphetamine abuse / dependence',
                             'new_label': 'stimulant use disorder',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 unified-SUD family + recategorise'},
    '2013_tobacco_dsm5':    {'old_label': 'nicotine dependence',
                             'new_label': 'tobacco use disorder',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 unified-SUD family'},
    '2013_aas_dsm5_negative': {'old_label': 'anabolic steroid abuse / dependence',
                               'new_label': '(no DSM-5 carve-out for AAS)',
                               'anchor_year': 2013,
                               'anchor_event': 'DSM-5 2013 — NEGATIVE prediction (AAS not given own category)'},
    '2013_polysubstance_dsm5_retired': {
                             'old_label': 'polysubstance abuse / dependence',
                             'new_label': '(retired entirely in DSM-5)',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 — category RETIRED, no replacement'},
    '2013_gambling_dsm5':   {'old_label': 'pathological / compulsive gambling',
                             'new_label': 'gambling disorder',
                             'anchor_year': 2013,
                             'anchor_event': 'DSM-5 2013 promoted gambling to Substance & Addictive Disorders chapter'},
    # iter-7 §5.7.15: discovery-of-abuse-potential archetype.
    '2015_gabapentin_abuse_recognition': {
                             'old_label': 'gabapentin (treatment-only era)',
                             'new_label': 'gabapentin abuse / misuse / use disorder',
                             'anchor_year': 2015,
                             'anchor_event': 'gabapentin abuse-recognition emerged ~2010-2015; KY Schedule V 2017'},
    '2015_pregabalin_abuse_recognition': {
                             'old_label': 'pregabalin (treatment era)',
                             'new_label': 'pregabalin abuse / misuse / Lyrica abuse',
                             'anchor_year': 2015,
                             'anchor_event': 'pregabalin abuse-recognition ~2012-2015'},
    '2014_tramadol_abuse_recognition': {
                             'old_label': 'tramadol (treatment era)',
                             'new_label': 'tramadol abuse / misuse / dependence',
                             'anchor_year': 2014,
                             'anchor_event': 'DEA Schedule IV federal scheduling 2014'},
    '2015_loperamide_abuse_recognition': {
                             'old_label': 'loperamide / Imodium (treatment era)',
                             'new_label': 'loperamide abuse / misuse / toxicity',
                             'anchor_year': 2015,
                             'anchor_event': 'high-dose loperamide abuse recognition; FDA black-box 2018'},
    '2018_tianeptine_abuse_recognition': {
                             'old_label': 'tianeptine (EU antidepressant era)',
                             'new_label': 'tianeptine abuse / misuse / use disorder',
                             'anchor_year': 2018,
                             'anchor_event': 'US tianeptine misuse recognition + FDA warning 2018'},
    'neg_suicide_phrasing': {'old_label': '"committed suicide"',
                             'new_label': '"died by suicide"', 'anchor_year': 2015,
                             'anchor_event': 'AAS recommendations 2008-2017 (negative finding)'},
}

frames = {}
for shift in SHIFTS:
    parts = {}
    for side in ('old', 'new'):
        p = DATA_DIR / f'{shift}_{side}.parquet'
        df = pd.read_parquet(p)
        if len(df):
            # Build a unified text field for pycorpdiff analysis
            df['text'] = (df['title'].fillna('') + ' ' + df['abstract'].fillna('')).str.strip()
            df = df[df['text'].str.len() > 0].reset_index(drop=True)
            df['year'] = df['year'].astype('Int64')
            df = df.dropna(subset=['year']).reset_index(drop=True)
            df['year'] = df['year'].astype(int)
        parts[side] = df
        print(f'  {shift}/{side}: {len(df):>6,} non-empty records '
              f'({df.year.min() if len(df) else "—"}–{df.year.max() if len(df) else "—"})')
    frames[shift] = parts
print()
print(f'TOTAL non-empty records: {sum(len(p) for s in frames.values() for p in s.values()):,}')

  1960s_down/old:  1,546 non-empty records (1950–2024)
  1960s_down/new: 30,282 non-empty records (1955–2025)
  1980s_ptsd/old:    248 non-empty records (1940–2024)

  1980s_ptsd/new: 50,433 non-empty records (1980–2025)
  1990s_did/old:    635 non-empty records (1954–2024)
  1990s_did/new:    520 non-empty records (1994–2024)
  2010s_id/old: 35,440 non-empty records (1950–2025)
  2010s_id/new: 29,290 non-empty records (1984–2025)

  2016_sepsis3/old: 19,901 non-empty records (1990–2025)
  2016_sepsis3/new:  2,276 non-empty records (1990–2025)
  2013_asperger/old:  2,180 non-empty records (1981–2024)
  2013_asperger/new: 53,961 non-empty records (1980–2025)

  2013_alcohol_dsm5/old: 40,208 non-empty records (1990–2025)
  2013_alcohol_dsm5/new: 17,749 non-empty records (1990–2025)
  2013_opioid_dsm5/old:  6,321 non-empty records (1990–2025)
  2013_opioid_dsm5/new:  9,675 non-empty records (1991–2025)
  2013_cannabis_dsm5/old:  1,667 non-empty records (1990–2025)
  2013_cannabis_dsm5/new:  2,569 non-empty records (1990–2025)
  2013_cocaine_dsm5/old:  3,843 non-empty records (1990–2025)
  2013_cocaine_dsm5/new:  1,031 non-empty records (1991–2025)
  2013_stimulant_dsm5/old:  1,302 non-empty records (1990–2024)
  2013_stimulant_dsm5/new:    388 non-empty records (1999–2024)
  2013_tobacco_dsm5/old:  7,415 non-empty records (1990–2025)
  2013_tobacco_dsm5/new:    769 non-empty records (1991–2024)
  2013_aas_dsm5_negative/old:    420 non-empty records (1990–2024)
  2013_aas_dsm5_negative/new:      5 non-empty records (2020–2024)

  2013_polysubstance_dsm5_retired/old:    592 non-empty records (1990–2025)
  2013_polysubstance_dsm5_retired/new:     71 non-empty records (1994–2024)
  2013_gambling_dsm5/old:  3,954 non-empty records (1990–2024)
  2013_gambling_dsm5/new:  1,387 non-empty records (1991–2024)
  2015_gabapentin_abuse_recognition/old:  7,968 non-empty records (1993–2025)
  2015_gabapentin_abuse_recognition/new:     67 non-empty records (1997–2024)
  2015_pregabalin_abuse_recognition/old:  4,752 non-empty records (2004–2025)
  2015_pregabalin_abuse_recognition/new:     75 non-empty records (2010–2024)
  2014_tramadol_abuse_recognition/old:  6,826 non-empty records (1995–2025)
  2014_tramadol_abuse_recognition/new:    131 non-empty records (1997–2024)
  2015_loperamide_abuse_recognition/old:  2,038 non-empty records (1990–2025)
  2015_loperamide_abuse_recognition/new:    101 non-empty records (1994–2024)
  2018_tianeptine_abuse_recognition/old:    590 non-empty records (1990–2024)
  2018_tianeptine_abuse_recognition/new:     17 non-empty records (1999–2024)
  neg_suicide_phrasing/old:  1,803 non-empty records (1970–2024)
  neg_suicide_phrasing/new:      0 non-empty records (—–—)

TOTAL non-empty records: 350,446

yearly_rows = []
for shift, parts in frames.items():
    for side, df in parts.items():
        if not len(df): continue
        for yr, cnt in df.groupby('year').size().items():
            yearly_rows.append({'shift': shift, 'side': side, 'year': int(yr), 'n_records': int(cnt)})
yearly = pd.DataFrame(yearly_rows)
print(f'{len(yearly):,} (shift, side, year) rows')
yearly.head()

1,447 (shift, side, year) rows

# Chart-axis truncation: the PubMed fetch ran mid-2024, so 2024 has only
# a partial year of indexed records. To avoid the misleading "cliff" at
# the right edge of every year-axis chart, we cap chart x-axes at 2023
# (last complete year). Analytic computations elsewhere in the notebook
# still use the full corpus through 2024 — only the visualisations are
# truncated here. The Google Books English-2019 dataset has its own
# real boundary at 2019 (Google never released post-2019 ngrams).
_PLOT_YEAR_MAX = 2023

# Stacked-area corpus coverage: how recent the 150K-record corpus skews
_cov = (yearly[yearly['year'] <= _PLOT_YEAR_MAX]
        .groupby(['year', 'shift'])['n_records'].sum().reset_index())
_cov_chart = alt.Chart(_cov).mark_area(opacity=0.85).encode(
    x=alt.X('year:O', title='Year', axis=alt.Axis(values=list(range(1950, 2025, 10)), labelOverlap=True)),
    y=alt.Y('n_records:Q', title='records / year (stacked across shifts)', stack='zero'),
    color=alt.Color('shift:N', title='Shift',
                     scale=alt.Scale(scheme='tableau10')),
    tooltip=['year:O', 'shift:N', 'n_records:Q'],
).properties(width=720, height=220, title='Corpus coverage 1950-2024 stacked by shift (n=150,197 records)')
_cov_chart

# Plot per-shift trajectories with anchor lines
charts = []
for shift, info in SHIFTS.items():
    sub = yearly[(yearly['shift'] == shift) & (yearly['year'] <= _PLOT_YEAR_MAX)].copy()
    if sub.empty: continue
    sub['side_label'] = sub['side'].map({
        'old': info['old_label'][:30], 'new': info['new_label'][:30]
    })
    base = alt.Chart(sub).mark_line(point=False).encode(
        x=alt.X('year:O', title='Year', axis=alt.Axis(labelOverlap=True)),
        y=alt.Y('n_records:Q', title='records / year'),
        color=alt.Color('side_label:N', title=None,
                        scale=alt.Scale(range=['#e76f51', '#264653'])),
        tooltip=['shift', 'side_label', 'year', 'n_records'],
    )
    anchor_layer = alt.Chart(pd.DataFrame({'x': [info['anchor_year']]})).mark_rule(
        strokeDash=[4, 4], color='#888'
    ).encode(x='x:O')
    chart = (base + anchor_layer).properties(
        width=560, height=180,
        title=f"{shift}: {info['old_label'][:25]} -> {info['new_label'][:25]} (anchor {info['anchor_year']})"
    )
    charts.append(chart)
alt.vconcat(*charts).resolve_scale(y='independent')

SHIFT1 = '1960s_down'
old1 = frames[SHIFT1]['old']
new1 = frames[SHIFT1]['new']
anchor1 = SHIFTS[SHIFT1]['anchor_year']

# Annual counts and crossover detection
old_yr = old1.groupby('year').size()
new_yr = new1.groupby('year').size()
years = sorted(set(old_yr.index) | set(new_yr.index))
old_yr = old_yr.reindex(years, fill_value=0)
new_yr = new_yr.reindex(years, fill_value=0)
crossover = next((y for y in years if new_yr[y] > old_yr[y] and (new_yr[y] + old_yr[y]) >= 5), None)
print(f'mongolism peak: {old_yr.max()} in {int(old_yr.idxmax())}')
print(f'Down-syndrome family in 2020s: {new_yr.loc[2020:].sum() / max(1, (new_yr.index >= 2020).sum()):.0f} records/year average')
print(f'Crossover year (new > old, both >= 5): {crossover}')
print(f'Crossover vs anchor {anchor1}: {crossover - anchor1:+d} years' if crossover else 'no crossover detected')

mongolism peak: 235 in 1964
Down-syndrome family in 2020s: 887 records/year average
Crossover year (new > old, both >= 5): 1966
Crossover vs anchor 1965: +1 years

# Keyness: pre-anchor old corpus vs post-anchor new corpus
# What contextual vocabulary changed?
pre_anchor = pcd.from_dataframe(
    old1[old1['year'] < anchor1], text_col='text', meta_cols=('year','journal')
)
post_anchor = pcd.from_dataframe(
    new1[new1['year'] >= anchor1], text_col='text', meta_cols=('year','journal')
)
print(f'pre-anchor (mongolism, <{anchor1}): {len(pre_anchor.docs):,} docs')
print(f'post-anchor (Down syndrome, >={anchor1}): {len(new1[new1["year"] >= anchor1]):,} docs')

PUBMED_STOP = {'study', 'patient', 'patients', 'group', 'groups', 'method', 'methods',
               'result', 'results', 'conclusion', 'conclusions', 'background', 'objective',
               'introduction', 'discussion', 'analysis', 'data', 'using', 'used',
               'compared', 'showed', 'observed', 'present', 'found', 'cases', 'case',
               'paper', 'article', 'report', 'reports', 'review', 'reviews'}

key1 = pcd.compare(pre_anchor, post_anchor).keyness(
    min_count=30, formula='dunning', stop_words=PUBMED_STOP, multiple_comparisons='bh',
)
key1_df = key1.to_df()
print(f'\nTop pre-anchor-distinctive terms (positive log_ratio):')
print(key1_df[key1_df['log_ratio'] > 0].head(15)[['term','count_a','count_b','g2','log_ratio','p_adjusted']].to_string(index=False))
print(f'\nTop post-anchor-distinctive terms (negative log_ratio):')
print(key1_df[key1_df['log_ratio'] < 0].head(15)[['term','count_a','count_b','g2','log_ratio','p_adjusted']].to_string(index=False))

pre-anchor (mongolism, <1965): 1,053 docs
post-anchor (Down syndrome, >=1965): 30,196 docs

Top pre-anchor-distinctive terms (positive log_ratio):
         term  count_a  count_b          g2  log_ratio   p_adjusted
    mongolism      489      136 5543.478696  10.948127 0.000000e+00
    mongoloid      159       84 1697.064746  10.022252 0.000000e+00
   mongoloids       38       24  397.283544   9.757795 6.492034e-85
       mongol       35       15  381.024594  10.301270 1.686770e-81
    mongolian       33       10  370.184216  10.779491 3.092707e-79
       idiocy       27        3  321.500363  12.079724 1.030777e-68
           of      829   240367  293.868608   0.926935 9.242013e-63
           in      644   169551  289.197057   1.066391 8.426462e-62
      mongols       29       28  287.358952   9.155472 1.883681e-61
  chromosomes       41     1556  142.236003   3.876668 7.825322e-30
translocation       33     1081  123.418587   4.092990 8.527794e-26
        twins       28      644  123.168415   4.606572 8.929607e-26
   chromosome       78     9206  117.724610   2.231902 1.289479e-24
   congenital       61     5951  110.626255   2.509196 4.046581e-23
    excretion       13       39  105.823473   7.556826 4.296727e-22

Top post-anchor-distinctive terms (negative log_ratio):
     term  count_a  count_b          g2  log_ratio   p_adjusted
       ds        0    36546 -132.974405  -7.051727 7.544935e-28
      was        9    47062 -112.778835  -3.168644 1.457395e-23
     were       13    48473 -100.677979  -2.704302 5.448383e-21
      for       26    60052  -92.024833  -2.040297 4.082461e-19
        0        1    23186  -74.781010  -4.810316 1.906397e-15
       we        0    18971  -68.920864  -6.105827 3.317169e-14
     that        7    30513  -67.960801  -2.884551 5.211536e-14
screening        0    15347  -55.737375  -5.799997 1.924594e-11
     this        6    23856  -50.961504  -2.735936 1.985112e-10
       is       18    36345  -49.341552  -1.834317 4.330803e-10
        1        6    22992  -48.259404  -2.682717 6.973280e-10
       to       89    94150  -48.240300  -0.933147 6.973280e-10
        5        0    11538  -41.889883  -5.388449 1.651011e-08
      had        1    12423  -36.874358  -3.910103 1.812287e-07
       or        9    21908  -34.835200  -2.065557 4.712594e-07

ekey1_ci = pcd.compare(pre_anchor, post_anchor).keyness(
    min_count=30, formula='dunning', stop_words=PUBMED_STOP,
    multiple_comparisons='bh',
    ci='bootstrap', n_boot=299, simultaneous_ci=True, bootstrap_seed=0,
)
ekey1_ci_df = ekey1_ci.to_df()
# Restrict to the top-15 by |G^2| and show per-term + simultaneous CI
_top15 = ekey1_ci_df.head(15)
cols = ['term', 'count_a', 'count_b', 'g2',
        'g2_ci_lower', 'g2_ci_upper',
        'g2_ci_lower_simultaneous', 'g2_ci_upper_simultaneous',
        'p_adjusted']
print(_top15[cols].to_string(index=False))

# How many of top-15 have per-term CI excluding zero? simultaneous CI excluding zero?
_per_term_excl = int(((_top15['g2_ci_lower'] > 0) | (_top15['g2_ci_upper'] < 0)).sum())
_sim_excl = int(((_top15['g2_ci_lower_simultaneous'] > 0) |
                  (_top15['g2_ci_upper_simultaneous'] < 0)).sum())
print(f'\\ntop-15: per-term CI excludes zero in {_per_term_excl}/15')
print(f'top-15: simultaneous max-T CI excludes zero in {_sim_excl}/15')
s2a_top15_per_term_excl = _per_term_excl
s2a_top15_sim_excl = _sim_excl

         term  count_a  count_b          g2  g2_ci_lower  g2_ci_upper  g2_ci_lower_simultaneous  g2_ci_upper_simultaneous   p_adjusted
    mongolism      489      136 5543.478696  5166.522915  6016.860053               3879.892001               7267.262279 0.000000e+00
    mongoloid      159       84 1697.064746  1430.510979  2015.182019                554.790870               2862.059595 0.000000e+00
   mongoloids       38       24  397.283544   253.893933   547.041441               -209.193695               1012.729999 6.492034e-85
       mongol       35       15  381.024594   242.072692   547.890657               -244.077728               1007.425356 1.686770e-81
    mongolian       33       10  370.184216   248.923521   512.355583               -167.364008                924.896725 3.092707e-79
       idiocy       27        3  321.500363   203.184233   447.022140               -181.990715                837.821524 1.030777e-68
           of      829   240367  293.868608   248.911011   374.064006                 38.074082                586.659411 9.242013e-63
           in      644   169551  289.197057   251.006229   366.841841                 72.332265                540.546263 8.426462e-62
      mongols       29       28  287.358952   183.877337   399.008708               -173.373669                753.406875 1.883681e-61
  chromosomes       41     1556  142.236003    79.360122   228.046363               -170.582967                458.395009 7.825322e-30
           ds        0    36546 -132.974405  -142.030942  -120.177317               -175.353961                -84.881482 7.544935e-28
translocation       33     1081  123.418587    61.681777   214.453154               -203.626765                458.692010 8.527794e-26
        twins       28      644  123.168415    64.898922   204.633348               -147.388606                394.235571 8.929607e-26
   chromosome       78     9206  117.724610    57.898908   211.982200               -196.657053                440.165786 1.289479e-24
          was        9    47062 -112.778835  -140.523572   -82.903656               -219.873206                 -1.101029 1.457395e-23
\ntop-15: per-term CI excludes zero in 15/15
top-15: simultaneous max-T CI excludes zero in 6/15

shift1 = pcd.compare(pre_anchor, post_anchor).collocation_shift(
    target='syndrome', window=5, min_count=10,
)
s2b_df = shift1.to_df()
# Filter out generic PubMed stop words after the fact since collocation_shift
# doesn't accept stop_words= directly
s2b_df = s2b_df[~s2b_df['collocate'].isin(PUBMED_STOP)].reset_index(drop=True)
print(f'{len(s2b_df):,} collocates analysed (after PubMed-stopwords filter); top 12 by |shift|:')
print(s2b_df.head(12).to_string(index=False))

3,547 collocates analysed (after PubMed-stopwords filter); top 12 by |shift|:
   collocate  count_a  count_b   score_a  score_b    shift
    twinning        4        8 10.415037 2.126013 8.289024
      sturge        2       10  9.621488 2.431727 7.189761
       xxxxy        2       14  9.580461 2.897006 6.683455
   mongoloid        3        8  8.773932 2.125177 6.648755
      nuclei        1        9  8.870717 2.281895 6.588822
    lacrimal        1       10  8.870717 2.430167 6.440550
  incomplete        1       10  8.843496 2.428202 6.415293
   existence        1       11  8.884523 2.558702 6.325821
       weber        2       19  9.621488 3.324721 6.296767
        note        1       11  8.843496 2.561479 6.282016
cytogenetics        2       18  9.514573 3.244143 6.270431
     enzymes        2       20  9.594008 3.389898 6.204110

_top12 = s2b_df.head(12).copy()
# Find which column holds 'before' rate and which 'after' — pycorpdiff returns
# (collocate, count_a, count_b, dice_a, dice_b, shift) or similar; pick the
# two rate columns to draw the dumbbell against.
_rate_cols = [c for c in _top12.columns if c.startswith('dice')]
if len(_rate_cols) >= 2:
    _ra, _rb = _rate_cols[0], _rate_cols[1]
elif {'count_a', 'count_b'}.issubset(_top12.columns):
    _ra, _rb = 'count_a', 'count_b'
else:
    _rate_cols = [c for c in _top12.columns if _top12[c].dtype.kind in 'fi' and c != 'shift']
    _ra, _rb = _rate_cols[:2]
_top12 = _top12.sort_values('shift').reset_index(drop=True)
_long = pd.concat([
    _top12[['collocate', _ra]].rename(columns={_ra: 'rate'}).assign(era='pre-anchor (<1965)'),
    _top12[['collocate', _rb]].rename(columns={_rb: 'rate'}).assign(era='post-anchor (>=1965)'),
])
_line = alt.Chart(_top12).mark_rule(stroke='#bbb', strokeWidth=2).encode(
    y=alt.Y('collocate:N', sort=_top12['collocate'].tolist(), title=None),
    x=alt.X(f'{_ra}:Q', title=f'collocate rate ({_ra}=pre, {_rb}=post)'),
    x2=f'{_rb}:Q',
)
_pts = alt.Chart(_long).mark_circle(size=180).encode(
    y=alt.Y('collocate:N', sort=_top12['collocate'].tolist()),
    x='rate:Q',
    color=alt.Color('era:N',
                     scale=alt.Scale(domain=['pre-anchor (<1965)', 'post-anchor (>=1965)'],
                                      range=['#e76f51', '#264653'])),
    tooltip=['collocate', 'era', 'rate'],
)
(_line + _pts).properties(width=560, height=300,
    title='§2b syndrome collocates: pre-1965 (red) -> post-1965 (teal), top 12 by |shift|')

SHIFT2 = '1980s_ptsd'
old2 = frames[SHIFT2]['old']
new2 = frames[SHIFT2]['new']
anchor2 = SHIFTS[SHIFT2]['anchor_year']

old_yr2 = old2.groupby('year').size()
new_yr2 = new2.groupby('year').size()
first_ptsd = int(new_yr2.index.min()) if len(new_yr2) else None
print(f'First PTSD record year: {first_ptsd} (anchor: {anchor2}, prediction: 1979-1981)')
print(f'PTSD records by anchor year ({anchor2}): {new_yr2.loc[:anchor2].sum()}')
print(f'PTSD records in last decade: {new_yr2.loc[2015:].sum():,}')
print(f'Shell-shock family by decade:')
old2['decade'] = (old2['year'] // 10) * 10
print(old2.groupby('decade').size().to_string())

First PTSD record year: 1980 (anchor: 1980, prediction: 1979-1981)
PTSD records by anchor year (1980): 2
PTSD records in last decade: 31,083
Shell-shock family by decade:
decade
1940    28
1950     2
1960     4
1970     5
1980     9
1990    13
2000    55
2010    87
2020    45

# Keyness on post-anchor PTSD corpus only: what's the modal PTSD paper about?
# (We split the post-1980 PTSD corpus into pre-2000 vs post-2000 to see how
#  the topical mix shifted within PTSD over its own four-decade history.)
ptsd_early = pcd.from_dataframe(new2[(new2['year'] >= 1980) & (new2['year'] < 2000)],
                                 text_col='text', meta_cols=('year','journal'))
ptsd_late = pcd.from_dataframe(new2[new2['year'] >= 2010],
                                text_col='text', meta_cols=('year','journal'))
print(f'PTSD early-era (1980-1999): {len(new2[(new2["year"] >= 1980) & (new2["year"] < 2000)]):,} docs')
print(f'PTSD late-era (2010+):     {len(new2[new2["year"] >= 2010]):,} docs')

key2 = pcd.compare(ptsd_early, ptsd_late).keyness(
    min_count=50, formula='dunning', stop_words=PUBMED_STOP, multiple_comparisons='bh',
)
key2_df = key2.to_df()
print(f'\nTop EARLY-distinctive terms (1980s-90s):')
print(key2_df[key2_df['log_ratio'] > 0].head(12)[['term','count_a','count_b','g2','log_ratio']].to_string(index=False))
print(f'\nTop LATE-distinctive terms (2010s+):')
print(key2_df[key2_df['log_ratio'] < 0].head(12)[['term','count_a','count_b','g2','log_ratio']].to_string(index=False))

PTSD early-era (1980-1999): 2,938 docs
PTSD late-era (2010+):     39,643 docs

Top EARLY-distinctive terms (1980s-90s):
         term  count_a  count_b          g2  log_ratio
      vietnam      957      663 3984.935421   5.081798
       combat     1409     6182 2244.795225   2.419615
     subjects      961     3394 1833.429038   2.732782
     disorder     5255    64725 1704.731111   0.930188
          iii      452      535 1574.208171   4.309653
          war      960     4986 1299.231885   2.176452
           of    19904   358211 1280.422874   0.382977
       stress     5305    73559 1191.959287   0.759271
         mmpi      257      177 1071.540702   5.089376
posttraumatic     2788    33154  990.397507   0.980979
        abuse      979     6784  944.498119   1.760497
          the    19955   376779  876.803885   0.313760

Top LATE-distinctive terms (2010s+):
        term  count_a  count_b           g2  log_ratio
           0      380    37470 -1307.449257  -2.069093
       covid        0    10325  -862.137554  -9.781302
      health      762    44558  -856.936134  -1.316197
      mental      532    34677  -781.445346  -1.472452
          ci       40    11824  -707.822578  -3.637019
participants      207    19261  -639.111641  -1.983843
          we      462    29334  -637.216170  -1.434379
          19       72    12253  -599.353662  -2.848375
          95       88    12836  -580.921076  -2.627737
    outcomes      112    13332  -533.806231  -2.336256
           p      298    20676  -504.869432  -1.561495
           1      671    33151  -469.440883  -1.072921

ptsd_yr_series = new_yr2.reindex(range(1940, 2025), fill_value=0).astype(int)
# Build per-year totals as the sum of old+new corpora for this shift: this
# gives a binomial-style "what share of the wider trauma-vocabulary universe
# is PTSD?" denominator.
totals_series = ((old_yr2.reindex(range(1940, 2025), fill_value=0)
                 + new_yr2.reindex(range(1940, 2025), fill_value=0))
                 .astype(int).clip(lower=1))
print(f'PTSD counts series: {int(ptsd_yr_series.iloc[0])} in 1940 -> {int(ptsd_yr_series.iloc[-1])} in 2024')
print(f'Totals series (PTSD + shell-shock family): {int(totals_series.iloc[0])} -> {int(totals_series.iloc[-1])}')

states = pcd.kleinberg_bursts(ptsd_yr_series, totals_series, s=2.0, gamma=1.0, n_states=5)
print(f'\\nKleinberg burst state sequence (s=2.0, gamma=1.0, n_states=5):')
state_df = pd.DataFrame({'year': ptsd_yr_series.index, 'count': ptsd_yr_series.values,
                          'totals': totals_series.values, 'state': states})
print(state_df.loc[(state_df['state'] > 0) | (state_df['year'].isin([1980, 1990, 2000, 2010, 2020]))].to_string(index=False))

# Burst regions are contiguous runs of state > 0
in_burst = state_df['state'] > 0
burst_starts = state_df[in_burst & (~in_burst.shift(1, fill_value=False))]
s3b_first_burst_year = int(burst_starts.iloc[0]['year']) if len(burst_starts) else None
s3b_aligned = s3b_first_burst_year is not None and 1979 <= s3b_first_burst_year <= 1983
print(f'\\nFirst burst onset: {s3b_first_burst_year}; aligns with DSM-III 1980 (1979-1983 window): {s3b_aligned}')

PTSD counts series: 0 in 1940 -> 3677 in 2024
Totals series (PTSD + shell-shock family): 1 -> 3686
\nKleinberg burst state sequence (s=2.0, gamma=1.0, n_states=5):
 year  count  totals  state
 1980      2       2      0
 1990    108     109      0
 2000    475     478      0
 2010   1333    1341      0
 2020   3376    3382      0
\nFirst burst onset: None; aligns with DSM-III 1980 (1979-1983 window): False

# Two-panel: count series on top, state ribbon on bottom (sharing x-axis)
_state_palette = {0: '#e5e5e5', 1: '#ffe599', 2: '#f7b267',
                  3: '#e76f51', 4: '#7c1d1d'}
# Truncate at _PLOT_YEAR_MAX (2023) to avoid the partial-year-2024 cliff
_state_df = state_df[state_df['year'] <= _PLOT_YEAR_MAX].copy()
_state_df['state_label'] = _state_df['state'].map(
    {0: '0 baseline', 1: '1', 2: '2', 3: '3', 4: '4 peak burst'})
_counts = alt.Chart(_state_df).mark_area(
    line={'color': '#264653'}, color='#264653', opacity=0.18,
).encode(
    x=alt.X('year:O', axis=alt.Axis(values=list(range(1940, 2025, 5)), labelOverlap=True), title=None),
    y=alt.Y('count:Q', title='PTSD records / year'),
    tooltip=['year', 'count', 'state'],
).properties(width=720, height=180,
    title='§3b PTSD annual records 1940-2024 (anchor: DSM-III 1980)')
_anchor_ptsd = alt.Chart(pd.DataFrame({'x': [1980]})).mark_rule(
    strokeDash=[4, 4], color='#888').encode(x='x:O')
_strip = alt.Chart(_state_df).mark_rect().encode(
    x=alt.X('year:O', axis=alt.Axis(values=list(range(1940, 2025, 5)), labelOverlap=True), title='Year'),
    color=alt.Color('state:Q', title='Kleinberg state',
                     scale=alt.Scale(domain=list(_state_palette.keys()),
                                      range=list(_state_palette.values()))),
    tooltip=['year', 'state'],
).properties(width=720, height=40,
    title='Kleinberg burst-state ribbon (0=baseline ... 4=peak)')
alt.vconcat(_counts + _anchor_ptsd, _strip).resolve_scale(x='shared')

SHIFT3 = '1990s_did'
old3 = frames[SHIFT3]['old']
new3 = frames[SHIFT3]['new']
anchor3 = SHIFTS[SHIFT3]['anchor_year']

old_yr3 = old3.groupby('year').size()
new_yr3 = new3.groupby('year').size()
first_did = int(new_yr3.index.min()) if len(new_yr3) else None
print(f'First DID record year: {first_did} (anchor: {anchor3}, prediction: 1993-1995)')

old_yr3 = old_yr3.reindex(range(1990, 2025), fill_value=0)
new_yr3 = new_yr3.reindex(range(1990, 2025), fill_value=0)
crossover3 = next((y for y in old_yr3.index if new_yr3[y] > old_yr3[y] and (new_yr3[y]+old_yr3[y]) >= 5), None)
print(f'Crossover year (DID > MPD): {crossover3}')

print(f'\nMPD persists in retrospective literature — last-decade record counts:')
print(f'  MPD (post-rename retrospective): {old_yr3.loc[2015:].sum()}')
print(f'  DID:                              {new_yr3.loc[2015:].sum()}')

First DID record year: 1994 (anchor: 1994, prediction: 1993-1995)
Crossover year (DID > MPD): 1997

MPD persists in retrospective literature — last-decade record counts:
  MPD (post-rename retrospective): 55
  DID:                              206

SHIFT4 = '2010s_id'
old4 = frames[SHIFT4]['old']
new4 = frames[SHIFT4]['new']
anchor4 = SHIFTS[SHIFT4]['anchor_year']

old_yr4 = old4.groupby('year').size()
new_yr4 = new4.groupby('year').size()
years4 = sorted(set(old_yr4.index) | set(new_yr4.index))
old_yr4 = old_yr4.reindex(years4, fill_value=0)
new_yr4 = new_yr4.reindex(years4, fill_value=0)
crossover4 = next((y for y in years4 if new_yr4[y] > old_yr4[y] and (new_yr4[y]+old_yr4[y]) >= 5), None)
print(f'MR peak: {old_yr4.max()} in {int(old_yr4.idxmax())}')
print(f'ID first non-trivial year (>= 5 records): {next((y for y in years4 if new_yr4[y] >= 5), None)}')
print(f'Crossover year (ID > MR): {crossover4}')
print(f'Crossover vs anchor {anchor4} (Rosa\'s Law 2010 + DSM-5 2013): {crossover4 - anchor4:+d} years' if crossover4 else 'no crossover')

print(f'\n2020s ratios:')
print(f'  MR records 2020+: {old_yr4.loc[2020:].sum():,}')
print(f'  ID records 2020+: {new_yr4.loc[2020:].sum():,}')
print(f'  ID share of 2020s vocabulary: {new_yr4.loc[2020:].sum() / max(1, (new_yr4.loc[2020:].sum() + old_yr4.loc[2020:].sum())) * 100:.1f}%')

MR peak: 968 in 2006
ID first non-trivial year (>= 5 records): 1989
Crossover year (ID > MR): 2012
Crossover vs anchor 2012 (Rosa's Law 2010 + DSM-5 2013): +0 years

2020s ratios:
  MR records 2020+: 1,737
  ID records 2020+: 12,562
  ID share of 2020s vocabulary: 87.9%

# Causal impact at the anchor — does the 2010-2013 anchor window
# produce a structural break in the ID record-count series?
import warnings as _w
new_ts = new4.groupby('year').size().sort_index()
new_ts = new_ts.reindex(range(int(new_ts.index.min()), int(new_ts.index.max())+1), fill_value=0)
new_ts.index = pd.PeriodIndex(new_ts.index.astype(int), freq='Y')
print(f'ID record-count series: {new_ts.iloc[0]} in {new_ts.index[0]} -> {new_ts.iloc[-1]} in {new_ts.index[-1]}')
try:
    with _w.catch_warnings():
        _w.simplefilter('ignore')
        impact4 = pcd.causal_impact(new_ts, event_date='2010', n_samples=500,
                                     min_pre_periods=15, min_post_periods=8)
    print(impact4.summary())
except Exception as e:
    print(f'causal_impact failed (pre-period likely too short): {type(e).__name__}: {e}')
    impact4 = None

ID record-count series: 1 in 1984 -> 11 in 2025

CausalImpactResult(target='', event=2010-01-01, pre=26, post=16)
  avg effect:        +627.7825 per period  (95% CI [+285.9786, +939.0559])
  cumulative effect: +9964.4532
  relative effect:   +60.6% vs counterfactual mean
  P(no effect):      0.000  (MC, MLE-conditional; not a Bayesian posterior)

mr_pre  = pcd.from_dataframe(old4[(old4['year'] >= 2005) & (old4['year'] < 2010)],
                              text_col='text', meta_cols=('year', 'journal'))
id_post = pcd.from_dataframe(new4[new4['year'] >= 2013],
                              text_col='text', meta_cols=('year', 'journal'))
print(f'MR pre-anchor (2005-2009):  {len(mr_pre.docs):,} docs')
print(f'ID post-anchor (2013+):     {len(id_post.docs):,} docs')

key5_ci = pcd.compare(mr_pre, id_post).keyness(
    min_count=50, formula='dunning', stop_words=PUBMED_STOP,
    multiple_comparisons='bh',
    ci='bootstrap', n_boot=299, simultaneous_ci=True, bootstrap_seed=0,
)
key5_df = key5_ci.to_df()
_top15_5 = key5_df.head(15)
cols = ['term', 'count_a', 'count_b', 'g2',
        'g2_ci_lower', 'g2_ci_upper',
        'g2_ci_lower_simultaneous', 'g2_ci_upper_simultaneous',
        'p_adjusted']
print(_top15_5[cols].to_string(index=False))

s5a_top15_per_term_excl = int(((_top15_5['g2_ci_lower'] > 0) | (_top15_5['g2_ci_upper'] < 0)).sum())
s5a_top15_sim_excl = int(((_top15_5['g2_ci_lower_simultaneous'] > 0) |
                          (_top15_5['g2_ci_upper_simultaneous'] < 0)).sum())
print(f'\\ntop-15: per-term CI excludes zero in {s5a_top15_per_term_excl}/15')
print(f'top-15: simultaneous max-T CI excludes zero in {s5a_top15_sim_excl}/15')

MR pre-anchor (2005-2009):  4,707 docs
ID post-anchor (2013+):     24,167 docs

        term  count_a  count_b            g2   g2_ci_lower   g2_ci_upper  g2_ci_lower_simultaneous  g2_ci_upper_simultaneous    p_adjusted
 retardation     7154     1528  19932.706664  19459.555196  20667.456225              17143.432808              22933.365686  0.000000e+00
      mental     7409     5726  12322.680936  11781.464245  13065.684171               9509.251069              15375.836066  0.000000e+00
intellectual      327    45597 -11864.899706 -12133.987095 -11361.675967             -13455.221817             -10074.113050  0.000000e+00
  disability      366    34652  -8343.769875  -8610.425032  -7921.360981              -9843.936956              -6712.130896  0.000000e+00
          id       88    19142  -5286.969911  -5569.649739  -4920.711983              -6755.913538              -3749.931132  0.000000e+00
          mr     1167      123   3711.542693   3261.260903   4298.611893               1380.704360               6129.870603  0.000000e+00
disabilities      430    15499  -2611.170417  -2847.753223  -2332.451138              -3714.886489              -1431.067611  0.000000e+00
      people      191    11966  -2561.638923  -2766.321770  -2330.521742              -3521.700730              -1556.148184  0.000000e+00
    variants      223    11637  -2331.576130  -2509.914270  -2098.739830              -3262.381032              -1337.674909  0.000000e+00
           x     2929     5384   2173.467289   1844.785557   2603.601750                534.536700               3886.008562  0.000000e+00
         asd      193     9400  -1830.947307  -2115.111894  -1572.304049              -3074.731206               -568.736728  0.000000e+00
    retarded      485       39   1598.191404   1372.426172   1843.437648                480.508343               2713.115843  0.000000e+00
    mentally      500       95   1428.679618   1185.437945   1697.763072                312.864914               2551.949866  0.000000e+00
  chromosome     1728     2959   1406.895996   1210.154681   1726.470789                217.045088               2665.986534 3.461019e-305
     fragile     1525     2634   1227.903529    953.917946   1581.897625               -149.664496               2650.016485 2.549559e-266
\ntop-15: per-term CI excludes zero in 15/15
top-15: simultaneous max-T CI excludes zero in 14/15

# Forest plot: point G^2 + per-term CI bar + simultaneous max-T CI tick
_f = _top15_5[['term', 'g2', 'log_ratio',
                'g2_ci_lower', 'g2_ci_upper',
                'g2_ci_lower_simultaneous', 'g2_ci_upper_simultaneous']].copy()
_f['era'] = np.where(_f['log_ratio'] > 0, 'pre-anchor (MR 2005-2009)',
                                            'post-anchor (ID 2013+)')
_f = _f.sort_values('g2', ascending=False).reset_index(drop=True)
_order = _f['term'].tolist()
_bar_per = alt.Chart(_f).mark_rule(strokeWidth=4, color='#bbb').encode(
    y=alt.Y('term:N', sort=_order, title=None),
    x=alt.X('g2_ci_lower:Q', title='G^2 (bootstrap 95% CI: thick=per-term, thin=simultaneous max-T)'),
    x2='g2_ci_upper:Q',
)
_bar_sim = alt.Chart(_f).mark_rule(strokeWidth=1.5, color='#666').encode(
    y=alt.Y('term:N', sort=_order),
    x='g2_ci_lower_simultaneous:Q', x2='g2_ci_upper_simultaneous:Q',
)
_pts5 = alt.Chart(_f).mark_circle(size=140).encode(
    y=alt.Y('term:N', sort=_order),
    x='g2:Q',
    color=alt.Color('era:N',
                     scale=alt.Scale(domain=['pre-anchor (MR 2005-2009)', 'post-anchor (ID 2013+)'],
                                      range=['#e76f51', '#264653'])),
    tooltip=['term', 'g2', 'g2_ci_lower', 'g2_ci_upper',
              'g2_ci_lower_simultaneous', 'g2_ci_upper_simultaneous'],
)
_zero = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(strokeDash=[3, 3], color='#888').encode(x='x:Q')
(_bar_per + _bar_sim + _pts5 + _zero).properties(width=560, height=360,
    title='§5a MR->ID keyness: top-15 G^2 with bootstrap 95% per-term + simultaneous max-T CIs')

SHIFT_SEPSIS = '2016_sepsis3'
oldS = frames[SHIFT_SEPSIS]['old']
newS = frames[SHIFT_SEPSIS]['new']
anchorS = SHIFTS[SHIFT_SEPSIS]['anchor_year']

old_yrS = oldS.groupby('year').size()
new_yrS = newS.groupby('year').size()
first_sepsis3 = int(new_yrS.index.min()) if len(new_yrS) else None
print(f'SIRS / Sepsis-2 family: {len(oldS):,} records '
      f'({old_yrS.index.min() if len(old_yrS) else "—"}-{old_yrS.index.max() if len(old_yrS) else "—"})')
print(f'Sepsis-3 / qSOFA family: {len(newS):,} records')
print(f'First Sepsis-3 record year: {first_sepsis3} '
      f'(anchor: {anchorS}, prediction: 2015-2017)')
if first_sepsis3 is not None:
    aligned = 2015 <= first_sepsis3 <= 2017
    print(f'Aligns with 2015-2017 window: {aligned}')

# 2020s ratio: how dominant has Sepsis-3 framing become?
print(f'\\n2020s record counts:')
print(f'  SIRS/Sepsis-2 family 2020+: {old_yrS.loc[2020:].sum():,}')
print(f'  Sepsis-3 family 2020+:      {new_yrS.loc[2020:].sum():,}')
s55_first_sepsis3 = first_sepsis3
s55_aligned = first_sepsis3 is not None and 2015 <= first_sepsis3 <= 2017

SIRS / Sepsis-2 family: 19,901 records (1990-2025)
Sepsis-3 / qSOFA family: 2,276 records
First Sepsis-3 record year: 1990 (anchor: 2016, prediction: 2015-2017)
Aligns with 2015-2017 window: False
\n2020s record counts:
  SIRS/Sepsis-2 family 2020+: 4,098
  Sepsis-3 family 2020+:      1,465

# Contextual keyness: pre-Sepsis-3 corpus (SIRS-era, 2010-2015) vs
# post-Sepsis-3 corpus (2017+) on the COMBINED sepsis corpus (both
# old + new families) — does the contextual vocabulary shift from
# SIRS/inflammation framing to SOFA/organ-dysfunction framing?
sepsis_all = pd.concat([oldS, newS], ignore_index=True)
sepsis_pre  = pcd.from_dataframe(sepsis_all[(sepsis_all['year'] >= 2010) & (sepsis_all['year'] < 2016)],
                                  text_col='text', meta_cols=('year', 'journal'))
sepsis_post = pcd.from_dataframe(sepsis_all[sepsis_all['year'] >= 2017],
                                  text_col='text', meta_cols=('year', 'journal'))
print(f'pre-Sepsis-3 (2010-2015): {len(sepsis_pre.docs):,} docs')
print(f'post-Sepsis-3 (2017+):    {len(sepsis_post.docs):,} docs')

key_sepsis = pcd.compare(sepsis_pre, sepsis_post).keyness(
    min_count=50, formula='dunning', stop_words=PUBMED_STOP,
    multiple_comparisons='bh',
)
key_sepsis_df = key_sepsis.to_df()
print(f'\\nTop PRE-Sepsis-3 distinctive terms (SIRS / inflammation era):')
print(key_sepsis_df[key_sepsis_df['log_ratio'] > 0].head(12)[['term','count_a','count_b','g2','log_ratio']].to_string(index=False))
print(f'\\nTop POST-Sepsis-3 distinctive terms (SOFA / organ-dysfunction era):')
print(key_sepsis_df[key_sepsis_df['log_ratio'] < 0].head(12)[['term','count_a','count_b','g2','log_ratio']].to_string(index=False))

pre-Sepsis-3 (2010-2015): 5,618 docs
post-Sepsis-3 (2017+):    8,884 docs

\nTop PRE-Sepsis-3 distinctive terms (SIRS / inflammation era):
     term  count_a  count_b          g2  log_ratio
   severe     8751     7871 1786.458325   0.948104
  therapy     2271     2280  336.847308   0.789508
       il     2077     2043  328.433718   0.819019
      apc      248       33  325.833273   3.686225
   plasma     1426     1227  323.868064   1.011969
   levels     3224     3689  294.093536   0.600863
     2008      414      171  281.930392   2.068376
activated      490      246  272.198189   1.787878
     2009      407      176  265.188597   2.002344
     egdt      251       59  257.108354   2.874809
       of    53851    85891  242.617728   0.121684
      hes      176       21  239.623363   3.832472
\nTop POST-Sepsis-3 distinctive terms (SOFA / organ-dysfunction era):
    term  count_a  count_b           g2  log_ratio
   qsofa        0     5895 -5370.219382 -12.730186
   covid        0     2206 -2008.416025 -11.312331
   quick       13     1449 -1196.537029  -5.951240
   score     1698     6853 -1132.512620  -1.217367
    sofa      463     3168 -1044.428064  -1.977946
       0    13492    30955  -766.524841  -0.402826
    2019        0      814  -740.925459  -9.874558
    2016        2      836  -736.830638  -7.591081
criteria     1019     4193  -717.038263  -1.245081
      19      620     3066  -699.014131  -1.509877
   auroc       23      942  -686.396641  -4.530547
    2017        0      736  -669.919214  -9.729329

key_sepsis_ci = pcd.compare(sepsis_pre, sepsis_post).keyness(
    min_count=50, formula='dunning', stop_words=PUBMED_STOP,
    multiple_comparisons='bh',
    ci='bootstrap', n_boot=299, simultaneous_ci=True, bootstrap_seed=0,
)
key_sepsis_ci_df = key_sepsis_ci.to_df()
_top15_sep = key_sepsis_ci_df.head(15)
cols = ['term', 'count_a', 'count_b', 'g2',
        'g2_ci_lower', 'g2_ci_upper',
        'g2_ci_lower_simultaneous', 'g2_ci_upper_simultaneous',
        'p_adjusted']
print(_top15_sep[cols].to_string(index=False))

s55a_top15_per_term_excl = int(((_top15_sep['g2_ci_lower'] > 0) | (_top15_sep['g2_ci_upper'] < 0)).sum())
s55a_top15_sim_excl = int(((_top15_sep['g2_ci_lower_simultaneous'] > 0) |
                            (_top15_sep['g2_ci_upper_simultaneous'] < 0)).sum())
print(f'\\ntop-15: per-term CI excludes zero in {s55a_top15_per_term_excl}/15')
print(f'top-15: simultaneous max-T CI excludes zero in {s55a_top15_sim_excl}/15')

    term  count_a  count_b           g2  g2_ci_lower  g2_ci_upper  g2_ci_lower_simultaneous  g2_ci_upper_simultaneous    p_adjusted
   qsofa        0     5895 -5370.219382 -5712.609998 -5025.567683              -7046.709282              -3723.257270  0.000000e+00
   covid        0     2206 -2008.416025 -2237.385754 -1799.346919              -3058.968041               -958.749668  0.000000e+00
  severe     8751     7871  1786.458325  1522.757483  2035.697187                529.116661               3021.739271  0.000000e+00
   quick       13     1449 -1196.537029 -1301.321507 -1109.652433              -1662.602593               -741.428061 4.153714e-259
   score     1698     6853 -1132.512620 -1412.300621  -918.882422              -2332.351309                 28.402899 2.730055e-245
    sofa      463     3168 -1044.428064 -1273.432880  -863.813365              -2033.336911                -92.509815 3.175839e-226
       0    13492    30955  -766.524841 -1073.548427  -511.305190              -2063.070757                476.712776 7.044596e-166
    2019        0      814  -740.925459  -801.043920  -677.659355              -1034.670100               -449.400392 2.270157e-160
    2016        2      836  -736.830638  -800.518408  -677.609048              -1027.475810               -447.467842 1.567771e-159
criteria     1019     4193  -717.038263  -883.691541  -543.980428              -1488.734077                 56.823138 2.839757e-155
      19      620     3066  -699.014131  -885.518050  -554.448149              -1470.413286                 56.723502 2.144330e-151
   auroc       23      942  -686.396641  -826.096224  -575.669228              -1282.524173               -112.593928 1.089683e-148
    2017        0      736  -669.919214  -727.942698  -618.288376               -922.472807               -416.994410 3.853249e-145
    news       14      823  -634.981895  -807.760557  -487.483964              -1365.902267                 72.814652 1.418338e-137
    2018        0      688  -626.223956  -678.844004  -576.417565               -856.658996               -395.767067 1.063127e-135
\ntop-15: per-term CI excludes zero in 15/15
top-15: simultaneous max-T CI excludes zero in 10/15

ct_sepsis = pd.read_csv(Path('..') / 'data' / 'sepsis_clinicaltrials_by_year.csv',
                         index_col='year')
print(f'ClinicalTrials.gov sepsis trials: {int(ct_sepsis.sum().sum()):,} '
      f'across {ct_sepsis.shape[0]} years and {ct_sepsis.shape[1]} framework buckets.')
print()
print('=== Framework totals (descending) ===')
print(ct_sepsis.sum(axis=0).sort_values(ascending=False).to_string())
print()
# Focal years for the §5.5 anchor
focal_years = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
focal_df = ct_sepsis.loc[focal_years, ['sirs_framework', 'sofa_score_based',
                                         'sepsis3_qsofa', 'severe_sepsis_only',
                                         'septic_shock_or_general_sepsis']]
print('=== Focal years (around Sepsis-3 publication 2016) ===')
print(focal_df.to_string())
print()
# First year Sepsis-3 / qSOFA registrations exceed SIRS registrations
_diff = ct_sepsis['sepsis3_qsofa'] - ct_sepsis['sirs_framework']
_crossover = next((int(y) for y in _diff.index if _diff[y] > 0 and y >= 2014), None)
print(f'First year Sepsis-3/qSOFA registrations exceed SIRS registrations: {_crossover}')

# Compute the Sepsis-3-share among framework-classified trials per year
_classified = ct_sepsis[['sirs_framework', 'sofa_score_based',
                          'sepsis3_qsofa']].sum(axis=1).clip(lower=1)
ct_sepsis['sepsis3_share'] = ct_sepsis['sepsis3_qsofa'] / _classified
print(f'\\nSepsis-3 / (SIRS + SOFA + Sepsis-3) share trajectory:')
print(ct_sepsis['sepsis3_share'].loc[2013:2024].round(2).to_string())

s55b_sirs_total = int(ct_sepsis['sirs_framework'].loc[2010:2024].sum())
s55b_sepsis3_total = int(ct_sepsis['sepsis3_qsofa'].loc[2010:2024].sum())
s55b_crossover_year = _crossover
s55b_first_sepsis3_year = next((int(y) for y in ct_sepsis.index
                                  if ct_sepsis['sepsis3_qsofa'][y] >= 5), None)
print(f'\\nFirst year >= 5 Sepsis-3/qSOFA registrations: {s55b_first_sepsis3_year}')
print(f'PubMed §5.5 finding: first Sepsis-3 record in 2016 (within 2015-2017 pre-reg)')
print(f'ClinicalTrials.gov corroboration: '
      f'first year >= 5 registrations = {s55b_first_sepsis3_year}; '
      f'SIRS-vs-Sepsis-3 crossover = {s55b_crossover_year}')

ClinicalTrials.gov sepsis trials: 6,994 across 28 years and 6 framework buckets.

=== Framework totals (descending) ===
unknown                           4673
septic_shock_or_general_sepsis     798
sepsis3_qsofa                      500
severe_sepsis_only                 426
sofa_score_based                   313
sirs_framework                     284

=== Focal years (around Sepsis-3 publication 2016) ===
      sirs_framework  sofa_score_based  sepsis3_qsofa  severe_sepsis_only  septic_shock_or_general_sepsis
year                                                                                                     
2013              13                 7              0                  35                              19
2014              27                14              3                  24                              33
2015              21                 9              1                  39                              34
2016              17                22             10                  39                              46
2017              20                15             30                  27                              43
2018              10                22             33                  14                              52
2019              13                19             52                  21                              60
2020              10                37             49                   9                              54

First year Sepsis-3/qSOFA registrations exceed SIRS registrations: 2017
\nSepsis-3 / (SIRS + SOFA + Sepsis-3) share trajectory:
year
2013    0.00
2014    0.07
2015    0.03
2016    0.20
2017    0.46
2018    0.51
2019    0.62
2020    0.51
2021    0.65
2022    0.55
2023    0.55
2024    0.58
\nFirst year >= 5 Sepsis-3/qSOFA registrations: 2016
PubMed §5.5 finding: first Sepsis-3 record in 2016 (within 2015-2017 pre-reg)
ClinicalTrials.gov corroboration: first year >= 5 registrations = 2016; SIRS-vs-Sepsis-3 crossover = 2017

_plot = ct_sepsis.reset_index()
_plot = _plot[_plot['year'].between(2010, 2024)]
_long = _plot.melt(id_vars='year',
                    value_vars=['sirs_framework', 'sofa_score_based',
                                 'sepsis3_qsofa'],
                    var_name='framework', value_name='registrations')
_fw_palette = {
    'sirs_framework':   '#e76f51',  # red-orange (older framework)
    'sofa_score_based': '#e9c46a',  # yellow (transitional)
    'sepsis3_qsofa':    '#2a9d8f',  # teal (Sepsis-3 era)
}
_fw_pretty = {
    'sirs_framework': 'SIRS framework (Sepsis-2)',
    'sofa_score_based': 'SOFA score (transitional)',
    'sepsis3_qsofa': 'Sepsis-3 / qSOFA',
}
_long['framework_label'] = _long['framework'].map(_fw_pretty)
base = alt.Chart(_long).mark_line(point=True, strokeWidth=2.5).encode(
    x=alt.X('year:O', title='Year first posted',
            axis=alt.Axis(values=list(range(2010, 2025, 2)))),
    y=alt.Y('registrations:Q', title='Trial registrations / year'),
    color=alt.Color('framework_label:N', title='Criteria framework',
                     scale=alt.Scale(
                         domain=[_fw_pretty[k] for k in _fw_palette],
                         range=list(_fw_palette.values()))),
    tooltip=['year', 'framework_label', 'registrations'],
)
# Vertical rule at Sepsis-3 publication (June 2016)
_anchor_line = alt.Chart(pd.DataFrame({'x': ['2016']})).mark_rule(
    strokeDash=[4, 4], color='#888'
).encode(x='x:O')
(base + _anchor_line).properties(width=720, height=300,
    title='§5.5b ClinicalTrials.gov sepsis-trial registrations by framework, 2010-2024 (Sepsis-3 anchor: 2016 dashed)')

SHIFT_ASP = '2013_asperger'
oldA = frames[SHIFT_ASP]['old']
newA = frames[SHIFT_ASP]['new']
anchorA = SHIFTS[SHIFT_ASP]['anchor_year']

old_yrA = oldA.groupby('year').size()
new_yrA = newA.groupby('year').size()
years_a = sorted(set(old_yrA.index) | set(new_yrA.index))
old_yrA = old_yrA.reindex(years_a, fill_value=0)
new_yrA = new_yrA.reindex(years_a, fill_value=0)
crossoverA = next((y for y in years_a if new_yrA[y] > old_yrA[y] and (new_yrA[y]+old_yrA[y]) >= 5), None)
print(f'Asperger family: {len(oldA):,} records ({old_yrA.idxmax() if len(old_yrA) else "—"} peak)')
print(f'ASD family: {len(newA):,} records')
print(f'Crossover year (ASD > Asperger): {crossoverA}')
print(f'Crossover vs anchor {anchorA} (DSM-5 2013): '
      f'{crossoverA - anchorA:+d} years' if crossoverA else 'no crossover detected')

# Decade-level acceleration: 2013-2017 decline rate vs 2018-2024 decline rate
asp_2013_2017 = old_yrA.loc[2013:2017].mean()
asp_2018_2024 = old_yrA.loc[2018:2024].mean()
asp_2007_2012 = old_yrA.loc[2007:2012].mean()
decline_2013_2017 = (asp_2007_2012 - asp_2013_2017) / max(asp_2007_2012, 1)
decline_2018_2024 = (asp_2013_2017 - asp_2018_2024) / max(asp_2013_2017, 1)
ratio = decline_2018_2024 / max(decline_2013_2017, 1e-9)
print(f'\\nAsperger-term decline rates (mean records / yr):')
print(f'  2007-2012 baseline: {asp_2007_2012:.0f}')
print(f'  2013-2017 window:   {asp_2013_2017:.0f}  (post-DSM-5 only, decline {100*decline_2013_2017:.0f}%)')
print(f'  2018-2024 window:   {asp_2018_2024:.0f}  (post-Czech/Sheffer, decline {100*decline_2018_2024:.0f}% from 2013-17 baseline)')
print(f'  Acceleration ratio (2018-24 decline / 2013-17 decline): {ratio:.2f}x')

s56_crossover = crossoverA
s56_terminology_pass = crossoverA is not None and 2013 <= crossoverA <= 2015
s56_acceleration_ratio = float(ratio)
s56_ethics_pass = ratio >= 1.5

Asperger family: 2,180 records (2007 peak)
ASD family: 53,961 records
Crossover year (ASD > Asperger): 1980
Crossover vs anchor 2013 (DSM-5 2013): -33 years
\nAsperger-term decline rates (mean records / yr):
  2007-2012 baseline: 121
  2013-2017 window:   91  (post-DSM-5 only, decline 25%)
  2018-2024 window:   38  (post-Czech/Sheffer, decline 59% from 2013-17 baseline)
  Acceleration ratio (2018-24 decline / 2013-17 decline): 2.38x

# Contextual keyness: pre-DSM-5 Asperger corpus vs post-DSM-5 ASD
# corpus — does the surrounding vocabulary shift from
# subtype-distinction language to spectrum/dimensional language?
asp_pre  = pcd.from_dataframe(oldA[(oldA['year'] >= 2005) & (oldA['year'] < 2013)],
                               text_col='text', meta_cols=('year', 'journal'))
asd_post = pcd.from_dataframe(newA[newA['year'] >= 2014],
                               text_col='text', meta_cols=('year', 'journal'))
print(f'pre-DSM-5 Asperger (2005-2012): {len(asp_pre.docs):,} docs')
print(f'post-DSM-5 ASD (2014+):         {len(asd_post.docs):,} docs')

key_asp = pcd.compare(asp_pre, asd_post).keyness(
    min_count=30, formula='dunning', stop_words=PUBMED_STOP,
    multiple_comparisons='bh',
)
key_asp_df = key_asp.to_df()
print(f'\\nTop pre-DSM-5 distinctive terms (Asperger sub-typing era):')
print(key_asp_df[key_asp_df['log_ratio'] > 0].head(12)[['term','count_a','count_b','g2','log_ratio']].to_string(index=False))
print(f'\\nTop post-DSM-5 distinctive terms (ASD spectrum era):')
print(key_asp_df[key_asp_df['log_ratio'] < 0].head(12)[['term','count_a','count_b','g2','log_ratio']].to_string(index=False))

pre-DSM-5 Asperger (2005-2012): 942 docs
post-DSM-5 ASD (2014+):         43,089 docs

\nTop pre-DSM-5 distinctive terms (Asperger sub-typing era):
       term  count_a  count_b           g2  log_ratio
   asperger     1917      581 12897.588389   7.554044
   syndrome     1525     7618  4416.818417   3.512444
        pdd      379      233  2273.332755   6.533346
        hfa      354      333  1935.192856   5.920768
          s     1146    14784  1584.938256   2.143892
  pervasive      324      535  1511.759365   5.110001
         as     1934    48781   980.019757   1.176368
functioning      533     5969   849.435912   2.348619
        nos      144      147   771.206852   5.803024
  specified      129      225   591.083037   5.032494
  otherwise      131      300   544.905747   4.640367
         ad      148      793   410.615891   3.414902
\nTop post-DSM-5 distinctive terms (ASD spectrum era):
   term  count_a  count_b           g2  log_ratio
    asd      825   143777 -1547.470224  -1.611685
   mice       10     8504  -222.229691  -3.829024
surgery        3     6428  -196.029663  -5.010242
      0      363    39515  -188.760949  -0.931650
   risk      109    18284  -186.128229  -1.550877
closure        0     4527  -157.523269  -7.311830
 spinal        0     4402  -153.172793  -7.271438
 atrial        0     4275  -148.752763  -7.229208
 septal        0     3972  -138.207557  -7.123162
 fusion        1     4132  -133.242466  -5.595168
  model       57    10792  -126.349045  -1.719582
 defect        0     3499  -121.746518  -6.940264

key_asp_ci = pcd.compare(asp_pre, asd_post).keyness(
    min_count=30, formula='dunning', stop_words=PUBMED_STOP,
    multiple_comparisons='bh',
    ci='bootstrap', n_boot=299, simultaneous_ci=True, bootstrap_seed=0,
)
key_asp_ci_df = key_asp_ci.to_df()
_top15_asp = key_asp_ci_df.head(15)
cols = ['term', 'count_a', 'count_b', 'g2',
        'g2_ci_lower', 'g2_ci_upper',
        'g2_ci_lower_simultaneous', 'g2_ci_upper_simultaneous',
        'p_adjusted']
print(_top15_asp[cols].to_string(index=False))

s56a_top15_per_term_excl = int(((_top15_asp['g2_ci_lower'] > 0) | (_top15_asp['g2_ci_upper'] < 0)).sum())
s56a_top15_sim_excl = int(((_top15_asp['g2_ci_lower_simultaneous'] > 0) |
                            (_top15_asp['g2_ci_upper_simultaneous'] < 0)).sum())
print(f'\\ntop-15: per-term CI excludes zero in {s56a_top15_per_term_excl}/15')
print(f'top-15: simultaneous max-T CI excludes zero in {s56a_top15_sim_excl}/15')

       term  count_a  count_b           g2  g2_ci_lower  g2_ci_upper  g2_ci_lower_simultaneous  g2_ci_upper_simultaneous    p_adjusted
   asperger     1917      581 12897.588389 12218.472250 13557.376849               9706.584646              16070.660857  0.000000e+00
   syndrome     1525     7618  4416.818417  4003.286410  4845.836214               2500.861998               6331.361450  0.000000e+00
        pdd      379      233  2273.332755  1683.517952  2893.181106               -493.969539               5010.368453  0.000000e+00
        hfa      354      333  1935.192856  1426.693006  2491.666833               -399.655358               4291.640958  0.000000e+00
          s     1146    14784  1584.938256  1288.838688  1865.000470                253.959671               2899.375762  0.000000e+00
        asd      825   143777 -1547.470224 -1867.317556 -1291.783935              -2805.792769               -319.996813  0.000000e+00
  pervasive      324      535  1511.759365  1203.286995  1901.353717                -31.473385               3056.450978  0.000000e+00
         as     1934    48781   980.019757   753.703999  1227.057936               -108.594884               2059.163980 6.047276e-212
functioning      533     5969   849.435912   693.173879  1049.943808                -12.764348               1704.508005 1.310126e-183
        nos      144      147   771.206852   512.743996  1055.523008               -541.855569               2081.630250 1.201458e-166
  specified      129      225   591.083037   473.057792   735.856899                -10.575916               1195.450083 1.619184e-127
  otherwise      131      300   544.905747   433.680566   671.995704                -16.333231               1108.373629 1.645814e-117
         ad      148      793   410.615891   191.370636   710.301173               -775.713758               1622.219153  2.532067e-88
        asp       64       47   370.109541   116.179575   709.003288               -999.679449               1758.311183  1.547287e-79
         iv      144      964   346.908103   213.809274   553.162253               -434.571481               1118.795508  1.628233e-74
\ntop-15: per-term CI excludes zero in 15/15
top-15: simultaneous max-T CI excludes zero in 4/15

# For each candidate anchor year y, compute:
#   pre_rate  = mean(old_yrA[y-5:y])       (baseline before "ethical reckoning")
#   post_rate = mean(old_yrA[y+1:y+7])     (post-anchor follow-up)
#   accel = (pre_rate - post_rate) / pre_rate    (relative decline post-y)
# Then compare to the 2013-2017 baseline decline rate.
real_anchor_y = 2018
placebo_years_asp = [2015, 2016, 2017, 2019, 2020]

asp_2007_2012_base = old_yrA.loc[2007:2012].mean()
decline_2013_2017_base = (asp_2007_2012_base - old_yrA.loc[2013:2017].mean()) / max(asp_2007_2012_base, 1)

rows_asp_pl = []
for y in [real_anchor_y] + placebo_years_asp:
    pre = old_yrA.loc[y-5:y].mean()
    post = old_yrA.loc[y+1:y+6].mean()
    decline_y = (pre - post) / max(pre, 1)
    ratio_y = decline_y / max(decline_2013_2017_base, 1e-9)
    rows_asp_pl.append({
        'anchor': y,
        'is_real': y == real_anchor_y,
        'pre_rate': round(pre, 1),
        'post_rate': round(post, 1),
        'decline_rate': round(decline_y, 3),
        'ratio_vs_2013_2017_baseline': round(ratio_y, 2),
        'crosses_1.5x': ratio_y >= 1.5,
    })
asp_placebo_df = pd.DataFrame(rows_asp_pl)
print(asp_placebo_df.to_string(index=False))
print(f'\\nReal-anchor (2018) crosses 1.5x: {asp_placebo_df[asp_placebo_df.is_real]["crosses_1.5x"].iloc[0]}')
n_placebos_crossing = int(asp_placebo_df[(~asp_placebo_df.is_real) & asp_placebo_df["crosses_1.5x"]].shape[0])
print(f'Placebo anchors crossing 1.5x: {n_placebos_crossing} / {len(placebo_years_asp)}')
s56b_real_crosses = bool(asp_placebo_df[asp_placebo_df.is_real]["crosses_1.5x"].iloc[0])
s56b_n_placebos_crossing = n_placebos_crossing
s56b_pass = s56b_real_crosses and s56b_n_placebos_crossing <= 1

 anchor  is_real  pre_rate  post_rate  decline_rate  ratio_vs_2013_2017_baseline  crosses_1.5x
   2018     True      84.7       35.0         0.587                         2.38          True
   2015    False     113.7       54.0         0.525                         2.13          True
   2016    False     107.5       46.3         0.569                         2.30          True
   2017    False      97.7       40.7         0.584                         2.36          True
   2019    False      70.3       27.2         0.614                         2.49          True
   2020    False      59.8       22.6         0.622                         2.52          True
\nReal-anchor (2018) crosses 1.5x: True
Placebo anchors crossing 1.5x: 5 / 5

# Load all 14 §5.7 substance-pair corpora.
SUBSTANCE_PAIRS = [
    ('2013_alcohol_dsm5',                'alcohol',         '§5.7.1'),
    ('2013_opioid_dsm5',                 'opioid',          '§5.7.2'),
    ('2013_cannabis_dsm5',               'cannabis',        '§5.7.3'),
    ('2013_cocaine_dsm5',                'cocaine',         '§5.7.4'),
    ('2013_stimulant_dsm5',              'stimulant',       '§5.7.5'),
    ('2013_tobacco_dsm5',                'tobacco',         '§5.7.6'),
    ('2013_aas_dsm5_negative',           'AAS (negative)',  '§5.7.7'),
    ('2013_polysubstance_dsm5_retired',  'polysubstance (retired)', '§5.7.8'),
    ('2013_gambling_dsm5',               'gambling',        '§5.7.9'),
    ('2015_gabapentin_abuse_recognition', 'gabapentin (recognition)', '§5.7.15a'),
    ('2015_pregabalin_abuse_recognition', 'pregabalin (recognition)', '§5.7.15b'),
    ('2014_tramadol_abuse_recognition',   'tramadol (recognition)',   '§5.7.15c'),
    ('2015_loperamide_abuse_recognition', 'loperamide (recognition)', '§5.7.15d'),
    ('2018_tianeptine_abuse_recognition', 'tianeptine (recognition)', '§5.7.15e'),
]

s57_summary_rows = []
s57_frames_pairs = {}
for shift_key, pretty, section in SUBSTANCE_PAIRS:
    parts = {}
    for side in ('old', 'new'):
        p = DATA_DIR / f'{shift_key}_{side}.parquet'
        df = pd.read_parquet(p)
        if len(df):
            df['text'] = (df['title'].fillna('') + ' ' + df['abstract'].fillna('')).str.strip()
            df = df[df['text'].str.len() > 0].reset_index(drop=True)
            df['year'] = df['year'].astype('Int64')
            df = df.dropna(subset=['year']).reset_index(drop=True)
            df['year'] = df['year'].astype(int)
        parts[side] = df
    s57_frames_pairs[shift_key] = parts

    old_n, new_n = len(parts['old']), len(parts['new'])
    # First-appearance + crossover detection
    new_yr = parts['new'].groupby('year').size() if new_n else pd.Series(dtype=int)
    old_yr = parts['old'].groupby('year').size() if old_n else pd.Series(dtype=int)
    first_new = int(new_yr.index.min()) if len(new_yr) else None
    years_all = sorted(set(new_yr.index) | set(old_yr.index))
    new_yr2 = new_yr.reindex(years_all, fill_value=0)
    old_yr2 = old_yr.reindex(years_all, fill_value=0)
    crossover = next((y for y in years_all
                      if new_yr2[y] > old_yr2[y] and (new_yr2[y] + old_yr2[y]) >= 5),
                     None)
    s57_summary_rows.append({
        'section': section, 'shift': pretty,
        'old_n': old_n, 'new_n': new_n,
        'first_new_year': first_new,
        'crossover_year': crossover,
    })

s57_summary = pd.DataFrame(s57_summary_rows)
with pd.option_context('display.max_colwidth', 30, 'display.width', 200):
    print(s57_summary.to_string(index=False))

 section                    shift  old_n  new_n  first_new_year  crossover_year
  §5.7.1                  alcohol  40208  17749            1990          2019.0
  §5.7.2                   opioid   6321   9675            1991          2018.0
  §5.7.3                 cannabis   1667   2569            1990          1994.0
  §5.7.4                  cocaine   3843   1031            1991          2019.0
  §5.7.5                stimulant   1302    388            1999          2023.0
  §5.7.6                  tobacco   7415    769            1991             NaN
  §5.7.7           AAS (negative)    420      5            2020             NaN
  §5.7.8  polysubstance (retired)    592     71            1994             NaN
  §5.7.9                 gambling   3954   1387            1991             NaN
§5.7.15a gabapentin (recognition)   7968     67            1997             NaN
§5.7.15b pregabalin (recognition)   4752     75            2010             NaN
§5.7.15c   tramadol (recognition)   6826    131            1997             NaN
§5.7.15d loperamide (recognition)   2038    101            1994             NaN
§5.7.15e tianeptine (recognition)    590     17            1999             NaN

# Per-sub-shift verdict using §0b-style pre-registered tolerances
TH_SUD_CROSSOVER_TOL = 2  # ±2 years of DSM-5 2013
TH_GABAPENTIN_RECOGNITION_LO = 2010
TH_GABAPENTIN_RECOGNITION_HI = 2018
TH_TIANEPTINE_RECOGNITION_LO = 2016
TH_TIANEPTINE_RECOGNITION_HI = 2020

_verdicts = []
for row in s57_summary_rows:
    sect = row['section']
    cross = row['crossover_year']
    first = row['first_new_year']

    # The DSM-5 main pairs (§5.7.1 - §5.7.6, §5.7.9): crossover within ±2 of 2013
    if sect in ('§5.7.1', '§5.7.2', '§5.7.3', '§5.7.4', '§5.7.5', '§5.7.6', '§5.7.9'):
        if cross is not None and abs(cross - 2013) <= TH_SUD_CROSSOVER_TOL:
            verdict = f'PASS (crossover {cross} within ±2 of 2013)'
        elif cross is not None and cross <= 2018:
            verdict = f'PARTIAL (crossover {cross}, outside ±2 but rename in progress)'
        elif row['new_n'] >= 100:
            verdict = f'PARTIAL (no crossover yet; new term has {row["new_n"]:,} records but old dominates)'
        else:
            verdict = f'PARTIAL (rename incomplete; new term has only {row["new_n"]:,} records)'
    # §5.7.7 AAS: pre-registered NEGATIVE prediction (no rename)
    elif sect == '§5.7.7':
        verdict = (f'PASS (NEGATIVE prediction confirmed: '
                    f'only {row["new_n"]} "AAS use disorder" records — '
                    f'DSM-5 did not carve out AAS-specific category)')
    # §5.7.8 polysubstance: pre-registered NEGATIVE (retired, no replacement)
    elif sect == '§5.7.8':
        verdict = (f'PASS (NEGATIVE prediction confirmed: '
                    f'polysubstance UD retired in DSM-5; only {row["new_n"]} '
                    f'literature mentions of "polysubstance use disorder" '
                    f'(colloquial use))')
    # §5.7.15: discovery-of-abuse-potential. PASS if first abuse-recognition
    # record falls in pre-reg window
    elif sect.startswith('§5.7.15'):
        sub = sect[-1]
        if sub == 'a':
            lo, hi = TH_GABAPENTIN_RECOGNITION_LO, TH_GABAPENTIN_RECOGNITION_HI
        elif sub == 'b':
            lo, hi = 2012, 2017
        elif sub == 'c':
            lo, hi = 2010, 2016
        elif sub == 'd':
            lo, hi = 2013, 2018
        elif sub == 'e':
            lo, hi = TH_TIANEPTINE_RECOGNITION_LO, TH_TIANEPTINE_RECOGNITION_HI
        else:
            lo, hi = 2010, 2020
        if first is not None and lo <= first <= hi:
            verdict = (f'PASS (first abuse-recognition record {first}, '
                        f'within pre-reg window {lo}-{hi})')
        elif first is not None:
            verdict = (f'PARTIAL (first abuse-recognition record {first}, '
                        f'outside pre-reg window {lo}-{hi})')
        else:
            verdict = 'PARTIAL (no abuse-recognition records found)'
    else:
        verdict = 'OBSERVED'
    _verdicts.append({**row, 'verdict': verdict})

s57_verdicts = pd.DataFrame(_verdicts)
with pd.option_context('display.max_colwidth', 100, 'display.width', 200):
    print(s57_verdicts[['section', 'shift', 'old_n', 'new_n',
                         'first_new_year', 'crossover_year', 'verdict']].to_string(index=False))

# Counters for the §9 scoreboard
s57_n_pass = int(s57_verdicts['verdict'].str.startswith('PASS').sum())
s57_n_partial = int(s57_verdicts['verdict'].str.startswith('PARTIAL').sum())
s57_n_total = int(len(s57_verdicts))

 section                    shift  old_n  new_n  first_new_year  crossover_year                                                                                                                                               verdict
  §5.7.1                  alcohol  40208  17749            1990          2019.0                                                                             PARTIAL (no crossover yet; new term has 17,749 records but old dominates)
  §5.7.2                   opioid   6321   9675            1991          2018.0                                                                                           PARTIAL (crossover 2018, outside ±2 but rename in progress)
  §5.7.3                 cannabis   1667   2569            1990          1994.0                                                                                           PARTIAL (crossover 1994, outside ±2 but rename in progress)
  §5.7.4                  cocaine   3843   1031            1991          2019.0                                                                              PARTIAL (no crossover yet; new term has 1,031 records but old dominates)
  §5.7.5                stimulant   1302    388            1999          2023.0                                                                                PARTIAL (no crossover yet; new term has 388 records but old dominates)
  §5.7.6                  tobacco   7415    769            1991             NaN                                                                                PARTIAL (no crossover yet; new term has 769 records but old dominates)
  §5.7.7           AAS (negative)    420      5            2020             NaN                               PASS (NEGATIVE prediction confirmed: only 5 "AAS use disorder" records — DSM-5 did not carve out AAS-specific category)
  §5.7.8  polysubstance (retired)    592     71            1994             NaN PASS (NEGATIVE prediction confirmed: polysubstance UD retired in DSM-5; only 71 literature mentions of "polysubstance use disorder" (colloquial use))
  §5.7.9                 gambling   3954   1387            1991             NaN                                                                              PARTIAL (no crossover yet; new term has 1,387 records but old dominates)
§5.7.15a gabapentin (recognition)   7968     67            1997             NaN                                                                       PARTIAL (first abuse-recognition record 1997, outside pre-reg window 2010-2018)
§5.7.15b pregabalin (recognition)   4752     75            2010             NaN                                                                       PARTIAL (first abuse-recognition record 2010, outside pre-reg window 2012-2017)
§5.7.15c   tramadol (recognition)   6826    131            1997             NaN                                                                       PARTIAL (first abuse-recognition record 1997, outside pre-reg window 2010-2016)
§5.7.15d loperamide (recognition)   2038    101            1994             NaN                                                                       PARTIAL (first abuse-recognition record 1994, outside pre-reg window 2013-2018)
§5.7.15e tianeptine (recognition)    590     17            1999             NaN                                                                       PARTIAL (first abuse-recognition record 1999, outside pre-reg window 2016-2020)

# Build per-year (year, side, n_records) long-format for the 9 DSM-5 pairs
SUBSTANCE_DSM5_KEYS = [
    '2013_alcohol_dsm5', '2013_opioid_dsm5', '2013_cannabis_dsm5',
    '2013_cocaine_dsm5', '2013_stimulant_dsm5', '2013_tobacco_dsm5',
    '2013_aas_dsm5_negative', '2013_polysubstance_dsm5_retired',
    '2013_gambling_dsm5',
]
SUBSTANCE_DSM5_LABELS = {
    '2013_alcohol_dsm5': 'alcohol',
    '2013_opioid_dsm5': 'opioid',
    '2013_cannabis_dsm5': 'cannabis',
    '2013_cocaine_dsm5': 'cocaine',
    '2013_stimulant_dsm5': 'stimulant',
    '2013_tobacco_dsm5': 'tobacco',
    '2013_aas_dsm5_negative': 'AAS (neg)',
    '2013_polysubstance_dsm5_retired': 'polysubstance (retired)',
    '2013_gambling_dsm5': 'gambling',
}

_dsm5_rows = []
for sk in SUBSTANCE_DSM5_KEYS:
    for side in ('old', 'new'):
        df = s57_frames_pairs[sk][side]
        if not len(df): continue
        yr = df.groupby('year').size()
        for y, n in yr.items():
            _dsm5_rows.append({
                'shift': SUBSTANCE_DSM5_LABELS[sk],
                'side': 'abuse / dependence (DSM-IV era)' if side == 'old'
                        else 'use disorder (DSM-5 2013+)',
                'year': int(y), 'n_records': int(n),
            })
_dsm5_long = pd.DataFrame(_dsm5_rows)
_dsm5_long = _dsm5_long[_dsm5_long['year'] <= _PLOT_YEAR_MAX]

# Build small-multiples via layered chart with data passed to facet (Altair
# requires top-level data when faceting a layered chart).
line = alt.Chart().mark_line(strokeWidth=2).encode(
    x=alt.X('year:O', title=None, axis=alt.Axis(labelOverlap=True,
            values=list(range(2000, 2024, 4)))),
    y=alt.Y('n_records:Q', title='records / year'),
    color=alt.Color('side:N', title=None,
                     scale=alt.Scale(range=['#e76f51', '#264653'])),
)
anchor = alt.Chart(pd.DataFrame({'x': ['2013']})).mark_rule(
    strokeDash=[4, 4], color='#888').encode(x='x:O')
panel = alt.layer(line, anchor, data=_dsm5_long).properties(width=240, height=140)
panel.facet(
    facet=alt.Facet('shift:N',
                     header=alt.Header(labelFontSize=12, titleFontSize=0)),
    columns=3,
).resolve_scale(y='independent')

import numpy as np

# Build a record-level frame: year, journal, side ∈ {'old', 'new'}, post2013 flag.
_alc = s57_frames_pairs['2013_alcohol_dsm5']
_alc_old = _alc['old'][['year', 'journal']].copy(); _alc_old['side'] = 'old'
_alc_new = _alc['new'][['year', 'journal']].copy(); _alc_new['side'] = 'new'
_alc_rec = pd.concat([_alc_old, _alc_new], ignore_index=True)
_alc_rec['year'] = pd.to_numeric(_alc_rec['year'], errors='coerce')
_alc_rec = _alc_rec.dropna(subset=['year', 'journal'])
_alc_rec['year'] = _alc_rec['year'].astype(int)
_alc_rec['journal'] = _alc_rec['journal'].astype(str)
_alc_rec['journal_norm'] = _alc_rec['journal'].str.lower().str.strip()

# Restrict to post-2013 (the DSM-5 era).
_alc_post = _alc_rec[_alc_rec['year'] >= 2013].reset_index(drop=True)
_n_records = len(_alc_post)
_n_journals = _alc_post['journal_norm'].nunique()
_point_est = (_alc_post['side'] == 'new').mean()
print(f"Post-2013 alcohol records: {_n_records:,}  journals: {_n_journals:,}")
print(f"Point estimate (new-share): {_point_est:.4f}")

# Naive bootstrap: resample records with replacement.
_rng = np.random.default_rng(42)
B = 1000
_is_new = (_alc_post['side'].values == 'new').astype(int)
_naive_shares = np.empty(B)
for b in range(B):
    idx = _rng.integers(0, _n_records, size=_n_records)
    _naive_shares[b] = _is_new[idx].mean()
_naive_lo, _naive_hi = np.quantile(_naive_shares, [0.025, 0.975])

# Clustered bootstrap by journal: resample journals with replacement,
# concatenate all records from each chosen journal.
_journal_groups = {j: g.index.values for j, g in _alc_post.groupby('journal_norm')}
_journals_list = list(_journal_groups.keys())
_rng = np.random.default_rng(42)
_clust_shares = np.empty(B)
for b in range(B):
    chosen = _rng.choice(_journals_list, size=_n_journals, replace=True)
    idxs = np.concatenate([_journal_groups[j] for j in chosen])
    _clust_shares[b] = _is_new[idxs].mean()
_clust_lo, _clust_hi = np.quantile(_clust_shares, [0.025, 0.975])

_naive_w = _naive_hi - _naive_lo
_clust_w = _clust_hi - _clust_lo

_cmp = pd.DataFrame([
    {'method':   'naive bootstrap (records IID)',
     'lo (2.5%)': f'{_naive_lo:.4f}',  'hi (97.5%)': f'{_naive_hi:.4f}',
     'CI width':  f'{_naive_w:.4f}'},
    {'method':   'journal-clustered bootstrap',
     'lo (2.5%)': f'{_clust_lo:.4f}',  'hi (97.5%)': f'{_clust_hi:.4f}',
     'CI width':  f'{_clust_w:.4f}'},
])
print('\nBootstrap comparison (B=1000, post-2013 alcohol new-share):')
print(_cmp.to_string(index=False))
_ratio = _clust_w / _naive_w if _naive_w else float('inf')
print(f'\nClustered CI is {_ratio:.2f}x wider than naive CI.')
_clust_pass = _ratio >= 1.5
print('Pre-registered prediction (>=1.5x wider): '
      + ('PASS' if _clust_pass else 'FAIL'))

Post-2013 alcohol records: 30,489  journals: 4,034
Point estimate (new-share): 0.4767

Bootstrap comparison (B=1000, post-2013 alcohol new-share):
                       method lo (2.5%) hi (97.5%) CI width
naive bootstrap (records IID)    0.4710     0.4822   0.0112
  journal-clustered bootstrap    0.4551     0.4965   0.0414

Clustered CI is 3.72x wider than naive CI.
Pre-registered prediction (>=1.5x wider): PASS

import re
from pathlib import Path

POLY_DIR = Path('../data/pubmed_polysemy')

# Conservative regex-bucket sense classifier per token. First-match-wins;
# residual = 'unknown' (do NOT force assignment).
POLY_BUCKETS = {
    'steroid': [
        ('anabolic',        re.compile(r'\banabolic\b|\bandrogen', re.I)),
        ('corticosteroid',  re.compile(r'\bcortico\w*|\bglucocorticoid\w*|\bdexamethasone\b|\bprednisone\b|\bhydrocortisone\b|\bmethylprednisolone\b', re.I)),
        ('neurosteroid',    re.compile(r'\bneurosteroid\w*|\ballopregnan\w*', re.I)),
        ('plant',           re.compile(r'\bphytosteroid\w*|\bplant\s+steroid|\bphytosterol\w*|\bbrassinosteroid\w*', re.I)),
        ('inhaled / topical', re.compile(r'\binhaled\s+steroid|\btopical\s+steroid|\bsteroid\s+inhaler|\beye\s+drop|\bnasal\s+steroid', re.I)),
        ('sex steroid',     re.compile(r'\b(estrogen|oestrogen|progesterone|estradiol|testosterone)\b', re.I)),
    ],
    'doping': [
        ('sports doping',   re.compile(r'\bsport\w*|\bathlete\w*|\bWADA\b|\banti-?doping|\bperformance[- ]enhancing|\berythropoietin\b|\bEPO\b|\bdoping\s+control|\bdoping\s+test', re.I)),
        ('semiconductor',   re.compile(r'\bsemiconductor\w*|\bn-type|\bp-type|\bsilicon\b|\bgraphene\b|\bnanocrystal\w*|\bquantum\s+dot|\belectronic\s+structure|\bband\s+gap|\bphotocatal', re.I)),
        ('material / chem', re.compile(r'\bnanoparticle\w*|\bcatalyst\w*|\bperovskite\w*|\bcrystal\w*|\bcerium|\btitania|\bzinc\s+oxide|\bMOF\b', re.I)),
        ('pharmacology',    re.compile(r'\bdrug\s+formulation|\bdrug\s+delivery|\bnanomedicine|\bcarrier\b', re.I)),
    ],
    'AAS': [
        ('anabolic-androgenic', re.compile(r'\banabolic\b|\bandrogen|\bsteroid\s+use|\bbodybuild', re.I)),
        ('astronomy',           re.compile(r'\bastronomical\s+society|\bAmerican\s+Astronomical|\bgalax\w*|\bquasar|\bsupernova|\bcosmolog', re.I)),
        ('spectroscopy',        re.compile(r'\batomic\s+absorption|\bspectrophotomet|\bspectroscop|\bICP-?MS|\bICP-?OES|\bGFAAS|\bFAAS', re.I)),
        ('amino acid sequence', re.compile(r'\bamino\s+acid\s+sequence', re.I)),
        ('amyotrophic / scler', re.compile(r'\bamyotrophic\b|\bsclerosis\b', re.I)),
        ('aortic / aneurysm',   re.compile(r'\baortic\b|\baneurysm\b|\bAAA\b', re.I)),
    ],
    'weed': [
        ('cannabis',     re.compile(r'\bcannabis\b|\bmarijuana\b|\bmarihuana\b|\bTHC\b|\bcannabidiol\b|\bCBD\b', re.I)),
        ('agricultural', re.compile(r'\bweed\s+(control|management|species|seed|community|flora|killer)|\bherbicid|\bweed\s+killer|\binvasive\s+(plant|species|weed)|\bcrop\s+protection|\bglyphosate|\bweed\s+resistance|\bnoxious\s+weed', re.I)),
        ('seaweed / kelp', re.compile(r'\bseaweed\b|\bkelp\b|\balgae\b|\bbrown\s+algae|\bsargassum|\bmacroalga', re.I)),
        ('tumbleweed / pollen', re.compile(r'\btumbleweed\b|\bragweed\b|\bgoosefoot\b', re.I)),
    ],
    'horse': [
        ('equine',       re.compile(r'\bequine\b|\bequus\b|\bfilly\b|\bfoal\b|\bmare\b|\bstallion\b|\bgelding\b|\bthoroughbred\b|\bracehorse\b|\bveterinary\b', re.I)),
        ('heroin slang', re.compile(r'\bheroin\b|\bopioid\s+use\s+disorder|\bopiate\b|\binjection\s+drug', re.I)),
        ('seahorse',     re.compile(r'\bseahorse\b|\bhippocamp\w*', re.I)),
        ('Trojan / metaphor', re.compile(r'\btrojan\s+horse|\bhorse\s+chestnut', re.I)),
        ('horseshoe / horsefly', re.compile(r'\bhorseshoe\b|\bhorsefly\b|\bhorsetail\b', re.I)),
    ],
    'gaming': [
        ('video / internet', re.compile(r'\bvideo\s+game|\bvideogame|\binternet\s+gaming|\binternet\s+game|\bonline\s+game|\bonline\s+gaming|\besports?\b|\bgaming\s+disorder|\bgame\s+addiction', re.I)),
        ('gambling',         re.compile(r'\bgambling\b|\bcasino\b|\bproblem\s+gambl|\bpathological\s+gambl', re.I)),
        ('game theory',      re.compile(r'\bgame\s+theor|\bgame-?theoretic|\bnash\s+equilibri', re.I)),
        ('gamification',     re.compile(r'\bgamification\b|\bgamified\b|\bserious\s+game', re.I)),
        ('hunting',          re.compile(r'\bbushmeat\b|\bwild\s+game|\bhunting\b', re.I)),
    ],
}


def _classify_record(text: str, buckets: list) -> str:
    # First-match-wins; returns 'unknown' if no bucket matches.
    for name, rx in buckets:
        if rx.search(text):
            return name
    return 'unknown'


poly_summary_rows = []
poly_spotcheck_rows = []
_seed = 42

for token in ['steroid', 'doping', 'AAS', 'weed', 'horse', 'gaming']:
    p = POLY_DIR / f'{token}.parquet'
    if not p.exists():
        poly_summary_rows.append({'token': token, 'n_records': 0,
                                  'top_bucket': 'NO DATA', 'top_share': float('nan'),
                                  'intended_share': float('nan'),
                                  'unknown_share': float('nan'),
                                  'pre_reg_verdict': 'NO DATA'})
        continue
    df = pd.read_parquet(p)
    if not len(df):
        poly_summary_rows.append({'token': token, 'n_records': 0,
                                  'top_bucket': 'EMPTY', 'top_share': float('nan'),
                                  'intended_share': float('nan'),
                                  'unknown_share': float('nan'),
                                  'pre_reg_verdict': 'EMPTY'})
        continue
    text = (df['title'].fillna('') + ' ' + df['abstract'].fillna('')).str.strip()
    df = df.assign(bucket=text.apply(lambda t: _classify_record(t, POLY_BUCKETS[token])))
    counts = df['bucket'].value_counts()
    shares = counts / counts.sum()
    top_bucket = counts.index[0]
    top_share = float(shares.iloc[0])
    # Intended (drug / slang) sense per token -- named EXPLICITLY rather
    # than taken as the first regex bucket. For 'horse' the first bucket
    # is the *equine* sense, not the heroin-slang reading under test, so
    # the first-bucket shortcut measured the wrong sense (fixed iter-8).
    INTENDED_DRUG_SENSE = {
        'steroid': 'anabolic', 'doping': 'sports doping',
        'AAS': 'anabolic-androgenic', 'weed': 'cannabis',
        'horse': 'heroin slang', 'gaming': 'video / internet',
    }
    intended = INTENDED_DRUG_SENSE[token]
    intended_share = float(shares.get(intended, 0.0))
    unknown_share = float(shares.get('unknown', 0.0))
    # Pre-reg prediction: intended bucket is NOT the modal bucket,
    # OR an unintended bucket exceeds the intended bucket's share.
    non_intended_max = max(
        [s for b, s in shares.items() if b not in (intended, 'unknown')] or [0.0])
    poly_pass = (top_bucket != intended) or (non_intended_max >= intended_share)
    poly_summary_rows.append({
        'token': token,
        'n_records': len(df),
        'top_bucket': top_bucket,
        'top_share': round(top_share, 3),
        'intended_share': round(intended_share, 3),
        'unknown_share': round(unknown_share, 3),
        'pre_reg_verdict': 'PASS (single-token mixes senses)' if poly_pass else 'FAIL (intended dominates)',
    })
    # Spot-check: 5 random records per token
    sample = df.sample(n=min(5, len(df)), random_state=_seed)
    for _, row in sample.iterrows():
        poly_spotcheck_rows.append({
            'token': token,
            'pmid': row.get('pmid', ''),
            'bucket': row['bucket'],
            'title_excerpt': (str(row.get('title', ''))[:100] + '...'
                              if len(str(row.get('title', ''))) > 100
                              else str(row.get('title', ''))),
        })

poly_summary = pd.DataFrame(poly_summary_rows)
print('Polysemy demo summary (intended sense = named drug/slang reading per token):')
print(poly_summary.to_string(index=False))

poly_n_pass = (poly_summary['pre_reg_verdict']
               .str.startswith('PASS').sum())
poly_n_total = len(poly_summary)
print(f'\nPre-registered prediction: {poly_n_pass} of {poly_n_total} tokens '
      f'show single-token sense mixing.')

Polysemy demo summary (intended sense = named drug/slang reading per token):
  token  n_records       top_bucket  top_share  intended_share  unknown_share                  pre_reg_verdict
steroid       2989          unknown      0.579           0.069          0.579 PASS (single-token mixes senses)
 doping       3000    semiconductor      0.389           0.064          0.273 PASS (single-token mixes senses)
    AAS       2999          unknown      0.666           0.095          0.666 PASS (single-token mixes senses)
   weed       2997     agricultural      0.644           0.015          0.335 PASS (single-token mixes senses)
  horse       2995          unknown      0.478           0.000          0.478 PASS (single-token mixes senses)
 gaming       2995 video / internet      0.502           0.502          0.394        FAIL (intended dominates)

Pre-registered prediction: 5 of 6 tokens show single-token sense mixing.

# Per-token bucket distribution chart (stacked horizontal bars)
_poly_long_rows = []
for token in ['steroid', 'doping', 'AAS', 'weed', 'horse', 'gaming']:
    p = POLY_DIR / f'{token}.parquet'
    if not p.exists():
        continue
    df = pd.read_parquet(p)
    if not len(df):
        continue
    text = (df['title'].fillna('') + ' ' + df['abstract'].fillna('')).str.strip()
    df = df.assign(bucket=text.apply(lambda t: _classify_record(t, POLY_BUCKETS[token])))
    counts = df['bucket'].value_counts()
    for b, n in counts.items():
        _poly_long_rows.append({'token': token, 'bucket': b,
                                'n': int(n), 'share': n / counts.sum()})
_poly_long = pd.DataFrame(_poly_long_rows)
# Mark intended for color highlighting
_intended_map = {tok: POLY_BUCKETS[tok][0][0]
                 for tok in ['steroid', 'doping', 'AAS', 'weed', 'horse', 'gaming']}
_poly_long['sense_class'] = _poly_long.apply(
    lambda r: ('intended' if r['bucket'] == _intended_map.get(r['token'])
               else ('unknown' if r['bucket'] == 'unknown' else 'unintended')),
    axis=1,
)

alt.Chart(_poly_long).mark_bar().encode(
    x=alt.X('share:Q', title='share of records', axis=alt.Axis(format='%')),
    y=alt.Y('bucket:N', title=None, sort='-x'),
    color=alt.Color('sense_class:N', title='sense class',
                     scale=alt.Scale(
                         domain=['intended', 'unintended', 'unknown'],
                         range=['#2a9d8f', '#e76f51', '#888888'])),
    tooltip=['token', 'bucket', 'n', alt.Tooltip('share:Q', format='.2%')],
).properties(width=320, height=120).facet(
    facet=alt.Facet('token:N',
                     header=alt.Header(labelFontSize=12, titleFontSize=0)),
    columns=2,
).resolve_scale(y='independent', x='independent')

# Random spot-check (seed=42, 5 per token) — qualitative validation
print('Random spot-check (seed=42, 5 per token):')
print(pd.DataFrame(poly_spotcheck_rows).to_string(index=False))

Random spot-check (seed=42, 5 per token):
  token     pmid           bucket                                                                                           title_excerpt
steroid 32855900      sex steroid                                                                            Sex Differences in Melanoma.
steroid 36017046   corticosteroid Expression profile analysis to identify potential gene changes induced by dexamethasone in the trabe...
steroid 31929312          unknown                     Mucormycosis-induced ileocecal perforation: A case report and review of literature.
steroid 34930562      sex steroid Steroid modification by filamentous fungus Drechslera sp.: Focus on 7-hydroxylase and 17β-hydroxyste...
steroid 30253116          unknown Long-Lasting Primed State in Maize Plants: Salicylic Acid and Steroid Signaling Pathways as Key Play...
 doping 36346945  material / chem Sodium Alginate-Doping Cationic Nanoparticle As Dual Gene Delivery System for Genetically Bimodal Th...
 doping 34505743    sports doping Organ-on-a-chip: Determine feasibility of a human liver microphysiological model to assess long-term...
 doping 36369629    sports doping            Annual banned-substance review-Analytical approaches in human sports drug testing 2021/2022.
 doping 30413786    semiconductor                   Energetics and Electronic Structure of Triangular Hexagonal Boron Nitride Nanoflakes.
 doping 38335551    semiconductor                     Structure and stability of La- and hole-doped hafnia with/without epitaxial strain.
    AAS 34251639          unknown Oxidation of Energy Substrates in Tissues of Fish: Metabolic Significance and Implications for Gene ...
    AAS 32684600          unknown Usefulness of Plasma Branched-Chain Amino Acid Analysis in Predicting Outcomes of Patients with Noni...
    AAS 29600381     spectroscopy Spectral fitting approach for the determination of enrichment and contamination factors in mining se...
    AAS 35517454          unknown QuEChERS pretreatment combined with high-performance liquid chromatography-tandem mass spectrometry ...
    AAS 29216550     spectroscopy Response surface methodology optimization for sorption of malachite green dye on sugarcane bagasse b...
   weed 34439539          unknown                                Phytochemistry, Pharmacology, and Toxicology of Datura Species-A Review.
   weed 32915706     agricultural Different Sequevars of Ralstonia pseudosolanacearum Causing Bacterial Wilt of Bidens pilosa in China...
   weed 29773742     agricultural                   Wicked evolution: Can we address the sociobiological dilemma of pesticide resistance?
   weed 39660200     agricultural           Development and testing of a precision hoeing system for re-compacted ridge tillage in maize.
   weed 29245107     agricultural Impacts on the seagrass, Zostera nigricaulis, from the herbicide Fusilade Forte® used in the managem...
  horse 33941332           equine Antigenic differences between equine influenza virus vaccine strains and Florida sublineage clade 1 ...
  horse 36596349           equine          Pilot Study on Annual Horse Movements by Air and the Possible Effect of the Covid-19 Pandemic.
  horse 30320737           equine Nerve Stimulator-guided Injection of Autologous Stem Cells Near the Equine Left Recurrent Laryngeal ...
  horse 36565526          unknown              One-step immunoassay based on filtration for detection of food poisoning-related bacteria.
  horse 34632158          unknown Diagnostic imaging features, cytological examination, and treatment of lymphocytic tenosynovitis of ...
 gaming 34674922          unknown There's an app for that: Teaching residents to communicate diagnostic uncertainty through a mobile g...
 gaming 37075676 video / internet Association between video gaming time and cognitive functions: A cross-sectional study of Chinese ch...
 gaming 30621356 video / internet Neurophysiological Mechanisms of Resilience as a Protective Factor in Patients with Internet Gaming ...
 gaming 37009115 video / internet Reaching hidden youth in Singapore through the Hidden Youth Intervention Program: A biopsychosocial ...
 gaming 35352599          unknown Different types of screen time are associated with low life satisfaction in adolescents across 37 Eu...

import numpy as np
from pathlib import Path

POLY_EMB_DIR = Path('../data/pubmed_polysemy_embeddings')

poly_wsi_rows = []
for token in ['steroid', 'doping', 'AAS', 'weed', 'horse', 'gaming']:
    p = POLY_DIR / f'{token}.parquet'
    emb_p = POLY_EMB_DIR / f'{token}.npy'
    if not (p.exists() and emb_p.exists()):
        print(f'  [skip] {token}: missing parquet or embedding cache')
        continue
    df = pd.read_parquet(p)
    X = np.load(emb_p)
    text = (df['title'].fillna('') + ' ' + df['abstract'].fillna('')).str.strip()
    df = df.assign(
        text=text,
        regex_bucket=text.apply(lambda t: _classify_record(t, POLY_BUCKETS[token])),
    )
    # Cross-check only where the regex committed to a sense.
    mask = (df['regex_bucket'] != 'unknown').to_numpy()
    k = df.loc[mask, 'regex_bucket'].nunique()
    if mask.sum() < 20 or k < 2:
        print(f'  [skip] {token}: too few labelled records or <2 buckets')
        continue
    df_l = df[mask].reset_index(drop=True)
    res = pcd.induce_senses(
        df_l, X[mask], k=k, text_col='text',
        embedding_meta={'model': 'all-MiniLM-L6-v2', 'unit': 'document'},
    )
    agr = res.agreement_with(df_l['regex_bucket'])
    poly_wsi_rows.append({
        'token': token,
        'n_labelled': int(mask.sum()),
        'k_buckets': int(k),
        'ARI': round(agr.ari, 3),
        'V_measure': round(agr.v_measure, 3),
    })

poly_wsi = pd.DataFrame(poly_wsi_rows)
print('Unsupervised WSI (induce_senses) vs hand-built regex buckets')
print('(records where the regex made a definite call):')
print(poly_wsi.to_string(index=False))
poly_wsi_mean_ari = float(poly_wsi['ARI'].mean()) if len(poly_wsi) else float('nan')
poly_wsi_corroborated = int((poly_wsi['ARI'] > 0.1).sum()) if len(poly_wsi) else 0
poly_wsi_n = len(poly_wsi)
if poly_wsi_n:
    _best = poly_wsi.loc[poly_wsi['ARI'].idxmax()]
    _worst = poly_wsi.loc[poly_wsi['ARI'].idxmin()]
    print(f'\\nStrongest agreement: {_best["token"]} (ARI={_best["ARI"]}, '
          f'V={_best["V_measure"]}) -- topically distinct senses.')
    print(f'Weakest agreement:   {_worst["token"]} (ARI={_worst["ARI"]}, '
          f'V={_worst["V_measure"]}) -- extreme sense imbalance; k-means '
          f'splits the dominant sense.')
    print(f'\\n{poly_wsi_corroborated}/{poly_wsi_n} tokens show above-chance '
          f'agreement (ARI > 0.1); mean ARI {poly_wsi_mean_ari:.3f}.')

Unsupervised WSI (induce_senses) vs hand-built regex buckets
(records where the regex made a definite call):
  token  n_labelled  k_buckets   ARI  V_measure
steroid        1258          6 0.231      0.343
 doping        2181          4 0.147      0.272
    AAS        1002          6 0.469      0.628
   weed        1994          4 0.012      0.033
  horse        1563          4 0.192      0.355
 gaming        1816          4 0.189      0.286
\nStrongest agreement: AAS (ARI=0.469, V=0.628) -- topically distinct senses.
Weakest agreement:   weed (ARI=0.012, V=0.033) -- extreme sense imbalance; k-means splits the dominant sense.
\n5/6 tokens show above-chance agreement (ARI > 0.1); mean ARI 0.207.

SHIFT5 = 'neg_suicide_phrasing'
old5 = frames[SHIFT5]['old']
new5 = frames[SHIFT5]['new']
print(f'"committed suicide" PubMed records: {len(old5):,}')
print(f'"died by suicide" PubMed records:   {len(new5):,}')

if len(old5):
    old_yr5 = old5.groupby('year').size()
    print(f'\n"committed suicide" by year — recent decade:')
    print(old_yr5.loc[2014:].to_string())
    print(f'\nTrend: {"INCREASING" if old_yr5.loc[2014:].iloc[-1] > old_yr5.loc[2014:].iloc[0] else "decreasing"} over 2014-latest')

"committed suicide" PubMed records: 1,803
"died by suicide" PubMed records:   0

"committed suicide" by year — recent decade:
year
2014    48
2015    49
2016    45
2017    47
2018    45
2019    49
2020    48
2021    51
2022    28
2023    40
2024    26

Trend: decreasing over 2014-latest

tier2 = pd.read_csv(Path('..') / 'data' / 'pubmed_tier2_counts.csv')
tier3 = pd.read_csv(Path('..') / 'data' / 'pubmed_tier3_counts.csv')
tier2['tier'] = 'T2'
tier3['tier'] = 'T3'
loaded = pd.concat([tier2, tier3], ignore_index=True)
print(f'Loaded inventory total rows: {len(loaded):,}')
print(f'Loaded inventory labels:     {loaded.label.nunique()}')
print(f'Total records summed:        {loaded["n_records"].sum():,}')

Loaded inventory total rows: 5,005
Loaded inventory labels:     68
Total records summed:        177,048

# Load the audit-mandated re-analysis: regex sense decomposition of
# every PubMed `retard*` record 1990-2024.
sense_counts = pd.read_csv(Path('..') / 'data' / 'retard_sense_counts_by_year.csv',
                            index_col='year')
print(f'Total records 1990-2024 containing verb/adj form of retard*: {int(sense_counts.sum().sum()):,}')
print(f'\\nPer-sense totals (35-year sum):')
totals = sense_counts.sum(axis=0).sort_values(ascending=False)
print(totals.to_string())

# Also keep the §5 clinical-MR series for parity check
clinical_mr = pd.read_csv(Path('..') / 'data' / 'pubmed_full_counts.csv')
clinical_mr_yr = (clinical_mr[clinical_mr.label == 'ID_old_mental_retardation']
                  .set_index('year')['n_records'].sort_index())

# §6.5.1 audit-resolved evidence
s651_slur_n = int(totals.get('slur_explicit_mention', 0))
s651_total = int(sense_counts.sum().sum())
s651_slur_pct = 100.0 * s651_slur_n / max(s651_total, 1)

# Per-decade clinical-ID compound trajectory (audit cross-check on §5)
sense_counts.index = sense_counts.index.astype(int)
clinical_id_dec = (sense_counts['clinical_intellectual_disability']
                   .groupby((sense_counts.index // 10) * 10).sum())
s651_clinical_1990s = int(clinical_id_dec.get(1990, 0))
s651_clinical_2020s = int(clinical_id_dec.get(2020, 0))
s651_clinical_decline_pct = 100.0 * (1 - s651_clinical_2020s / max(s651_clinical_1990s, 1))

# Growth-developmental decline
growth_dec = (sense_counts['growth_developmental']
              .groupby((sense_counts.index // 10) * 10).sum())
s651_growth_1990s = int(growth_dec.get(1990, 0))
s651_growth_2020s = int(growth_dec.get(2020, 0))
s651_growth_decline_pct = 100.0 * (1 - s651_growth_2020s / max(s651_growth_1990s, 1))

print(f'\\n=== §6.5.1 audit-resolved verdict ===')
print(f'Slur sense:                          {s651_slur_n:>3} / {s651_total:,} = {s651_slur_pct:.3f}% (essentially absent)')
print(f'Clinical-ID compound 1990s -> 2020s: {s651_clinical_1990s:>5,} -> {s651_clinical_2020s:>5,} ({s651_clinical_decline_pct:.0f}% decline; corroborates §5)')
print(f'Growth/developmental 1990s -> 2020s: {s651_growth_1990s:>5,} -> {s651_growth_2020s:>5,} ({s651_growth_decline_pct:.0f}% decline; bonus finding)')
print(f'\\nThe original INVERSION narrative was REFUTED by the audit + this re-analysis.')
print(f'The verb-form `retard*` corpus is dominated by scientific process-verb senses.')

# Keep the original variable names alive so the §6.5 scoreboard rows
# downstream don't go undefined; their semantics now reflect the
# audit-resolved analysis.
retarded_slur_yr = sense_counts['slur_explicit_mention']  # the actual slur trajectory
s65_mr_peak_yr = int(clinical_mr_yr.idxmax())
s65_mr_peak_n = int(clinical_mr_yr.max())
s65_slur_peak_yr = int(retarded_slur_yr.idxmax()) if retarded_slur_yr.max() > 0 else None
s65_slur_peak_n = int(retarded_slur_yr.max())
s65_mr_2020s = int(clinical_mr_yr.loc[2020:].sum())
s65_slur_2020s = int(retarded_slur_yr.loc[2020:].sum())

s65_mr_peak_yr = int(clinical_mr_yr.idxmax())
s65_mr_peak_n = int(clinical_mr_yr.max())
s65_slur_peak_yr = int(retarded_slur_yr.idxmax())
s65_slur_peak_n = int(retarded_slur_yr.max())
s65_mr_2020s = int(clinical_mr_yr.loc[2020:].sum())
s65_slur_2020s = int(retarded_slur_yr.loc[2020:].sum())

print(f'Clinical "mental retardation":  peak {s65_mr_peak_n:>5} in {s65_mr_peak_yr}; 2020s sum {s65_mr_2020s:>6,}')
print(f'Slur form "retarded":           peak {s65_slur_peak_n:>5} in {s65_slur_peak_yr}; 2020s sum {s65_slur_2020s:>6,}')
print(f'\\nClinical retired, slur survived. The retirement did NOT eliminate the word —')
print(f'it shifted from clinical usage into stigma-research usage. Inversion ratio:')
print(f'  slur 2020s / clinical 2020s = {s65_slur_2020s / max(s65_mr_2020s, 1):.1f}x')

Total records 1990-2024 containing verb/adj form of retard*: 95,862
\nPer-sense totals (35-year sum):
unknown                             37633
clinical_intellectual_disability    24039
growth_developmental                17814
biology_oncology_process_verb        9754
psychomotor_psychiatric              2623
chemistry_materials_process_verb     2472
scientific_process_passive_voice     1122
food_science                          134
environmental_agricultural            115
speech_language                        71
physics_retarded_potential             43
bone_skeletal                          38
slur_explicit_mention                   4
\n=== §6.5.1 audit-resolved verdict ===
Slur sense:                            4 / 95,862 = 0.004% (essentially absent)
Clinical-ID compound 1990s -> 2020s: 7,249 -> 1,688 (77% decline; corroborates §5)
Growth/developmental 1990s -> 2020s: 4,846 -> 2,928 (40% decline; bonus finding)
\nThe original INVERSION narrative was REFUTED by the audit + this re-analysis.
The verb-form `retard*` corpus is dominated by scientific process-verb senses.
Clinical "mental retardation":  peak  1087 in 2009; 2020s sum  1,960
Slur form "retarded":           peak     1 in 2010; 2020s sum      0
\nClinical retired, slur survived. The retirement did NOT eliminate the word —
it shifted from clinical usage into stigma-research usage. Inversion ratio:
  slur 2020s / clinical 2020s = 0.0x

# Stacked area showing all 7 senses across 1990-2023. Process-verb senses
# dominate; slur sense is essentially absent. This is the headline visual
# evidence behind the §6.5.1 audit-resolved interpretation.
# Truncate at _PLOT_YEAR_MAX (2023) — see §1 chart cell for rationale.
_sense_long = (sense_counts[sense_counts.index <= _PLOT_YEAR_MAX].reset_index()
                            .melt(id_vars='year', var_name='sense', value_name='records')
                            .sort_values(['year', 'sense']))
# Order: scientific senses first (largest), clinical compound middle, slur last
_sense_order = (sense_counts.sum(axis=0)
                            .sort_values(ascending=False).index.tolist())
_palette = ['#264653', '#2a9d8f', '#8ab17d', '#e9c46a',
            '#f4a261', '#e76f51', '#9d2424']
_sense_chart = alt.Chart(_sense_long).mark_area(opacity=0.85).encode(
    x=alt.X('year:O', title='Year', axis=alt.Axis(values=list(range(1990, 2025, 5)), labelOverlap=True)),
    y=alt.Y('records:Q', title='records / year (stacked by sense)', stack='zero'),
    color=alt.Color('sense:N', sort=_sense_order, title='Sense',
                     scale=alt.Scale(domain=_sense_order, range=_palette[:len(_sense_order)])),
    order=alt.Order('sense:N', sort='ascending'),
    tooltip=['year:O', 'sense:N', 'records:Q'],
).properties(width=720, height=300,
    title='§6.5.1 retard* sense-decomposition 1990-2024 (audit-resolved): process-verb senses dominate; slur essentially absent')
_sense_chart

polysemy = pd.read_csv(Path('..') / 'data' / 'polysemy_audit_classifications.csv')
print(f'Total Tier-2/3 labels audited: {len(polysemy)}')
print(f'\\nPer-verdict counts:')
print(polysemy['verdict'].value_counts().to_string())
print(f'\\n=== Polysemy-audited inventory (19 labels) ===\\n')
pd.set_option('display.max_colwidth', 60)
pd.set_option('display.width', 200)
print(polysemy[['label', 'intended_n', 'sampled_n', 'intended_pct',
                 'verdict', 'dominant_alternative_sense']].to_string(index=False))

# §6.5.1b evidence variables for the scoreboard
s651b_total = len(polysemy)
s651b_collision = int((polysemy['verdict'] == 'COLLISION').sum())
s651b_drift = int((polysemy['verdict'] == 'DRIFT').sum())
s651b_valid_era = int((polysemy['verdict'] == 'VALID-ERA-CLINICAL').sum())
s651b_valid_persistent = int((polysemy['verdict'] == 'VALID-PERSISTENT').sum())
s651b_unmeasurable = int((polysemy['verdict'] == 'UNMEASURABLE').sum())
s651b_unclassifiable = int((polysemy['verdict'] == 'UNCLASSIFIABLE').sum())

Total Tier-2/3 labels audited: 18
\nPer-verdict counts:
verdict
COLLISION             7
VALID-ERA-CLINICAL    6
VALID-PERSISTENT      2
DRIFT                 2
UNCLASSIFIABLE        1
\n=== Polysemy-audited inventory (19 labels) ===\n
               label  intended_n  sampled_n  intended_pct            verdict                                                                               dominant_alternative_sense
T3_retarded_morpheme           0         20           0.0          COLLISION                                                    scientific process verb (chemistry/biology/materials)
   T3_dwarf_clinical           2         20          10.0          COLLISION                                                                plant breeding (wheat/sorghum semi-dwarf)
          T3_lunatic           4         20          20.0          COLLISION                                                                      Lunatic Fringe Notch-signaling gene
           T3_midget           0         18           0.0          COLLISION                                                  retinal midget bipolar cells + ice hockey midget league
T3_imbecile_clinical           7          8          87.5 VALID-ERA-CLINICAL                           1954 clinical-era IQ classification (label renamed iter-3: _slur -> _clinical)
 T2_spastic_clinical          20         20         100.0   VALID-PERSISTENT                                           cerebral palsy clinical literature (still active clinical use)
  T2_mongoloid_idiot           3          3         100.0 VALID-ERA-CLINICAL                                                                      1963 Down-syndrome cytogenetics era
       T2_dope_fiend           2          2         100.0 VALID-ERA-CLINICAL                                                                                1972 addiction historical
          T3_bastard           1          1           NaN     UNCLASSIFIABLE                                                                                                      n=1
        T2_frigidity           1         20           5.0          COLLISION                                                     cold temperatures (frigid regions/materials/animals)
    T2_homosexuality           3         20          15.0              DRIFT topic/population descriptor (HIV/gay health/advocacy); term stayed but framing shifted away from disease
  T2_idiocy_clinical          20         20         100.0 VALID-ERA-CLINICAL                                                         amaurotic idiocy / Tay-Sachs historical compound
     T2_illegitimate          20         20         100.0 VALID-ERA-CLINICAL                                                    era-clinical social medicine on illegitimate children
         T2_imbecile           9          9         100.0 VALID-ERA-CLINICAL                                                                           era-clinical IQ classification
            T2_moron           0         10           0.0          COLLISION                                                bacteriophage moron gene elements; moronic acid chemistry
         T3_deformed          16         16         100.0   VALID-PERSISTENT                                          modern reconstructive surgery (facial deformity/cleft lip etc.)
        T3_hottentot           0          4           0.0              DRIFT                                                                 Khoisan population-genetics anthropology
           T3_kaffir           0          9           0.0          COLLISION                                                          kaffir lime (Citrus hystrix / makrut) botanical

_pal_verdict = {
    'VALID-ERA-CLINICAL': '#2a9d8f',
    'VALID-PERSISTENT':   '#264653',
    'COLLISION':          '#e63946',
    'DRIFT':              '#f4a261',
    'UNMEASURABLE':       '#bbbbbb',
    'UNCLASSIFIABLE':     '#dddddd',
}
_p = polysemy.copy()
_p['intended_pct_clean'] = pd.to_numeric(_p['intended_pct'], errors='coerce').fillna(0.0)
# Order: COLLISION at top (red, eye-catching), then DRIFT, then VALIDs
_verdict_rank = {'COLLISION': 0, 'DRIFT': 1, 'VALID-ERA-CLINICAL': 2,
                 'VALID-PERSISTENT': 3, 'UNMEASURABLE': 4, 'UNCLASSIFIABLE': 5}
_p['vrk'] = _p['verdict'].map(_verdict_rank).fillna(99)
_p = _p.sort_values(['vrk', 'intended_pct_clean'], ascending=[True, False]).reset_index(drop=True)
_label_order = _p['label'].tolist()
_pbar = alt.Chart(_p).mark_bar().encode(
    y=alt.Y('label:N', sort=_label_order, title=None),
    x=alt.X('intended_pct_clean:Q', title='% sampled PMIDs in INTENDED sense (random-20 audit)',
            scale=alt.Scale(domain=[0, 100])),
    color=alt.Color('verdict:N', title='Verdict',
                     scale=alt.Scale(domain=list(_pal_verdict.keys()),
                                      range=list(_pal_verdict.values()))),
    tooltip=['label', 'verdict', 'intended_pct', 'sampled_n', 'dominant_alternative_sense'],
).properties(width=560, height=420,
    title=f'§6.5.1b polysemy survey: {s651b_collision}/{s651b_total} = {100*s651b_collision/s651b_total:.0f}% COLLISION rate; intended-sense % per label')
# 75% reference line — the threshold for VALID classification
_thresh = alt.Chart(pd.DataFrame({'x': [75]})).mark_rule(
    strokeDash=[4, 4], color='#444').encode(x='x:Q')
_pbar + _thresh

slur_wsi = pd.read_csv(Path('..') / 'data' / 'slur_wsi_combined.csv')
print(f'Labels in iter-4 WSI: {slur_wsi["label"].nunique()}')
print(f'Total label-year-sense rows: {len(slur_wsi):,}')

# Per-label slur-fraction summary
_rows = []
for label, sub in slur_wsi.groupby('label'):
    total = int(sub['n_records'].sum())
    slur_n = int(sub[sub['sense'] == 'slur_explicit_mention']['n_records'].sum())
    by_sense = sub.groupby('sense')['n_records'].sum().sort_values(ascending=False)
    # Dominant non-slur sense
    non_slur = by_sense.drop('slur_explicit_mention', errors='ignore')
    if len(non_slur):
        dom_sense = str(non_slur.index[0])
        dom_n = int(non_slur.iloc[0])
        dom_pct = 100.0 * dom_n / max(total, 1)
    else:
        dom_sense, dom_n, dom_pct = ('(none)', 0, 0.0)
    _rows.append({
        'label': label,
        'total_records': total,
        'slur_n': slur_n,
        'slur_pct': round(100.0 * slur_n / max(total, 1), 3),
        'dominant_sense': dom_sense,
        'dominant_n': dom_n,
        'dominant_pct': round(dom_pct, 1),
    })
slur_summary = pd.DataFrame(_rows).sort_values('total_records', ascending=False).reset_index(drop=True)
print(f'\\n=== iter-4 slur WSI: per-label corpus-wide slur fractions ===\\n')
with pd.option_context('display.max_colwidth', 40, 'display.width', 200):
    print(slur_summary.to_string(index=False))

# §6.5.1c evidence variables for the scoreboard
s651c_n_labels = int(len(slur_summary))
s651c_total_records = int(slur_summary['total_records'].sum())
s651c_total_slur = int(slur_summary['slur_n'].sum())
s651c_slur_pct = 100.0 * s651c_total_slur / max(s651c_total_records, 1)
s651c_labels_with_any_slur = int((slur_summary['slur_n'] > 0).sum())

Labels in iter-4 WSI: 23
Total label-year-sense rows: 5,536
\n=== iter-4 slur WSI: per-label corpus-wide slur fractions ===\n
                label  total_records  slur_n  slur_pct               dominant_sense  dominant_n  dominant_pct
       T3_lazar_leper          23161       1     0.004                      unknown       12960          56.0
    T3_dwarf_clinical          16219       0     0.000                      unknown       13005          80.2
     T2_hermaphrodite           7764       0     0.000                      unknown        4597          59.2
          T2_hysteria           4180       0     0.000                      unknown        3207          76.7
 T2_transsexual_xvest           3442       0     0.000                      unknown        2332          67.8
           T3_cripple           3040       0     0.000                      unknown        2779          91.4
      T2_neurasthenia            984       0     0.000                      unknown         859          87.3
  T2_psychopath_socio            974       0     0.000                      unknown         779          80.0
           T3_lunatic            585       0     0.000                      unknown         227          38.8
         T3_hunchback            479       0     0.000    drosophila_hunchback_gene         365          76.2
   T3_maniac_madhouse            374       0     0.000                      unknown         258          69.0
            T3_midget            354       0     0.000                      unknown         192          54.2
         T3_deaf_mute            339       0     0.000 historical_deafness_clinical         237          69.9
           T3_bushman            246       0     0.000                      unknown         216          87.8
     T3_siamese_twins            209       0     0.000                      unknown         146          69.9
 T3_imbecile_clinical            155       0     0.000                      unknown         129          83.2
T2_drunkard_inebriate            123       0     0.000                      unknown          73          59.3
  T3_oriental_disease             94       0     0.000 historical_clinical_compound          93          98.9
             T2_moron             94       0     0.000                      unknown          84          89.4
      T3_whore_harlot             60       0     0.000                      unknown          47          78.3
         T3_hottentot             58       0     0.000                      unknown          44          75.9
            T3_kaffir             46       0     0.000        botanical_kaffir_lime          41          89.1
  T3_monster_clinical              3       0     0.000                      unknown           2          66.7

# Render one stacked-area panel per label. Sense colour mapping is
# consistent: slur is always red, dominant non-slur is teal/blue,
# others fall into a calibrated palette.
_panels = []
_palette_seq = ['#264653', '#2a9d8f', '#8ab17d', '#e9c46a',
                '#f4a261', '#5a189a', '#6c757d', '#0077b6']
SLUR_LABEL_ORDER = list(slur_summary['label'])
for label in SLUR_LABEL_ORDER:
    sub = slur_wsi[(slur_wsi['label'] == label) & (slur_wsi['year'] <= _PLOT_YEAR_MAX)].copy()
    if not len(sub) or sub['n_records'].sum() == 0:
        continue
    # Order senses with slur LAST (so it draws on top), then by descending sum
    sense_totals = sub.groupby('sense')['n_records'].sum().sort_values(ascending=False)
    non_slur_senses = [s for s in sense_totals.index if s != 'slur_explicit_mention']
    sense_order = non_slur_senses + (['slur_explicit_mention']
                                       if 'slur_explicit_mention' in sense_totals.index else [])
    # Build colour scale
    domain = sense_order
    rng = []
    for i, s in enumerate(sense_order):
        if s == 'slur_explicit_mention':
            rng.append('#e63946')  # always red
        else:
            rng.append(_palette_seq[i % len(_palette_seq)])

    # Truncate sense name in legend for readability
    sub['sense_short'] = sub['sense'].str.slice(0, 32)
    domain_short = [s[:32] for s in domain]
    sub_dom = sub['sense_short'].tolist()

    total_n = int(sense_totals.sum())
    slur_n = int(sense_totals.get('slur_explicit_mention', 0))
    slur_pct = 100.0 * slur_n / max(total_n, 1)
    title = (f"{label}: n={total_n:,}  slur={slur_n}/{total_n} "
             f"({slur_pct:.3f}%)  dominant: {sense_order[0][:24]}")

    ch = alt.Chart(sub).mark_area(opacity=0.9).encode(
        x=alt.X('year:O', title=None,
                axis=alt.Axis(values=list(range(1950, 2025, 10)), labelOverlap=True)),
        y=alt.Y('n_records:Q', title='records / yr', stack='zero'),
        color=alt.Color('sense_short:N', sort=domain_short, title='Sense',
                         scale=alt.Scale(domain=domain_short, range=rng)),
        order=alt.Order('sense_short:N', sort='ascending'),
        tooltip=['label', 'year', 'sense', 'n_records'],
    ).properties(width=560, height=140, title=title)
    _panels.append(ch)

alt.vconcat(*_panels).resolve_scale(y='independent')

# Reproducible 20-PMID sample from the iter-5b-added records (those
# matching only the new morph forms, not the iter-3 forms).
import re as _re_d
df_retard = pd.read_parquet(Path('..') / 'data' / 'retard_abstracts.parquet')
_old_rx = _re_d.compile(r'\\b(retarded|retards|retard|retardation)\\b', _re_d.IGNORECASE)
_new_rx = _re_d.compile(r'\\b(retarding|retardations|retardant|retardants)\\b', _re_d.IGNORECASE)
df_retard['has_old'] = df_retard['text'].str.contains(_old_rx, na=False)
df_retard['has_new'] = df_retard['text'].str.contains(_new_rx, na=False)
new_only_audit = df_retard[df_retard['has_new'] & ~df_retard['has_old']]
print(f'Total records in broadened corpus: {len(df_retard):,}')
print(f'Records matching ONLY new morph forms: {len(new_only_audit):,}')
print(f'Records matching new + iter-3 forms (already classified): {int((df_retard["has_new"] & df_retard["has_old"]).sum()):,}')
print()
spot_sample = new_only_audit.sample(n=min(20, len(new_only_audit)), random_state=42)
print('=== Random 20 PMIDs (seed=42) from iter-5b-added records ===\\n')
for i, row in spot_sample.reset_index(drop=True).iterrows():
    _t = (row['title'][:130] if row['title'] else '(no title)')
    print(f'#{i+1:>2} [{row["year"]}] {_t}')

Total records in broadened corpus: 95,862
Records matching ONLY new morph forms: 0
Records matching new + iter-3 forms (already classified): 0

=== Random 20 PMIDs (seed=42) from iter-5b-added records ===\n

ext_rows = []
for label in loaded.label.unique():
    yr = loaded[loaded.label == label].set_index('year')['n_records'].sort_index()
    if yr.sum() < 5: continue
    peak_yr = int(yr.idxmax())
    last_5y = int(yr.loc[2020:].sum())
    peak_n = int(yr.max())
    if last_5y == 0 and peak_yr <= 1990:
        ext_rows.append({
            'label': label, 'peak_n': peak_n, 'peak_year': peak_yr,
            'total': int(yr.sum()), 'last_5y': last_5y,
        })
ext_df = pd.DataFrame(ext_rows).sort_values('peak_year')
print(f'Cleanly extinct loaded-vocabulary labels (peak <= 1990, zero records 2020s):')
print(ext_df.to_string(index=False))
s65_n_extinct = len(ext_df)

Cleanly extinct loaded-vocabulary labels (peak <= 1990, zero records 2020s):
                label  peak_n  peak_year  total  last_5y
T2_deep_sleep_therapy       3       1953     13        0
 T3_imbecile_clinical       8       1954    111        0
   T2_mongoloid_idiot       3       1963     19        0
        T2_dope_fiend       2       1972      5        0
           T3_bastard       1       1973     10        0

_e = ext_df.sort_values('peak_year').reset_index(drop=True)
_e['label_short'] = _e['label'].str.replace(r'^T[23]_', '', regex=True)
_order_e = _e['label'].tolist()
_lolli_line = alt.Chart(_e).mark_rule(stroke='#bbb', strokeWidth=2).encode(
    y=alt.Y('label:N', sort=_order_e, title=None,
            axis=alt.Axis(labelExpr="replace(datum.label, /^T[23]_/, '')")),
    x=alt.X('peak_year:Q', title='Year', scale=alt.Scale(domain=[1950, 2024])),
    x2=alt.value(720),  # placeholder; replaced via transform below
)
# Use a calc to put a horizontal lollipop: peak_year -> 2024
_e['end_year'] = 2024
_lolli_line = alt.Chart(_e).mark_rule(stroke='#bbb', strokeWidth=2).encode(
    y=alt.Y('label:N', sort=_order_e, title=None),
    x='peak_year:Q', x2='end_year:Q',
)
_peak_pts = alt.Chart(_e).mark_circle(size=180, color='#e76f51').encode(
    y=alt.Y('label:N', sort=_order_e),
    x=alt.X('peak_year:Q', title='Peak year (red) -> extinction (grey rule to 2020s)'),
    size=alt.Size('peak_n:Q', title='Peak count',
                   scale=alt.Scale(range=[50, 500])),
    tooltip=['label', 'peak_year', 'peak_n', 'total', 'last_5y'],
)
_zero_pts = alt.Chart(_e).mark_tick(thickness=3, color='#264653').encode(
    y=alt.Y('label:N', sort=_order_e),
    x=alt.value(720),
)
(_lolli_line + _peak_pts).properties(width=560, height=max(180, 22*len(_e)),
    title=f'§6.5.2 clean extinctions: {len(_e)} loaded-vocab labels peaking pre-1990 with zero 2020s records')

zero_rows = []
for label in loaded.label.unique():
    yr = loaded[loaded.label == label]['n_records'].sum()
    if yr == 0:
        zero_rows.append({'label': label, 'total': 0,
                          'interpretation': '0 records across 1950-2024 — never indexed or scrubbed'})
zero_df = pd.DataFrame(zero_rows)
print(f'Tier-3 labels with zero records across the full study window:')
print(zero_df.to_string(index=False))
s65_n_zero = len(zero_df)

Tier-3 labels with zero records across the full study window:
               label  total                                         interpretation
T2_dysaesthesia_aeth      0 0 records across 1950-2024 — never indexed or scrubbed

persistent_rows = []
for label in loaded.label.unique():
    yr = loaded[loaded.label == label].set_index('year')['n_records'].sort_index()
    if yr.sum() < 100: continue
    peak_yr = int(yr.idxmax())
    last_5y = int(yr.loc[2020:].sum())
    if peak_yr >= 2015 and last_5y >= 50:
        persistent_rows.append({
            'label': label, 'peak_year': peak_yr,
            'total': int(yr.sum()), 'last_5y': last_5y,
        })
pers_df = pd.DataFrame(persistent_rows).sort_values('last_5y', ascending=False)
print(f'Persistent loaded-vocabulary terms (peak >= 2015 and 2020s sum >= 50):')
print(pers_df.to_string(index=False))
s65_n_persistent = len(pers_df)

Persistent loaded-vocabulary terms (peak >= 2015 and 2020s sum >= 50):
                      label  peak_year  total  last_5y
       T3_retarded_morpheme       2021  50450     5653
     T2_neonatal_abstinence       2021   9270     3451
          T3_dwarf_clinical       2024  15464     2955
T2_testosterone_replacement       2024   5381     1567
           T2_homosexuality       2016   4687      644
           T2_hermaphrodite       2018   5851      579
      T2_conversion_therapy       2024    851      548
                 T3_cripple       2021   1482      311
                T3_deformed       2023   1058      273
        T2_psychopath_socio       2018   1131      173
               T2_frigidity       2024    531      109
  T2_anabolic_steroid_abuse       2018    473      104
        T2_spastic_clinical       2024    574       95

# Join persistence counts to polysemy classifications so each persistent bar
# is colour-coded by whether the persistence is REAL (VALID-PERSISTENT) or
# an artefact of polysemy collision (COLLISION).
_pers_vd = pers_df.merge(
    polysemy[['label', 'verdict', 'dominant_alternative_sense']],
    on='label', how='left',
)
_pers_vd['verdict'] = _pers_vd['verdict'].fillna('NOT-AUDITED')
_pers_palette = {
    'VALID-PERSISTENT':   '#2a9d8f',
    'VALID-ERA-CLINICAL': '#8ab17d',
    'COLLISION':          '#e63946',
    'DRIFT':              '#f4a261',
    'NOT-AUDITED':        '#bbbbbb',
}
_pers_vd = _pers_vd.sort_values('last_5y', ascending=False).reset_index(drop=True)
_ord_p = _pers_vd['label'].tolist()
_perc = alt.Chart(_pers_vd).mark_bar().encode(
    y=alt.Y('label:N', sort=_ord_p, title=None),
    x=alt.X('last_5y:Q', title='2020s record count'),
    color=alt.Color('verdict:N', title='Polysemy verdict (from §6.5.1b)',
                     scale=alt.Scale(domain=list(_pers_palette.keys()),
                                      range=list(_pers_palette.values()))),
    tooltip=['label', 'last_5y', 'peak_year', 'verdict', 'dominant_alternative_sense'],
).properties(width=560, height=max(180, 22*len(_pers_vd)),
    title='§6.5.4 "persistent" labels: red = polysemy collision (apparent persistence is wrong sense); teal = genuine clinical persistence')
_perc

books_path = Path('..') / 'data' / 'books_ngrams_counts.csv'
books = pd.read_csv(books_path)
print(f'Google Books rows: {len(books):,}')
print(f'Shifts: {books["shift"].unique().tolist()}')
print(f'Year range: {books["year"].min()}-{books["year"].max()}')

Google Books rows: 1,800
Shifts: ['1960s_down', '1980s_ptsd', '1990s_did', '2010s_id', 'neg_suicide_phrasing']
Year range: 1900-2019

# Cross-corpus comparison: per-shift, find Books crossover and compare to PubMed
PUBMED_CROSSOVERS = {
    '1960s_down':           crossover,          # 1966
    '1980s_ptsd':           first_ptsd,         # 1980 (first PTSD record)
    '1990s_did':            first_did,          # 1994 (first DID record)
    '2010s_id':             crossover4,         # 2012
    'neg_suicide_phrasing': None,               # 0 records in PubMed
}

THRESH = 1e-8  # both Books-frequencies need to be above this for crossover to be meaningful
rows = []
for shift in books['shift'].unique():
    sub = books[books['shift'] == shift].copy()
    agg = sub.groupby(['year', 'side'])['frequency'].sum().unstack('side', fill_value=0)
    agg = agg.sort_index()
    old_peak = float(agg['old'].max())
    old_peak_yr = int(agg['old'].idxmax()) if old_peak > 0 else None
    valid = (agg['old'] > THRESH) | (agg['new'] > THRESH)
    cross_mask = (agg['new'] > agg['old']) & valid
    books_cross = int(cross_mask.idxmax()) if cross_mask.any() else None
    pubmed_cross = PUBMED_CROSSOVERS.get(shift)
    lag = (books_cross - pubmed_cross) if (books_cross and pubmed_cross) else None
    ratio_2019 = float(agg['new'].iloc[-1]) / max(float(agg['old'].iloc[-1]), 1e-15)
    rows.append({
        'shift': shift,
        'books_old_peak_yr': old_peak_yr,
        'pubmed_crossover': pubmed_cross,
        'books_crossover': books_cross,
        'lag_books_vs_pubmed': lag,
        'books_2019_new_over_old': round(ratio_2019, 2),
    })
cross_corpus = pd.DataFrame(rows)
print(cross_corpus.to_string(index=False))

               shift  books_old_peak_yr  pubmed_crossover  books_crossover  lag_books_vs_pubmed  books_2019_new_over_old
          1960s_down               1964               NaN           1978.0                  NaN                    54.55
          1980s_ptsd               1918            1980.0           1982.0                  2.0                    28.36
           1990s_did               1996            1994.0              NaN                  NaN                     0.57
            2010s_id               1978            2012.0           2016.0                  4.0                     1.55
neg_suicide_phrasing               2015               NaN              NaN                  NaN                     0.04

# For each shift, normalise both PubMed and Books to peak-of-the-pair = 1
# so the two corpora overlay on the same chart. The lag is the visual
# distance between the crossover marker on each line.
# Truncate PubMed at _PLOT_YEAR_MAX (2023); Books English-2019 already
# stops at 2019 (Google never released post-2019 ngrams).
_books_agg = (books.groupby(['shift', 'year', 'side'])['frequency']
                    .sum().reset_index())
_pubmed_yearly = []
for shift, parts in frames.items():
    for side, df in parts.items():
        if not len(df): continue
        df_trunc = df[df['year'] <= _PLOT_YEAR_MAX]
        g = df_trunc.groupby('year').size().reset_index(name='n_records')
        g['shift'] = shift; g['side'] = side; g['corpus'] = 'PubMed'
        g = g.rename(columns={'n_records': 'value'})
        _pubmed_yearly.append(g)
_pubmed_yr = pd.concat(_pubmed_yearly, ignore_index=True) if _pubmed_yearly else pd.DataFrame()
_books_agg = _books_agg.rename(columns={'frequency': 'value'})
_books_agg['corpus'] = 'GoogleBooks'

# Normalize: per (shift, corpus), divide by max across both sides
def _norm(group):
    m = group['value'].max() or 1.0
    group['norm'] = group['value'] / m
    return group
_pn = (_pubmed_yr.groupby(['shift', 'corpus'], group_keys=False).apply(_norm))
_bn = (_books_agg.groupby(['shift', 'corpus'], group_keys=False).apply(_norm))
_cc = pd.concat([_pn, _bn], ignore_index=True)
_cc = _cc[_cc['shift'].isin(['1960s_down', '1980s_ptsd', '1990s_did', '2010s_id'])]

_cc_charts = []
for sh in ['1960s_down', '1980s_ptsd', '1990s_did', '2010s_id']:
    sub = _cc[_cc['shift'] == sh].copy()
    if not len(sub): continue
    sub['series'] = sub['corpus'] + ' / ' + sub['side']
    ch = alt.Chart(sub).mark_line(strokeWidth=2).encode(
        x=alt.X('year:O', axis=alt.Axis(labelOverlap=True), title=None),
        y=alt.Y('norm:Q', title='norm to peak'),
        color=alt.Color('series:N', title=None,
                         scale=alt.Scale(domain=[
                             'PubMed / old', 'PubMed / new',
                             'GoogleBooks / old', 'GoogleBooks / new',
                         ],
                         range=['#e76f51', '#264653', '#f4a261', '#8ab17d'])),
        strokeDash=alt.condition(alt.FieldOneOfPredicate('corpus', ['GoogleBooks']),
                                  alt.value([4, 4]), alt.value([1, 0])),
        tooltip=['shift', 'corpus', 'side', 'year', 'value', 'norm'],
    ).properties(width=720, height=160, title=f'§7 {sh}: PubMed (solid) vs Books (dashed), normalised')
    _cc_charts.append(ch)
alt.vconcat(*_cc_charts).resolve_scale(y='shared')

sui_books = books[books['shift'] == 'neg_suicide_phrasing'].copy()
sui_pivot = sui_books.pivot(index='year', columns='ngram', values='frequency').fillna(0)
print(f'Books frequencies (note units are per-year-normalized, so very small):\\n')
recent = sui_pivot.loc[2000:2019]
print(recent.to_string(float_format=lambda x: f'{x:.3e}'))
s7_books_died_2000 = float(sui_pivot.loc[2000, 'died by suicide']) if 'died by suicide' in sui_pivot.columns else 0.0
s7_books_died_2019 = float(sui_pivot.loc[2019, 'died by suicide']) if 'died by suicide' in sui_pivot.columns else 0.0
s7_books_growth_ratio = s7_books_died_2019 / max(s7_books_died_2000, 1e-15)
print(f'\\n"died by suicide" growth 2000 -> 2019 in Books: {s7_books_growth_ratio:.1f}x')
print(f'PubMed records of "died by suicide" 2000-2024: 0 (zero growth)')

Books frequencies (note units are per-year-normalized, so very small):\n
ngram  committed suicide  died by suicide
year                                     
2000           1.089e-06        8.134e-09
2001           1.126e-06        1.290e-08
2002           1.153e-06        1.006e-08
2003           1.132e-06        1.170e-08
2004           1.201e-06        2.399e-08
2005           1.225e-06        1.575e-08
2006           1.243e-06        1.943e-08
2007           1.264e-06        2.095e-08
2008           1.232e-06        1.642e-08
2009           1.359e-06        2.273e-08
2010           1.338e-06        2.435e-08
2011           1.412e-06        2.541e-08
2012           1.253e-06        2.573e-08
2013           1.312e-06        2.447e-08
2014           1.419e-06        2.928e-08
2015           1.429e-06        3.054e-08
2016           1.393e-06        3.352e-08
2017           1.260e-06        4.179e-08
2018           1.229e-06        4.113e-08
2019           1.318e-06        5.735e-08
\n"died by suicide" growth 2000 -> 2019 in Books: 7.1x
PubMed records of "died by suicide" 2000-2024: 0 (zero growth)

# Books frequencies are per-million-word rates; PubMed is record-counts.
# Show Books on log-scale alongside an explicit "PubMed = 0" annotation.
_b_long = (sui_pivot.reset_index()
                     .melt(id_vars='year', var_name='ngram', value_name='freq'))
_b_long = _b_long[_b_long['year'] >= 1970]
_books_line = alt.Chart(_b_long).mark_line(strokeWidth=2).encode(
    x=alt.X('year:O', axis=alt.Axis(values=list(range(1970, 2020, 5))), title='Year'),
    y=alt.Y('freq:Q', title='Google Books frequency (log scale)',
            scale=alt.Scale(type='log', domainMin=1e-10)),
    color=alt.Color('ngram:N', title='Phrase',
                     scale=alt.Scale(range=['#e76f51', '#264653'])),
    tooltip=['ngram', 'year', 'freq'],
).properties(width=720, height=240,
    title=f'§7.1 books: "died by suicide" grew {s7_books_growth_ratio:.0f}x 2000-2019 — PubMed: 0 records (advocacy phrase didn\'t cross into peer-reviewed medical literature)')
_books_line

# Step-A counts loaded from data/pubmed_full_counts.csv (built earlier
# by build/fetch_pubmed.py --full). Here we sum per-label totals across
# the years our abstract corpus covers, then compute the retention.
step_a = pd.read_csv(Path('..') / 'data' / 'pubmed_full_counts.csv')

# Map abstract-corpus shift labels -> Step-A labels
STEPA_MAP = {
    '1960s_down_old':           'ID_old_mongolism',
    '1960s_down_new':           'ID_new_down',
    '1980s_ptsd_old':           'TRAUMA_old_shell_shock',
    '1980s_ptsd_new':           'TRAUMA_new_ptsd',
    '1990s_did_old':            'DISSOC_old_mpd',
    '1990s_did_new':            'DISSOC_new_did',
    '2010s_id_old':             'ID_old_mental_retardation',
    '2010s_id_new':             'ID_new_intellectual',
    'neg_suicide_phrasing_old': 'SUI_old_committed',
    'neg_suicide_phrasing_new': 'SUI_new_died_by',
}

rows = []
for (shift, info) in SHIFTS.items():
    for side in ('old', 'new'):
        k = f'{shift}_{side}'
        sa_label = STEPA_MAP.get(k)
        if sa_label is None: continue
        sa = int(step_a[step_a['label'] == sa_label]['n_records'].sum())
        df = frames[shift][side]
        sb = len(df)
        # True negatives (sa == 0 AND sb == 0, as designed for the negative-
        # finding row) get retention NaN, not zero — they should be reported
        # as "n/a" and excluded from the retention-floor check.
        if sa == 0 and sb == 0:
            ratio = float('nan')
            flag = 'OK (true negative)'
        elif sa == 0:
            ratio = float('inf')
            flag = 'CHECK (Step-A 0 but Step-B > 0)'
        else:
            ratio = sb / sa
            flag = 'OK' if ratio >= 0.80 else 'CHECK'
        rows.append({'shift_side': k, 'step_a': sa, 'step_b': sb, 'retention': ratio, 'flag': flag})
consistency = pd.DataFrame(rows)
print(consistency.to_string(index=False))
# Worst retention over real (non-NaN, finite) cases only
real_ratios = consistency['retention'].replace([float('inf')], float('nan')).dropna()
print(f'\nWorst retention (excluding true negatives): {real_ratios.min():.2f}')
print(f'Records flagged for follow-up: {(consistency["flag"].str.startswith("CHECK")).sum()}')

              shift_side  step_a  step_b  retention               flag
          1960s_down_old    1546    1546   1.000000                 OK
          1960s_down_new   32964   30282   0.918639                 OK
          1980s_ptsd_old     265     248   0.935849                 OK
          1980s_ptsd_new   59213   50433   0.851722                 OK
           1990s_did_old     652     635   0.973926                 OK
           1990s_did_new     574     520   0.905923                 OK
            2010s_id_old   37077   35440   0.955849                 OK
            2010s_id_new   35521   29290   0.824583                 OK
neg_suicide_phrasing_old    1941    1803   0.928903                 OK
neg_suicide_phrasing_new       0       0        NaN OK (true negative)

Worst retention (excluding true negatives): 0.82
Records flagged for follow-up: 0

placebo_years = [1985, 1995, 2000, 2020, 2023]
real_anchor = anchor4  # 2012

old_yr_long = old4.groupby('year').size().reindex(range(1980, 2025), fill_value=0)
new_yr_long = new4.groupby('year').size().reindex(range(1980, 2025), fill_value=0)

rows = []
for yr in [real_anchor] + placebo_years:
    # Re-detect crossover assuming `yr` is the anchor: window ±5 years around it.
    window = range(yr - 5, yr + 6)
    cross_in_window = next((y for y in window
                             if new_yr_long[y] > old_yr_long[y] and (new_yr_long[y]+old_yr_long[y]) >= 5),
                            None)
    rows.append({
        'anchor': yr,
        'is_real': yr == real_anchor,
        'crossover_in_window': cross_in_window,
        'aligns': cross_in_window is not None and abs(cross_in_window - yr) <= 2,
    })
placebo_df = pd.DataFrame(rows)
print(placebo_df.to_string(index=False))
print(f'\nReal anchor crossover in-window: {placebo_df[placebo_df.is_real].aligns.iloc[0]}')
print(f'Placebo anchors that "align": {placebo_df[(~placebo_df.is_real) & placebo_df.aligns].shape[0]} / 5')

 anchor  is_real  crossover_in_window  aligns
   2012     True               2012.0    True
   1985    False                  NaN   False
   1995    False                  NaN   False
   2000    False                  NaN   False
   2020    False               2015.0   False
   2023    False               2018.0   False

Real anchor crossover in-window: True
Placebo anchors that "align": 0 / 5

import time as _t

pre_id  = pcd.from_dataframe(old4[old4['year'] >= 2005], text_col='text', meta_cols=('year','journal'))
post_id = pcd.from_dataframe(new4[new4['year'] >= 2010], text_col='text', meta_cols=('year','journal'))
key_id = pcd.compare(pre_id, post_id).keyness(
    min_count=30, formula='dunning', stop_words=PUBMED_STOP, multiple_comparisons='bh',
)
obs_max = float(key_id.to_df()['g2'].abs().max())

# Shuffled null
all_docs = pd.concat([
    old4[old4['year'] >= 2005].assign(_label='old'),
    new4[new4['year'] >= 2010].assign(_label='new'),
], ignore_index=True)
n_a = (all_docs['_label'] == 'old').sum()

B = 99
rng = np.random.default_rng(0)
perm_max = []
_t0 = _t.time()
for b in range(B):
    perm = all_docs.sample(frac=1.0, random_state=rng.integers(0, 1 << 31)).reset_index(drop=True)
    a_p = pcd.from_dataframe(perm.iloc[:n_a], text_col='text')
    b_p = pcd.from_dataframe(perm.iloc[n_a:], text_col='text')
    try:
        kn = pcd.compare(a_p, b_p).keyness(min_count=30, formula='dunning', stop_words=PUBMED_STOP)
        perm_max.append(float(kn.to_df()['g2'].abs().max()))
    except Exception:
        continue
elapsed = _t.time() - _t0

p95 = float(np.percentile(perm_max, 95))
print(f'Observed max |G^2| (real labels): {obs_max:,.0f}')
print(f'Permuted null max |G^2|, B={len(perm_max)}: median {np.median(perm_max):,.0f}, 95th pct {p95:,.0f}')
print(f'Ratio observed / 95th-pct null: {obs_max / p95:.0f}x')
print(f'Walltime: {elapsed:.0f}s')

Observed max |G^2| (real labels): 30,028
Permuted null max |G^2|, B=99: median 115, 95th pct 239
Ratio observed / 95th-pct null: 126x
Walltime: 757s

_k5 = key5_ci.to_df()
_k5 = _k5[_k5['p_adjusted'].notna()].copy()
_bh_sig = _k5['p_adjusted'] < 0.05
_ci_excl = (_k5['g2_ci_lower'] > 0) | (_k5['g2_ci_upper'] < 0)
n_both = int((_bh_sig & _ci_excl).sum())
n_bh_only = int((_bh_sig & ~_ci_excl).sum())
n_ci_only = int((~_bh_sig & _ci_excl).sum())
n_either = int((_bh_sig | _ci_excl).sum())
s84_disagree_ratio = (n_bh_only + n_ci_only) / max(1, n_either)
print(f'BH-significant:          {int(_bh_sig.sum())}')
print(f'CI excludes 0:           {int(_ci_excl.sum())}')
print(f'Both flagged:            {n_both}')
print(f'BH only (CI straddles):  {n_bh_only}')
print(f'CI only (not BH-sig):    {n_ci_only}')
print(f'Disagreement / either-flagged ratio: {s84_disagree_ratio:.3f}')

BH-significant:          4222
CI excludes 0:           3911
Both flagged:            3785
BH only (CI straddles):  437
CI only (not BH-sig):    126
Disagreement / either-flagged ratio: 0.129

mc_rows = []
for mc in [10, 30, 50, 100, 200]:
    try:
        kk = pcd.compare(mr_pre, id_post).keyness(
            min_count=mc, formula='dunning', stop_words=PUBMED_STOP,
            multiple_comparisons='bh',
        )
        kdf = kk.to_df()
        top3_pre = ','.join(kdf[kdf['log_ratio'] > 0].head(3)['term'].tolist())
        top3_post = ','.join(kdf[kdf['log_ratio'] < 0].head(3)['term'].tolist())
        mc_rows.append({'min_count': mc, 'n_terms': len(kdf),
                        'top-3 pre-anchor': top3_pre, 'top-3 post-anchor': top3_post})
    except Exception as e:
        mc_rows.append({'min_count': mc, 'n_terms': 0, 'error': str(e)[:50]})
mc_df = pd.DataFrame(mc_rows)
print(mc_df.to_string(index=False))
_pre_sets = [set(s.strip() for s in r.split(',')) for r in mc_df['top-3 pre-anchor']]
_post_sets = [set(s.strip() for s in r.split(',')) for r in mc_df['top-3 post-anchor']]
s85_pre_stable = all(s == _pre_sets[0] for s in _pre_sets)
s85_post_stable = all(s == _post_sets[0] for s in _post_sets)
print(f'\\npre-anchor top-3 stable across {len(mc_rows)} min_count values:  {s85_pre_stable}')
print(f'post-anchor top-3 stable across {len(mc_rows)} min_count values: {s85_post_stable}')

 min_count  n_terms      top-3 pre-anchor          top-3 post-anchor
        10    18494 retardation,mental,mr intellectual,disability,id
        30     9820 retardation,mental,mr intellectual,disability,id
        50     7268 retardation,mental,mr intellectual,disability,id
       100     4829 retardation,mental,mr intellectual,disability,id
       200     3056 retardation,mental,mr intellectual,disability,id
\npre-anchor top-3 stable across 5 min_count values:  True
post-anchor top-3 stable across 5 min_count values: True

from scipy.stats import spearmanr
id_post_yr = new_yr4.loc[2013:2024]
years_arr = id_post_yr.index.values.astype(float)
counts_arr = id_post_yr.values.astype(float)
rho, p_sp = spearmanr(years_arr, counts_arr)
s86_rho = float(rho)
s86_p = float(p_sp)
print(f'Spearman rho on (year, ID-count) 2013-2024: rho = {s86_rho:+.3f}, p = {s86_p:.2e}')
print(f'Monotonic rising (rho > 0.7): {s86_rho > 0.7}')

Spearman rho on (year, ID-count) 2013-2024: rho = +0.944, p = 3.93e-06
Monotonic rising (rho > 0.7): True

# Pre-specified thresholds (drafted with §0b pre-registration)
TH_CROSSOVER_TOL_60S = 5   # crossover must be within 5 years of 1965
TH_FIRST_PTSD_TOL    = 1   # first PTSD record within 1 year of 1980
TH_FIRST_DID_LO      = 1993
TH_FIRST_DID_HI      = 1995
TH_CROSSOVER_TOL_10S = 2   # ID crossover within 2 years of 2012
TH_RETENTION_FLOOR   = 0.80  # Step-A vs Step-B retention
TH_NULL_RATIO_FLOOR  = 10  # observed/null at 10x
TH_TOP15_CI_EXCL     = 10  # of top-15 keyness terms, this many should have per-term CI excluding 0
TH_BURST_ONSET_LO    = 1979  # PTSD burst onset window (DSM-III anchor 1980, ±1)
TH_BURST_ONSET_HI    = 1983
TH_RHO_FLOOR         = 0.70  # Spearman rho on ID post-anchor trajectory should rise
TH_BH_CI_DISAGREE    = 0.20  # disagreement ratio between BH and bootstrap CI
                              # (matches the CBD case-study threshold; tightened
                              # from 0.30 -> 0.20 in iter-3 audit to remove
                              # the unjustified goalpost-shift)

# §2 evidence
s2_cross = crossover
s2_pass = s2_cross is not None and abs(s2_cross - anchor1) <= TH_CROSSOVER_TOL_60S

# §3 evidence
s3_first_ptsd = first_ptsd
s3_pass = s3_first_ptsd is not None and abs(s3_first_ptsd - anchor2) <= TH_FIRST_PTSD_TOL

# §4 evidence
s4_first_did = first_did
s4_pass = s4_first_did is not None and TH_FIRST_DID_LO <= s4_first_did <= TH_FIRST_DID_HI

# §5 evidence
s5_cross = crossover4
s5_pass = s5_cross is not None and abs(s5_cross - anchor4) <= TH_CROSSOVER_TOL_10S

# §6 negative finding — falsifier was zero, observed is zero
s6_pass = len(new5) == 0  # honest record of the falsification

# §7.1 retention (exclude true-negative rows where sa == sb == 0)
_real_ratios = consistency['retention'].replace([float('inf')], float('nan')).dropna()
s71_worst = float(_real_ratios.min()) if len(_real_ratios) else float('nan')
s71_pass = (s71_worst >= TH_RETENTION_FLOOR) and not np.isnan(s71_worst)

# §7.2 placebo
s72_real_aligns = bool(placebo_df[placebo_df.is_real].aligns.iloc[0])
s72_placebos_align = int(placebo_df[(~placebo_df.is_real) & placebo_df.aligns].shape[0])
s72_pass = s72_real_aligns and s72_placebos_align <= 2  # tolerate up to 2/5 spurious

# §7.3 shuffled null
s73_ratio = obs_max / p95 if p95 > 0 else float('inf')
s73_pass = s73_ratio >= TH_NULL_RATIO_FLOOR

scoreboard = pd.DataFrame([
    ('§0d Cross-package Rayson G^2 byte-equality',
     f'worst absolute error across 6 reference cases: {float(xv["abs_error"].max()):.2e} (assertion floor 1e-10)',
     'PASS' if float(xv['abs_error'].max()) < 1e-10 else 'FAIL'),
    ('§2 mongolism -> Down syndrome',
     f'crossover {s2_cross} (anchor {anchor1}, tolerance ±{TH_CROSSOVER_TOL_60S})',
     'PASS' if s2_pass else 'FAIL (pre-registered)'),
    ('§2a Bootstrap CIs on §2 contextual keyness',
     f'top-15: per-term CI excludes 0 in {s2a_top15_per_term_excl}/15; simultaneous CI excludes 0 in {s2a_top15_sim_excl}/15',
     'PASS' if s2a_top15_per_term_excl >= TH_TOP15_CI_EXCL else 'PARTIAL'),
    ('§2b Collocation shift around "syndrome"',
     f'{len(s2b_df):,} collocates analysed; top |shift| at {s2b_df.iloc[0]["collocate"]!r} (shift={s2b_df.iloc[0]["shift"]:+.2f})' if len(s2b_df) else 'no collocates',
     'PASS' if len(s2b_df) > 0 else 'PARTIAL'),
    ('§3 shell shock -> PTSD',
     f'first PTSD record {s3_first_ptsd} (anchor {anchor2}, tolerance ±{TH_FIRST_PTSD_TOL})',
     'PASS' if s3_pass else 'FAIL (pre-registered)'),
    ('§3b Burstiness detection on PTSD annual series',
     f'first burst onset: {s3b_first_burst_year}; aligned with DSM-III 1980 (window {TH_BURST_ONSET_LO}-{TH_BURST_ONSET_HI}): {s3b_aligned}',
     'PASS' if s3b_aligned else 'PARTIAL'),
    ('§4 MPD -> DID',
     f'first DID record {s4_first_did} (pre-reg window 1993-1995)',
     'PASS' if s4_pass else 'PARTIAL'),
    ('§5 mental retardation -> intellectual disability',
     f'crossover {s5_cross} (anchor {anchor4}, tolerance ±{TH_CROSSOVER_TOL_10S})',
     'PASS' if s5_pass else 'PARTIAL'),
    ('§5a Bootstrap CIs on §5 contextual keyness',
     f'top-15: per-term CI excludes 0 in {s5a_top15_per_term_excl}/15; simultaneous CI excludes 0 in {s5a_top15_sim_excl}/15',
     'PASS' if s5a_top15_per_term_excl >= TH_TOP15_CI_EXCL else 'PARTIAL'),
    ('§5.5 SIRS/Sepsis-2 -> Sepsis-3 (operational-definition revision)',
     f'first Sepsis-3 record {s55_first_sepsis3} (pre-reg window 2015-2017); aligns: {s55_aligned}',
     'PASS' if s55_aligned else 'PARTIAL'),
    ('§5.5a Bootstrap CIs on §5.5 Sepsis-3 contextual keyness',
     f'top-15: per-term CI excludes 0 in {s55a_top15_per_term_excl}/15; simultaneous CI excludes 0 in {s55a_top15_sim_excl}/15',
     'PASS' if s55a_top15_per_term_excl >= TH_TOP15_CI_EXCL else 'PARTIAL'),
    ('§5.5b Cross-corpus: Sepsis-3 in ClinicalTrials.gov registrations 2010-2024',
     f'first year >= 5 Sepsis-3/qSOFA registrations: {s55b_first_sepsis3_year}; '
     f'SIRS-vs-Sepsis-3 crossover: {s55b_crossover_year}; '
     f'totals 2010-2024: SIRS={s55b_sirs_total:,}, Sepsis-3/qSOFA={s55b_sepsis3_total:,}',
     'PASS' if (s55b_first_sepsis3_year is not None and 2015 <= s55b_first_sepsis3_year <= 2017
                and s55b_crossover_year is not None and s55b_crossover_year <= 2018)
     else 'PARTIAL'),
    ('§5.6 Asperger -> ASD (dual-rationale retirement: terminology + ethics)',
     f'crossover {s56_crossover} (terminology pre-reg 2013-2015); post-2018 decline acceleration ratio {s56_acceleration_ratio:.2f}x (ethics pre-reg >= 1.5x)',
     'PASS' if (s56_terminology_pass and s56_ethics_pass) else ('PARTIAL' if s56_terminology_pass else 'FAIL')),
    ('§5.6a Bootstrap CIs on §5.6 Asperger->ASD contextual keyness',
     f'top-15: per-term CI excludes 0 in {s56a_top15_per_term_excl}/15; simultaneous CI excludes 0 in {s56a_top15_sim_excl}/15',
     'PASS' if s56a_top15_per_term_excl >= 8 else 'PARTIAL'),
    ('§5.6b Placebo-anchor sweep on §5.6 ethical-acceleration claim',
     f'2018 anchor crosses 1.5x: {s56b_real_crosses}; placebos crossing: {s56b_n_placebos_crossing}/5',
     'PASS' if s56b_pass else ('PARTIAL' if s56b_real_crosses else 'FAIL')),
    ('§5.7 DSM-5 substance-use-disorder family + discovery-of-abuse archetype (14 sub-shifts, 5 archetypes)',
     f'{s57_n_pass} PASS + {s57_n_partial} PARTIAL of {s57_n_total} sub-shifts; '
     f'includes 2 pre-registered NEGATIVE-prediction confirmations (§5.7.7 AAS asymmetric, §5.7.8 polysubstance retired)',
     'PASS' if s57_n_pass >= 9 else 'PARTIAL'),
    ('§5.7a Clustered bootstrap CIs on §5.7.1 alcohol post-2013 new-share',
     f'naive CI width {_naive_w:.4f} vs journal-clustered CI width {_clust_w:.4f}; ratio {_ratio:.2f}x (pre-reg >= 1.5x)',
     'PASS' if _clust_pass else 'PARTIAL'),
    ('§5.7d Polysemy demonstration on 6 single-token PubMed queries',
     f'{poly_n_pass}/{poly_n_total} tokens show single-token sense mixing (intended sense not modal OR exceeded by unintended)',
     'PASS' if poly_n_pass >= 5 else 'PARTIAL'),
    ('§5.7d-ii Unsupervised cross-check (pycorpdiff induce_senses vs regex buckets)',
     f'{poly_wsi_corroborated}/{poly_wsi_n} tokens above-chance agreement (ARI>0.1); '
     f'mean ARI {poly_wsi_mean_ari:.2f}; AAS clean (topically distinct), '
     f'weed near-zero (extreme sense imbalance) -- documented WSI limitation',
     'OBSERVED'),
    ('§6 NEGATIVE FINDING: "committed" -> "died by" suicide',
     f'"died by suicide" PubMed records: {len(new5)} (falsifier was zero)',
     'FAIL (pre-registered falsifier; honestly recorded)' if s6_pass else 'PASS'),
    ('§6.5.1 AUDIT-RESOLVED: word-sense decomposition of `retard*` (iter-1 BLOCKING refutation)',
     f'slur sense: {s651_slur_n}/{s651_total:,} records = {s651_slur_pct:.3f}% (essentially absent); clinical-ID compound declines {s651_clinical_decline_pct:.0f}% from 1990s to 2020s (corroborates §5)',
     'AUDIT-RESOLVED (prior INVERSION claim REFUTED; corrected interpretation: morpheme dominated by scientific process-verb senses, slur essentially absent)'),
    ('§6.5.1b POLYSEMY-AUDITED SURVEY (iter-2/3 generalisation of iter-1 finding)',
     f'{s651b_total} labels audited by random-20-PMID sense check: {s651b_collision} COLLISIONs, {s651b_drift} DRIFTs, {s651b_valid_era} VALID era-clinical, {s651b_valid_persistent} VALID-PERSISTENT, {s651b_unmeasurable} UNMEASURABLE, {s651b_unclassifiable} UNCLASSIFIABLE',
     f'META-FINDING: {s651b_collision}/{s651b_total} = {100*s651b_collision/s651b_total:.0f}% polysemy-collision rate is the prior risk for any single-token deprecated-medical-vocabulary tracking study'),
    ('§6.5.1c MULTI-LABEL SLUR WSI DEEP AUDIT (iter-4 full-corpus extension of §6.5.1)',
     f'{s651c_n_labels} slur-like labels WSI-classified across {s651c_total_records:,} PubMed records 1950-2024; corpus-wide explicit-slur fraction: {s651c_total_slur}/{s651c_total_records:,} = {s651c_slur_pct:.4f}%; {s651c_labels_with_any_slur}/{s651c_n_labels} labels had >=1 explicit slur record',
     f'CONFIRMED: corpus-wide slur fraction <{max(0.01, s651c_slur_pct):.2f}% for every label — single-token queries on slur-like English morphemes do NOT measure slur usage'),
    ('§6.5.2 Loaded-vocab clean extinctions',
     f'{s65_n_extinct} of 43 loaded-vocab labels are extinct (peak <= 1990 and zero records in 2020s)',
     'OBSERVED'),
    ('§6.5.3 ZERO-hit indexing-curation evidence',
     f'{s65_n_zero} zero-hit labels remain in the post-iter-4-curation inventory (iter-3 had 4; all removed in iter-4 ethical review)',
     'OBSERVED'),
    ('§6.5.4 Persistent loaded-vocab (not all retire)',
     f'{s65_n_persistent} labels persist with 2020s sum >= 50 records',
     'OBSERVED'),
    ('§7 Cross-corpus: PubMed vs Google Books',
     f'PubMed leads Books for {int((cross_corpus["lag_books_vs_pubmed"] > 0).sum())} of {len(cross_corpus)} shifts; Books-"died by suicide" growth 2000->2019: {s7_books_growth_ratio:.1f}x',
     'PASS' if s7_books_growth_ratio > 1 else 'PARTIAL'),
    ('AUDIT §8.1 Step-A/Step-B retention',
     f'worst retention {s71_worst:.2f} (floor {TH_RETENTION_FLOOR})',
     'PASS' if s71_pass else 'PARTIAL'),
    ('AUDIT §8.2 Placebo anchor years',
     f'real anchor aligns: {s72_real_aligns}; placebos aligning: {s72_placebos_align}/5',
     'PASS' if s72_pass else 'PARTIAL'),
    ('AUDIT §8.3 Shuffled-label null for §5 keyness',
     f'observed |G^2|={obs_max:,.0f}; 95th-pct null={p95:,.0f}; ratio {s73_ratio:.0f}x',
     'PASS' if s73_pass else 'PARTIAL'),
    ('AUDIT §8.4 BH-vs-bootstrap-CI alignment on §5 keyness',
     f'disagreement ratio: {s84_disagree_ratio:.3f} (tolerance {TH_BH_CI_DISAGREE})',
     'PASS' if s84_disagree_ratio <= TH_BH_CI_DISAGREE else 'PARTIAL'),
    ('AUDIT §8.5 min_count sensitivity for §5 keyness',
     f'pre-anchor top-3 stable: {s85_pre_stable}; post-anchor top-3 stable: {s85_post_stable}',
     'PASS' if (s85_pre_stable and s85_post_stable) else 'PARTIAL'),
    ('AUDIT §8.6 Spearman monotonic-trend on §5 ID 2013-2024',
     f'rho = {s86_rho:+.3f}, p = {s86_p:.2e} (floor rho > {TH_RHO_FLOOR})',
     'PASS' if s86_rho > TH_RHO_FLOOR else 'PARTIAL'),
], columns=['Check', 'Observed', 'Verdict'])

with pd.option_context('display.max_colwidth', 100, 'display.width', 200):
    print(scoreboard.to_string(index=False))

                                                                                                Check                                                                                                                                                                    Observed                                                                                                                                                 Verdict
                                                           §0d Cross-package Rayson G^2 byte-equality                                                                                             worst absolute error across 6 reference cases: 1.77e-11 (assertion floor 1e-10)                                                                                                                                                    PASS
                                                                        §2 mongolism -> Down syndrome                                                                                                                                  crossover None (anchor 1965, tolerance ±5)                                                                                                                                   FAIL (pre-registered)
                                                           §2a Bootstrap CIs on §2 contextual keyness                                                                                                 top-15: per-term CI excludes 0 in 15/15; simultaneous CI excludes 0 in 6/15                                                                                                                                                    PASS
                                                              §2b Collocation shift around "syndrome"                                                                                                          3,547 collocates analysed; top |shift| at 'twinning' (shift=+8.29)                                                                                                                                                    PASS
                                                                               §3 shell shock -> PTSD                                                                                                                          first PTSD record 1980 (anchor 1980, tolerance ±1)                                                                                                                                                    PASS
                                                       §3b Burstiness detection on PTSD annual series                                                                                                first burst onset: None; aligned with DSM-III 1980 (window 1979-1983): False                                                                                                                                                 PARTIAL
                                                                                        §4 MPD -> DID                                                                                                                            first DID record 1994 (pre-reg window 1993-1995)                                                                                                                                                    PASS
                                                     §5 mental retardation -> intellectual disability                                                                                                                                  crossover 2012 (anchor 2012, tolerance ±2)                                                                                                                                                    PASS
                                                           §5a Bootstrap CIs on §5 contextual keyness                                                                                                top-15: per-term CI excludes 0 in 15/15; simultaneous CI excludes 0 in 14/15                                                                                                                                                    PASS
                                     §5.5 SIRS/Sepsis-2 -> Sepsis-3 (operational-definition revision)                                                                                                        first Sepsis-3 record 1990 (pre-reg window 2015-2017); aligns: False                                                                                                                                                 PARTIAL
                                              §5.5a Bootstrap CIs on §5.5 Sepsis-3 contextual keyness                                                                                                top-15: per-term CI excludes 0 in 15/15; simultaneous CI excludes 0 in 10/15                                                                                                                                                    PASS
                           §5.5b Cross-corpus: Sepsis-3 in ClinicalTrials.gov registrations 2010-2024                                        first year >= 5 Sepsis-3/qSOFA registrations: 2016; SIRS-vs-Sepsis-3 crossover: 2017; totals 2010-2024: SIRS=219, Sepsis-3/qSOFA=385                                                                                                                                                    PASS
                               §5.6 Asperger -> ASD (dual-rationale retirement: terminology + ethics)                                                         crossover 1980 (terminology pre-reg 2013-2015); post-2018 decline acceleration ratio 2.38x (ethics pre-reg >= 1.5x)                                                                                                                                                    FAIL
                                         §5.6a Bootstrap CIs on §5.6 Asperger->ASD contextual keyness                                                                                                 top-15: per-term CI excludes 0 in 15/15; simultaneous CI excludes 0 in 4/15                                                                                                                                                    PASS
                                        §5.6b Placebo-anchor sweep on §5.6 ethical-acceleration claim                                                                                                                      2018 anchor crosses 1.5x: True; placebos crossing: 5/5                                                                                                                                                 PARTIAL
§5.7 DSM-5 substance-use-disorder family + discovery-of-abuse archetype (14 sub-shifts, 5 archetypes)                     2 PASS + 12 PARTIAL of 14 sub-shifts; includes 2 pre-registered NEGATIVE-prediction confirmations (§5.7.7 AAS asymmetric, §5.7.8 polysubstance retired)                                                                                                                                                 PARTIAL
                                  §5.7a Clustered bootstrap CIs on §5.7.1 alcohol post-2013 new-share                                                                                   naive CI width 0.0112 vs journal-clustered CI width 0.0414; ratio 3.72x (pre-reg >= 1.5x)                                                                                                                                                    PASS
                                        §5.7d Polysemy demonstration on 6 single-token PubMed queries                                                                              5/6 tokens show single-token sense mixing (intended sense not modal OR exceeded by unintended)                                                                                                                                                    PASS
                        §5.7d-ii Unsupervised cross-check (pycorpdiff induce_senses vs regex buckets)           5/6 tokens above-chance agreement (ARI>0.1); mean ARI 0.21; AAS clean (topically distinct), weed near-zero (extreme sense imbalance) -- documented WSI limitation                                                                                                                                                OBSERVED
                                                §6 NEGATIVE FINDING: "committed" -> "died by" suicide                                                                                                                    "died by suicide" PubMed records: 0 (falsifier was zero)                                                                                                      FAIL (pre-registered falsifier; honestly recorded)
            §6.5.1 AUDIT-RESOLVED: word-sense decomposition of `retard*` (iter-1 BLOCKING refutation)                                         slur sense: 4/95,862 records = 0.004% (essentially absent); clinical-ID compound declines 77% from 1990s to 2020s (corroborates §5) AUDIT-RESOLVED (prior INVERSION claim REFUTED; corrected interpretation: morpheme dominated by scientific process-verb senses, slur essentially absent)
                          §6.5.1b POLYSEMY-AUDITED SURVEY (iter-2/3 generalisation of iter-1 finding)                         18 labels audited by random-20-PMID sense check: 7 COLLISIONs, 2 DRIFTs, 6 VALID era-clinical, 2 VALID-PERSISTENT, 0 UNMEASURABLE, 1 UNCLASSIFIABLE                    META-FINDING: 7/18 = 39% polysemy-collision rate is the prior risk for any single-token deprecated-medical-vocabulary tracking study
                     §6.5.1c MULTI-LABEL SLUR WSI DEEP AUDIT (iter-4 full-corpus extension of §6.5.1) 23 slur-like labels WSI-classified across 62,983 PubMed records 1950-2024; corpus-wide explicit-slur fraction: 1/62,983 = 0.0016%; 1/23 labels had >=1 explicit slur record             CONFIRMED: corpus-wide slur fraction <0.01% for every label — single-token queries on slur-like English morphemes do NOT measure slur usage
                                                                §6.5.2 Loaded-vocab clean extinctions                                                                                            5 of 43 loaded-vocab labels are extinct (peak <= 1990 and zero records in 2020s)                                                                                                                                                OBSERVED
                                                           §6.5.3 ZERO-hit indexing-curation evidence                                                         1 zero-hit labels remain in the post-iter-4-curation inventory (iter-3 had 4; all removed in iter-4 ethical review)                                                                                                                                                OBSERVED
                                                      §6.5.4 Persistent loaded-vocab (not all retire)                                                                                                                              13 labels persist with 2020s sum >= 50 records                                                                                                                                                OBSERVED
                                                              §7 Cross-corpus: PubMed vs Google Books                                                                                       PubMed leads Books for 2 of 5 shifts; Books-"died by suicide" growth 2000->2019: 7.1x                                                                                                                                                    PASS
                                                                   AUDIT §8.1 Step-A/Step-B retention                                                                                                                                            worst retention 0.82 (floor 0.8)                                                                                                                                                    PASS
                                                                      AUDIT §8.2 Placebo anchor years                                                                                                                            real anchor aligns: True; placebos aligning: 0/5                                                                                                                                                    PASS
                                                        AUDIT §8.3 Shuffled-label null for §5 keyness                                                                                                                        observed |G^2|=30,028; 95th-pct null=239; ratio 126x                                                                                                                                                    PASS
                                                AUDIT §8.4 BH-vs-bootstrap-CI alignment on §5 keyness                                                                                                                                   disagreement ratio: 0.129 (tolerance 0.2)                                                                                                                                                    PASS
                                                      AUDIT §8.5 min_count sensitivity for §5 keyness                                                                                                               pre-anchor top-3 stable: True; post-anchor top-3 stable: True                                                                                                                                                    PASS
                                               AUDIT §8.6 Spearman monotonic-trend on §5 ID 2013-2024                                                                                                                                rho = +0.944, p = 3.93e-06 (floor rho > 0.7)                                                                                                                                                    PASS

_sb = scoreboard.copy()
_sb['check_short'] = _sb['Check'].str.replace(r'^(§[\d\.a-z]+)\s+', r'\1 ', regex=True).str.slice(0, 70)
def _verdict_class(v):
    s = str(v)
    if s.startswith('PASS'): return 'PASS'
    if s.startswith('AUDIT-RESOLVED') or 'AUDIT-RESOLVED' in s: return 'AUDIT-RESOLVED'
    if s.startswith('META-FINDING'): return 'META-FINDING'
    if s.startswith('PARTIAL'): return 'PARTIAL'
    if s.startswith('FAIL'): return 'FAIL'
    if s.startswith('OBSERVED'): return 'OBSERVED'
    return 'OTHER'
_sb['verdict_class'] = _sb['Verdict'].apply(_verdict_class)
_sb['row_idx'] = range(len(_sb))
_pal_sb = {
    'PASS':            '#2a9d8f',
    'PARTIAL':         '#e9c46a',
    'FAIL':            '#e63946',
    'AUDIT-RESOLVED':  '#9d4edd',
    'META-FINDING':    '#3a86ff',
    'OBSERVED':        '#888888',
    'OTHER':           '#cccccc',
}
_strip_sb = alt.Chart(_sb).mark_rect(stroke='white', strokeWidth=1).encode(
    y=alt.Y('check_short:N', sort=_sb['check_short'].tolist(), title=None),
    x=alt.value(0), x2=alt.value(540),
    color=alt.Color('verdict_class:N', title='Verdict class',
                     scale=alt.Scale(domain=list(_pal_sb.keys()),
                                      range=list(_pal_sb.values()))),
    tooltip=['Check', 'Observed', 'Verdict'],
).properties(width=540, height=max(22*len(_sb), 200),
    title='§9 scoreboard verdicts (green PASS, yellow PARTIAL, red FAIL, purple AUDIT-RESOLVED, blue META, grey OBSERVED)')
_strip_sb

Era	Old term	New term	Anchor event
1960s	mongolism, Mongolian idiocy	Down syndrome, trisomy 21	Lancet 1961; WHO ICD-8 ~1965
1980s	shell shock, war neurosis, combat fatigue	post-traumatic stress disorder (PTSD)	DSM-III publication 1980
1990s	multiple personality disorder (MPD)	dissociative identity disorder (DID)	DSM-IV publication 1994
2010s	mental retardation	intellectual disability	Rosa's Law (US, 2010) + DSM-5 (2013)
—	"committed suicide"	"died by suicide"	AAS / AFSP style recommendations 2008–2017 (negative finding)

Shift	Pre-registered claim	Tolerance / falsifier
1960s Down syndrome	"mongolism" count peaks before 1970 and falls to ~0 by 2010; "Down syndrome" rises monotonically post-1965	crossover year within ±5 of 1965
1980s PTSD	"post-traumatic stress disorder" goes from ~0 pre-1980 to dominant by 1990	first appearance year within 1979–1981
1990s DID	"dissociative identity disorder" emerges 1993–1995; "MPD" persists in retrospective lit	first DID record within 1993–1995
2010s ID	"intellectual disability" overtakes "mental retardation" between 2010 and 2015	crossover year within ±2 of 2012
Suicide phrasing	"died by suicide" has measurable PubMed penetration by 2020	FALSIFIER: count == 0 would refute the prediction

#	Failure mode	Mitigation
1	Automatic Term Mapping expands an unqualified search term through MeSH synonyms. Querying `(mongolism OR "Mongolian idiocy")[Title/Abstract]` returns Down-syndrome papers because Entrez's translation rewrites it to include `"down syndrome"[MeSH Terms]` and friends — yielding ~2,200 hits in 2020 when the literal word `mongolism` returns 0	Apply `[Title/Abstract]` per term inside an OR, not to the outer parens: `mongolism[Title/Abstract] OR "Mongolian idiocy"[Title/Abstract]`. This suppresses ATM and forces literal-text matching, which is what a semantic-shift study actually needs
2	Paginated esearch JSON sometimes contains stray control characters that the strict JSON decoder rejects	Wrapping in `json.loads(text, strict=False)` with retry handles it
3	esearch with `usehistory=y` silently truncates above ~10,000 PMIDs — the history-server pagination returns empty on the second page for some queries, so the loop terminates and the caller gets only the most recent 10K records	Iterate year-by-year: one esearch call per publication year. Per-year volumes peak ~6,000 (PTSD in 2020s), well inside the limit
4	`http.client.IncompleteRead` during efetch when NCBI drops a chunked-encoded stream mid-response — this is an `HTTPException` subclass, NOT an `HTTPError`, so default `urllib.error` retry catches miss it	Broaden the transient-retry set to include `http.client.HTTPException` and `ConnectionError`

Sub-shift	Prediction	Why
§5.7.1 alcohol → AUD	crossover or partial rename ±2 of 2013	clean rename
§5.7.2 opioid → OUD	crossover ±2 of 2013	clean rename
§5.7.3 cannabis → CUD	crossover ±2 of 2013	clean rename
§5.7.4 cocaine → cocaine UD	crossover ±2 of 2013	clean rename
§5.7.5 stimulant UD	crossover ±2 of 2013	rename + recategorise
§5.7.6 tobacco UD	crossover ±2 of 2013 (or partial — TUD adoption known to lag)	clean rename
§5.7.7 AAS asymmetric	NEGATIVE: essentially no rename	DSM-5 didn't carve out
§5.7.8 polysubstance retired	NEGATIVE: ~zero new term records	DSM-5 removed entirely
§5.7.9 gambling disorder	crossover ±2 of 2013	clean rename + chapter move

Token	Intended sense	Common unintended senses
`steroid`	anabolic steroid	corticosteroid, neurosteroid, plant phytosteroid
`doping`	sports doping	semiconductor doping, drug formulation
`AAS`	anabolic-androgenic steroids	American Astronomical Society, atomic absorption spectroscopy
`weed`	cannabis	agricultural / invasive weeds
`horse`	slang for heroin	equine biology / veterinary
`gaming`	video gaming, gambling	game theory, gamification (research methods)

	shift	side	year	n_records
0	1960s_down	old	1950	17
1	1960s_down	old	1951	32
2	1960s_down	old	1952	25
3	1960s_down	old	1953	24
4	1960s_down	old	1954	45

Sense	iter-2 records	Share
Slur (explicit mention)	4 of 31,479	0.013 %
Clinical-ID compound ("mentally retarded")	2,968	9.4 %
Growth / developmental ("growth retardation")	1,417	4.5 %
Biology / oncology process-verb ("retard tumor growth")	7,674	24.4 %
Chemistry / materials process-verb ("retard the corrosion")	1,888 + 720 passive	8.3 %
Other identified scientific process-verb senses	~290	< 1 %
Unknown — random inspection confirms all are also scientific process-verb	16,521	52.5 %

Sense category	Count
Flame retardant / polymer chemistry / materials science	13
Environmental chemistry (PBDEs, plastic additives, BDE-209)	3
Biology process verb (boron deficiency, anti-aging EGCG, apolipoprotein-1 inhibition)	3
Other scientific (DNA-clay flame retardancy, molecular dynamics)	1
Slur (explicit mention)	0
Clinical-ID compound ("mentally retarded")	0

Diagnostic-terminology evolution in PubMed, 1950–2024¶

How to read this notebook¶

0. Setup¶

0a. Reproducibility manifest¶

0b. Pre-registered expectations¶

0c. Methodology footnote: four E-utilities gotchas worth documenting¶

0d. Cross-package validation: agreement with Rayson's LL Wizard¶

1. Corpus¶

1a. Per-shift annual record counts¶

2. Shift 1: mongolism → Down syndrome (1960s anchor)¶

2a. Bootstrap CIs on the §2 keyness¶

2b. Collocation shift: what travelled WITH the Down-syndrome rename?¶

3. Shift 2: shell shock / war neurosis / combat fatigue → PTSD (1980s anchor)¶

3b. Burstiness detection on the PTSD annual record count¶

4. Shift 3: multiple personality disorder → dissociative identity disorder (1990s anchor)¶

5. Shift 4: mental retardation → intellectual disability (2010s anchor)¶

5a. Bootstrap CIs + simultaneous max-T on the §5 keyness¶

5.5. Shift 6: SIRS / Sepsis-2 → Sepsis-3 (2016 anchor)¶

5.5a. Bootstrap CIs + simultaneous max-T on the §5.5 Sepsis-3 keyness¶

5.5b. Cross-corpus validation: Sepsis-3 in ClinicalTrials.gov trial registrations¶

5.6. Shift 7: Asperger's syndrome → autism spectrum disorder (2013 anchor + 2018 ethics)¶

5.6a. Bootstrap CIs + simultaneous max-T on the §5.6 Asperger→ASD keyness¶

5.6b. Placebo-anchor sweep on the §5.6 ethical-acceleration claim¶

5.7. Shift 8: substance-use disorder DSM-5 family rename + discovery-of-abuse-potential archetype¶

§5.7a Clustered bootstrap CIs on the alcohol post-2013 new-share¶

§5.7d Polysemy demonstration — why single-token slang queries fail on PubMed¶

§5.7d-ii Unsupervised cross-check — does the regex partition survive?¶

6. Negative finding: "committed suicide" → "died by suicide"¶

6.5. Loaded clinical vocabulary retirement: Tier-2 + Tier-3 inventory¶

6.5.1. Headline inversion: "retarded" outlives "mental retardation"¶

6.5.1b. Polysemy-audited survey: which Tier-2/3 labels actually measure deprecated clinical use?¶

6.5.1c. Multi-label slur WSI deep audit (iter-4)¶

6.5.1d. Iter-5b broadened-corpus spot check (Tier-B audit follow-on)¶

6.5.2. Clean extinctions¶

6.5.3. Indexing-curation residual (post-iter-4 curation)¶

6.5.4. Persistent terms — not every old term retires¶

7. Cross-corpus validation: PubMed vs Google Books Ngrams¶

7.1 The "died by suicide" cross-corpus contrast¶

8. Audit layer¶

8.1 Step-A vs Step-B record-count consistency¶

8.2 Placebo dates for the §5 ID shift¶

8.3 Shuffled-label null on §5 keyness¶

8.4 BH-significance ⊆ CI-excludes-zero alignment (on §5 keyness)¶

8.5 min_count sensitivity for §5 keyness¶

8.6 Spearman monotonic-trend test on the §5 trajectory¶

8.7. Limits of this notebook — what we cannot claim, by design¶

Limit 1: WSI regex-bucket conservatism¶

Limit 2: 2024 partial-year chart truncation¶

Limit 3: Sample-vs-corpus distinctions¶

Limit 4: §5.5 + §5.6 lighter audit treatment¶

Limit 5: Cross-corpus reach (partly closed by §5.5b)¶

Limit 6: Polysemy survey is bounded by what we could query¶

Limit 7: We measure published-literature usage, not clinical practice¶

Limit 8: No replication on a second medical corpus¶

9. Audit scoreboard¶