import warnings
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd

import pycorpdiff as pcd

# Pre-render charts to inline SVG via vl-convert so plots show up
# everywhere (GitHub, JupyterLab, VS Code, nbviewer) without a CDN.
# Custom renderer: force an ASCII hyphen-minus (-) for every negative
# number via the d3 format locale, so axis labels, legends AND tooltips
# render cleanly in every viewer. Vega's default typographic minus
# U+2212 is correct UTF-8 but is mis-decoded by some notebook viewers /
# PDF pipelines (where it shows as raw bytes).
import json as _json

import vl_convert as _vlc

_ASCII_LOCALE = {
    'decimal': '.', 'thousands': ',', 'grouping': [3],
    'currency': ['', ''], 'minus': '-',
}


def _svg_ascii_renderer(spec, **_kw):
    return {'image/svg+xml': _vlc.vegalite_to_svg(
        _json.dumps(spec), format_locale=_ASCII_LOCALE)}


alt.renderers.register('svg_ascii', _svg_ascii_renderer)
alt.renderers.enable('svg_ascii')
alt.data_transformers.disable_max_rows()
warnings.filterwarnings('ignore')

print(f'pycorpdiff {pcd.__version__}')

pycorpdiff 0.1.0a25

import platform
import sys
from datetime import datetime, timezone

import altair, networkx, numpy, pandas, ruptures, scipy, sklearn, statsmodels

MANIFEST = {
    'notebook_built': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
    'python': sys.version.split()[0],
    'platform': platform.platform(),
    # Core analytical stack.
    'pycorpdiff':   pcd.__version__,
    'numpy':        numpy.__version__,
    'pandas':       pandas.__version__,
    'scipy':        scipy.__version__,
    'altair':       altair.__version__,
    'statsmodels':  statsmodels.__version__,
    'ruptures':     ruptures.__version__,
    'sklearn':      sklearn.__version__,
    'networkx':     networkx.__version__,
    # Seeds applied throughout this notebook -- one per stochastic
    # method. Rerunning with identical seeds reproduces exact numbers.
    'seed_bootstrap':       0,
    'seed_match':           0,
    'seed_causal_impact':   0,
    'seed_hash_embedder':   0,
    'seed_multi_term':      0,
    # Data snapshot.
    'hansard_fetch_date':   '2024-05-26 (2014-2023) + 2026-05-27 (2010-2013)',
    'corpus_focal_term':    'asylum',
    'corpus_date_range':    '2010-01-01 to 2023-12-31',
    'cache_file':           '_cache/hansard_asylum_2010_2023.parquet',
}
for k, v in MANIFEST.items():
    print(f'  {k:24s} {v}')

  notebook_built           2026-05-29
  python                   3.12.13
  platform                 macOS-26.5-arm64-arm-64bit
  pycorpdiff               0.1.0a25
  numpy                    2.4.6
  pandas                   2.3.3
  scipy                    1.17.1
  altair                   6.1.0
  statsmodels              0.14.6
  ruptures                 v1.1.10
  sklearn                  1.8.0
  networkx                 3.6.1
  seed_bootstrap           0
  seed_match               0
  seed_causal_impact       0
  seed_hash_embedder       0
  seed_multi_term          0
  hansard_fetch_date       2024-05-26 (2014-2023) + 2026-05-27 (2010-2013)
  corpus_focal_term        asylum
  corpus_date_range        2010-01-01 to 2023-12-31
  cache_file               _cache/hansard_asylum_2010_2023.parquet

prereg = pd.DataFrame([
    ('5.0 Corpus conditioning caveat',
     'Notebook scope is asylum-conditioned discourse, not broader UK migration',
     'methodological framing, not a quantitative test'),
    ('5.1a Duplicate speech detection',
     'No mass duplication in the cached corpus',
     '< 1% of contributions in exact-duplicate groups'),
    ('5.2 Reference-baseline keyness',
     'Hansard-distinctive vocabulary should reflect policy/procedure',
     'asylum, refugees, Home Office, Government, Secretary'),
    ('5.3 Con-vs-Lab keyness with CI',
     'Direction-confident divergence on enforcement vs rights vocabulary',
     'Con: borders, illegal, criminal; Lab: rights, protection, refugees'),
    ('5.3b Shuffled-label null (sum-top-10)',
     'Sum of top-10 |G^2| exceeds 95th pct of label-permutation null',
     'p_perm < 0.05'),
    ('5.3c Bootstrap CI / BH alignment',
     'BH-sig with CI excluding zero >= 70%; BH-non-sig with CI including zero >= 95%',
     'either alignment falls below its threshold'),
    ('5.3d Formal CI coverage MC',
     'Simultaneous-CI top-G^2 coverage within +/-10% of nominal 95%',
     'simultaneous coverage outside [85%, 100%]'),
    ('5.3e min_count sweep',
     'Top-15 keyness terms stable across min_count {10, 20, 50, 100}',
     'Jaccard overlap < 0.6 vs canonical min_count=20'),
    ('5.3f Top-K-speaker leverage',
     'Top-15 keyness stable after excluding top-10 prolific speakers',
     'Jaccard overlap < 0.5 vs canonical full-corpus result'),
    ('5.4 CEM-matched keyness',
     'L1 imbalance on year drops sharply post-match; signal narrows',
     'L1 imbalance reduction > 0.05 absolute'),
    ('5.5 Collocation shift on asylum',
     'Distinct collocate clusters by party (enforcement vs welfare)',
     'KWIC evidence retrievable via .explain()'),
    ('5.5b Chamber-stratified collocation',
     'Commons-only and Lords-only top-12 collocates Jaccard >= 0.4',
     'Jaccard < 0.4 or directional reversal between chambers'),
    ('5.6 Semantic trajectory of asylum',
     'Distance from 2014 baseline increases over time, peaks 2022-2023',
     'cosine distance monotonic-ish increase post-2020'),
    ('5.6c Bootstrap stability of trajectory',
     'CI envelope widths small relative to year-on-year change',
     'max CI width > max year-on-year median change'),
    ('5.6d Leave-year-out SBERT trajectory',
     'Endpoint distance robust to dropping any single year (structural)',
     'non-zero leverage on a middle-year drop'),
    ('5.7 Burstiness of criminal',
     'Bursts coincide with referendum 2016, Channel crossings 2018+, Rwanda 2022',
     'state >= 1 in 2016Q2-Q3 OR 2021-Q3 OR 2023-Q4'),
    ('5.7c Permuted-time null (revised direction)',
     'Observed n_bursts in lower tail of permutation null',
     'p_lower > 0.05 on n_bursts'),
    ('5.8 Changepoint + causal-impact on illegal (26-quarter pre-window)',
     '2016-06-23 referendum effect on the rate of "illegal"',
     'PELT changepoint in 2016Q2-Q3 OR causal_impact CrI excludes zero'),
    ('5.8c Placebo-date sweep',
     'Median placebo P(no effect) > 2x real-referendum P(no effect)',
     'placebo / real ratio <= 2 (model fires on arbitrary dates)'),
    ('5.8d Donor-series check (2 controls)',
     'Both committee and fisheries P(no effect) >> illegal P(no effect)',
     'either control returns P(no effect) within 0.10 of illegal'),
    ('5.8e Leave-one-year-out for causal_impact',
     'Max single-year leverage on P(no effect) < 0.30',
     'dropping any year shifts P(no effect) by >= 0.30'),
    ('5.9 Lexical diversity over time',
     'Diversity (MTLD/HD-D) trend within +/-10% across the window',
     'no abrupt year-on-year shifts > 2 sigma'),
    ('5.10 Forecast of asylum rate',
     'ETS prediction-interval widens monotonically with horizon',
     'PI band width(t+4) > PI band width(t+1)'),
    ('5.10b Rolling-origin backtest',
     'ETS beats naive seasonal-mean on RMSE; 95% PI covers actuals',
     'ETS RMSE >= naive OR 95% PI coverage < 50%'),
    ('5.11 N-way keyness by party',
     'SNP signature: scotland, glasgow, scottish',
     'top-5 SNP terms include at least 2 Scotland-specific tokens'),
    ('5.12 Co-occurrence network (top-60 PMI edges)',
     'Distinct enforcement / process / humanitarian sub-clusters',
     '>= 3 visible community structures from greedy modularity'),
    ('5.12b Random-graph modularity null',
     'Observed network modularity > Erdos-Renyi 95th pct',
     'observed modularity <= ER 95th pct'),
    ('5.12c Edge-threshold sensitivity',
     'Modularity strictly positive across sweep; node sets stable (Jaccard >= 0.7)',
     'modularity drops to 0 OR Jaccard < 0.7 between adjacent thresholds'),
], columns=['Section', 'Predicted outcome', 'Falsifier'])
prereg

# Sweep six canonical contingency tables. Each (O1, N1, O2, N2,
# expected_unsigned_LL) triple is hand-derived from
#     E_i = N_i * (O1+O2) / (N1+N2)
#     LL  = 2 * sum_i  O_i * ln(O_i / E_i)
# and serves as a trip-wire: a regression in the G^2 implementation
# would fail several rows of this table simultaneously. Drawn from
# the fixture in tests/integration/test_crossval_rayson.py.
from pycorpdiff.keyness import log_likelihood

REFERENCES = [
    # (label,                       O1,      N1,       O2,    N2,        expected_unsigned_LL)
    ('classic_12k_vs_10k',          12000,   1_000_000, 10000, 1_000_000, 182.0694),
    ('equal_rate_no_signal',        10,      1000,      20,    2000,      0.0),
    ('ten_x_overrep_in_a',          100,     100_000,   20,    200_000,   127.8065),
    ('five_x_overrep_in_a',         500,     1_000_000, 100,   1_000_000, 291.1032),
    ('same_count_half_rate',        50,      100_000,   50,    50_000,    11.7783),
    ('lopsided_overrep_in_a',       1000,    1_000_000, 1,     1_000_000, 1371.867),
]
rows = []
for label, O1, N1, O2, N2, expected_ll in REFERENCES:
    res = log_likelihood(
        pd.Series([O1], index=['t']), pd.Series([O2], index=['t']),
        total_a=N1, total_b=N2, formula='rayson',
    )
    observed_unsigned = abs(float(res['g2'].iloc[0]))
    abs_err = abs(observed_unsigned - expected_ll)
    rows.append({
        'case': label,
        'expected_LL': expected_ll,
        'pycorpdiff_LL': round(observed_unsigned, 4),
        'abs_error': abs_err,
    })

xv = pd.DataFrame(rows)
print(xv.to_string(index=False))
worst = xv['abs_error'].max()
print(f'\nworst absolute error across {len(xv)} cases: {worst:.2e}')
assert worst < 1e-2, f'Rayson reference disagreement at {worst:.2e}; block release'
print('OK -- agreement with canonical contingency-table references at < 0.01.')

                 case  expected_LL  pycorpdiff_LL  abs_error
   classic_12k_vs_10k     182.0694       182.0695   0.000052
 equal_rate_no_signal       0.0000         0.0000   0.000000
   ten_x_overrep_in_a     127.8065       127.8064   0.000128
  five_x_overrep_in_a     291.1032       291.1032   0.000034
 same_count_half_rate      11.7783        11.7783   0.000004
lopsided_overrep_in_a    1371.8670      1371.8641   0.002855

worst absolute error across 6 cases: 2.85e-03
OK -- agreement with canonical contingency-table references at < 0.01.

cache_path = Path('_cache/hansard_asylum_2010_2023.parquet')
df = pd.read_parquet(cache_path)
df['year'] = df['date'].dt.year  # int year for downstream CEM matching
print(f'{len(df):,} contributions, {df["date"].min().date()} to {df["date"].max().date()}')
print(f'Top parties: {df["party_abbrev"].value_counts().head(6).to_dict()}')
print(f'House split: {df["house"].value_counts().to_dict()}')

corpus = pcd.from_dataframe(
    df,
    text_col='text',
    meta_cols=('member', 'party', 'party_abbrev', 'date', 'year', 'house',
               'debate_title', 'hansard_id'),
)
print(f'\nCorpus: {len(corpus):,} docs, {corpus.total_tokens():,} tokens')

10,124 contributions, 2010-01-05 to 2023-12-19
Top parties: {'Con': 3668, 'Lab': 2506, 'LD': 871, 'SNP': 836, '': 633, 'XB': 586}
House split: {'Commons': 6154, 'Lords': 3970}

Corpus: 10,124 docs, 4,949,610 tokens

df_profile = df.assign(year=df['date'].dt.year)
year_party = (
    df_profile.groupby(['year', 'party_abbrev']).size().unstack(fill_value=0)
)
top_parties = df['party_abbrev'].value_counts().head(6).index.tolist()
year_party = year_party[top_parties]
year_party

# Distinguishable, convention-aligned party colours (UK party palette)
# instead of altair's default 6-category scheme, where the pale/grey
# categories blur together.
_PARTY_COLOURS = {
    'Con': '#0087DC',  # Conservative blue
    'Lab': '#E4003B',  # Labour red
    'LD':  '#FAA61A',  # Liberal Democrat orange
    'SNP': '#FCDB05',  # SNP yellow
    'XB':  '#5A5A5A',  # crossbench (dark grey)
    '':    '#B0B0B0',  # unattributed (light grey)
}
alt.Chart(
    year_party.reset_index().melt('year', var_name='party', value_name='count')
).mark_bar().encode(
    x='year:O',
    y='count:Q',
    color=alt.Color(
        'party:N',
        sort=top_parties,
        scale=alt.Scale(
            domain=top_parties,
            range=[_PARTY_COLOURS.get(p, '#333333') for p in top_parties],
        ),
        legend=alt.Legend(title='party'),
    ),
    tooltip=['year', 'party', 'count'],
).properties(width=1100, height=440,
             title='Asylum-mentioning contributions by year and party')

import hashlib

df['_hash'] = df['text'].map(lambda s: hashlib.md5(s.encode('utf-8')).hexdigest())
group_sizes = df.groupby('_hash').size()
n_dup_groups = int((group_sizes > 1).sum())
n_affected = int(group_sizes[group_sizes > 1].sum())
pct_affected = 100 * n_affected / len(df)

print(f'Exact-duplicate text groups: {n_dup_groups:,}')
print(f'Contributions in any duplicate group: {n_affected:,} '
      f'of {len(df):,} ({pct_affected:.2f}%)')

print('\nLargest duplicate groups:')
top = group_sizes.sort_values(ascending=False).head(5)
for h, n in top.items():
    if n > 1:
        sample = df.loc[df['_hash']==h, 'text'].iloc[0][:100]
        print(f'  {n:3d}x identical: {sample!r}...')

df = df.drop(columns=['_hash'])

Exact-duplicate text groups: 10
Contributions in any duplicate group: 23 of 10,124 (0.23%)

Largest duplicate groups:
    5x identical: 'What plans her Department has to reform the UK’s asylum system.'...
    2x identical: 'What assessment she has made of the adequacy of support and accommodation for asylum seekers during '...
    2x identical: 'I am sorry to press the Minister on this point, and she may want to write to me as a follow-up, but '...
    2x identical: 'I cannot possibly be drawn on individual applications for asylum. It would be wholly improper for me'...
    2x identical: '4. What recent progress her Department has made on reducing the backlog of asylum applications.'...

drift = df.copy()
drift['year'] = drift['date'].dt.year
drift['n_tokens'] = drift['text'].str.split().str.len()
by_year = drift.groupby('year').agg(
    n_contributions=('text', 'size'),
    mean_tokens=('n_tokens', 'mean'),
    median_tokens=('n_tokens', 'median'),
    pct_commons=('house', lambda s: 100 * (s == 'Commons').mean()),
    pct_party_missing=('party_abbrev', lambda s: 100 * (s == '').mean()),
)
print(by_year.round(1).to_string())

# Flag drift: coefficient of variation on mean contribution length,
# and the range of Commons share across years.
import numpy as np
cv_len = by_year['mean_tokens'].std() / by_year['mean_tokens'].mean()
commons_range = by_year['pct_commons'].max() - by_year['pct_commons'].min()
print(f'\nMean-contribution-length CV across years: {cv_len:.2f}')
print(f'Commons-share range across years: {commons_range:.1f} pp')
print(f'Max party-missing share in any year: {by_year["pct_party_missing"].max():.1f}%')

      n_contributions  mean_tokens  median_tokens  pct_commons  pct_party_missing
year                                                                             
2010              290        602.1          295.5         57.6                3.4
2011              279        551.1          287.0         51.3                6.1
2012              258        590.4          386.0         34.9               21.7
2013              297        648.1          438.0         54.9                7.4
2014              520        530.6          230.5         40.2               15.4
2015              780        480.8          283.0         70.0               11.0
2016             1061        457.9          236.0         51.6               16.0
2017              365        567.0          338.0         71.5                2.5
2018              432        541.0          303.5         67.6                1.4
2019              455        460.4          183.0         69.2                1.3
2020              742        479.9          186.5         62.0                0.8
2021             1218        453.4          286.5         82.9                1.9
2022             1604        447.2          251.5         52.6                6.0
2023             1823        434.8          187.0         60.7                2.5

Mean-contribution-length CV across years: 0.13
Commons-share range across years: 48.0 pp
Max party-missing share in any year: 21.7%

baseline_result = pcd.against_baseline(corpus, 'gutenberg_fiction', min_count=10)
print(baseline_result.summary())
top_pos = (
    baseline_result.table
    .query('g2 > 0')
    .head(15)[['term', 'count_a', 'count_b', 'g2', 'log_ratio']]
)
top_pos

KeynessResult(log_likelihood, |a|=4,949,610, |b|=502,234, terms=13,016)

commons = corpus.slice(house='Commons')
con = commons.slice(party_abbrev='Con')
lab = commons.slice(party_abbrev='Lab')
print(f'Commons-only -- |Con|={len(con)} docs, {con.total_tokens():,} tokens')
print(f'Commons-only -- |Lab|={len(lab)} docs, {lab.total_tokens():,} tokens')

# Use simultaneous_ci=True so the bootstrap CIs are valid for the
# top-ranked terms the reader actually sees. See §5.3d for the
# Monte-Carlo coverage check motivating this default. Per-term
# percentile CIs (simultaneous_ci=False) under-cover the top of a
# sorted keyness table at ~63% under known nulls (Type-I inflation).
keyness_ci = pcd.compare(con, lab).keyness(
    min_count=20, formula='dunning',
    ci='bootstrap', n_boot=499, bootstrap_seed=0,
    simultaneous_ci=True,
    multiple_comparisons='bh',
)
print(f'\n{len(keyness_ci.table):,} terms above min_count')

Commons-only -- |Con|=2421 docs, 1,015,079 tokens
Commons-only -- |Lab|=1755 docs, 936,804 tokens

5,114 terms above min_count

top = keyness_ci.table.head(15)[
    ['term', 'count_a', 'count_b', 'g2', 'g2_ci_lower', 'g2_ci_upper', 'p_adjusted']
]
top

keyness_ci.plot(kind='bar', n=15).properties(
    width=1100,
    title='Commons-only: Conservative vs Labour distinctive vocabulary',
)

top_terms = keyness_ci.table['term'].head(12).tolist()

iid = pcd.compare(con, lab).keyness(
    min_count=20, formula='dunning',
    ci='bootstrap', n_boot=299, bootstrap_seed=0,
)
clustered = pcd.compare(con, lab).keyness(
    min_count=20, formula='dunning',
    ci='bootstrap', n_boot=299, bootstrap_seed=0,
    cluster_col='member',
)
iid_t = iid.table.set_index('term')
clu_t = clustered.table.set_index('term')
import pandas as pd
rows = []
for term in top_terms:
    iid_w = iid_t.loc[term, 'g2_ci_upper'] - iid_t.loc[term, 'g2_ci_lower']
    clu_w = clu_t.loc[term, 'g2_ci_upper'] - clu_t.loc[term, 'g2_ci_lower']
    rows.append({'term': term,
                 'iid_width': round(iid_w, 1),
                 'clustered_width': round(clu_w, 1),
                 'widening': round(clu_w / max(1e-9, iid_w), 2)})
widths = pd.DataFrame(rows)
print(widths.to_string(index=False))
print(f'\nMean clustered/IID width ratio: {widths["widening"].mean():.2f}x')
print('Speaker-clustered CIs are wider — the honest interval for hierarchical data.')

      term  iid_width  clustered_width  widening
      data     1882.0           1958.6      1.04
      home      358.9            868.9      2.42
  minister      460.5            638.3      1.39
        we      598.7           1029.8      1.72
government      354.2            460.4      1.30
    office      371.4            477.3      1.29
protection     1018.9           1129.1      1.11
       hon      319.2            611.5      1.92
       she      360.5            667.6      1.85
      gdpr      875.3            995.6      1.14
  european      367.3            516.3      1.41
   illegal      211.8            367.9      1.74

Mean clustered/IID width ratio: 1.53x
Speaker-clustered CIs are wider — the honest interval for hierarchical data.

import numpy as np

# Stronger statistic than max |G^2| (which is noisy on a 5000+ term vocabulary):
# the sum of the top-10 unsigned G^2 values captures collective signal strength
# and is less sensitive to which particular term happens to top a random split.
rng_perm = np.random.default_rng(0)
con_lab_df = df[(df['house']=='Commons') & (df['party_abbrev'].isin(['Con','Lab']))].copy()
n_con = int((con_lab_df['party_abbrev']=='Con').sum())
n_lab = int(len(con_lab_df) - n_con)

def top10_sum(table):
    return float(table['g2'].abs().nlargest(10).sum())

observed = top10_sum(keyness_ci.table)
print(f'Shuffling {n_con} Con + {n_lab} Lab documents; 50 permutations.')
print(f'Observed sum of top-10 |G^2|: {observed:.1f}')

null_top10 = []
for trial in range(50):
    perm = rng_perm.permutation(len(con_lab_df))
    shuf = con_lab_df.iloc[perm].copy().reset_index(drop=True)
    shuf['party_abbrev'] = ['Con'] * n_con + ['Lab'] * n_lab
    sc = pcd.from_dataframe(
        shuf, text_col='text',
        meta_cols=('party_abbrev','date','year','house'),
    )
    r = pcd.compare(
        sc.slice(party_abbrev='Con'), sc.slice(party_abbrev='Lab'),
    ).keyness(min_count=20, formula='dunning')
    null_top10.append(top10_sum(r.table))

null_top10 = np.array(null_top10)
p_perm = float((null_top10 >= observed).mean())
print(f'\nNull distribution of sum-top-10 |G^2|:')
print(f'  median {np.median(null_top10):.1f}, 95th pct {np.quantile(null_top10, 0.95):.1f}, max {null_top10.max():.1f}')
print(f'Empirical p-value (one-sided, upper tail): {p_perm:.3f}')

Shuffling 2421 Con + 1755 Lab documents; 50 permutations.
Observed sum of top-10 |G^2|: 3911.5

Null distribution of sum-top-10 |G^2|:
  median 1722.2, 95th pct 2499.1, max 2678.7
Empirical p-value (one-sided, upper tail): 0.000

tab = keyness_ci.table
sig = tab[tab['p_adjusted'] < 0.05]
nonsig = tab[tab['p_adjusted'] >= 0.05]

def ci_excludes_zero(row) -> bool:
    return (row['g2_ci_lower'] > 0 and row['g2_ci_upper'] > 0) or \
           (row['g2_ci_lower'] < 0 and row['g2_ci_upper'] < 0)

sig_excl = int(sig.apply(ci_excludes_zero, axis=1).sum())
nonsig_excl = int(nonsig.apply(ci_excludes_zero, axis=1).sum())

print(f'BH-significant terms (p_adjusted < 0.05): {len(sig):,}')
print(f'  bootstrap CI excludes zero: {sig_excl:,} ({100*sig_excl/max(1,len(sig)):.1f}%)')
print(f'  bootstrap CI includes zero: {len(sig)-sig_excl:,}')
print()
print(f'BH-non-significant terms (p_adjusted >= 0.05): {len(nonsig):,}')
print(f'  bootstrap CI includes zero: {len(nonsig)-nonsig_excl:,} '
      f'({100*(len(nonsig)-nonsig_excl)/max(1,len(nonsig)):.1f}%)')
print(f'  bootstrap CI excludes zero: {nonsig_excl:,}')

BH-significant terms (p_adjusted < 0.05): 1,993
  bootstrap CI excludes zero: 0 (0.0%)
  bootstrap CI includes zero: 1,993

BH-non-significant terms (p_adjusted >= 0.05): 3,121
  bootstrap CI includes zero: 3,121 (100.0%)
  bootstrap CI excludes zero: 0

# Coverage MC: under a known null, refit both CI variants on each
# replicate and count zero-coverage on the top-ranked term.
import numpy as np

rng_cov = np.random.default_rng(1)
cl_df = df[(df['house']=='Commons') & (df['party_abbrev'].isin(['Con','Lab']))].copy()
n_con = int((cl_df['party_abbrev']=='Con').sum())
n_lab = int(len(cl_df) - n_con)

n_mc = 30
cover_per_term = 0
cover_simul = 0
for trial in range(n_mc):
    perm = rng_cov.permutation(len(cl_df))
    s = cl_df.iloc[perm].copy().reset_index(drop=True)
    s['party_abbrev'] = ['Con'] * n_con + ['Lab'] * n_lab
    sc = pcd.from_dataframe(
        s, text_col='text',
        meta_cols=('party_abbrev','date','year','house'),
    )
    # Per-term percentile CI
    r_per = pcd.compare(
        sc.slice(party_abbrev='Con'), sc.slice(party_abbrev='Lab'),
    ).keyness(
        min_count=20, formula='dunning',
        ci='bootstrap', n_boot=99, bootstrap_seed=trial,
        simultaneous_ci=False,
    )
    if r_per.table.iloc[0]['g2_ci_lower'] <= 0 <= r_per.table.iloc[0]['g2_ci_upper']:
        cover_per_term += 1
    # Simultaneous Westfall-Young CI
    r_sim = pcd.compare(
        sc.slice(party_abbrev='Con'), sc.slice(party_abbrev='Lab'),
    ).keyness(
        min_count=20, formula='dunning',
        ci='bootstrap', n_boot=99, bootstrap_seed=trial,
        simultaneous_ci=True,
    )
    if r_sim.table.iloc[0]['g2_ci_lower'] <= 0 <= r_sim.table.iloc[0]['g2_ci_upper']:
        cover_simul += 1

cov_per = cover_per_term / n_mc
cov_sim = cover_simul / n_mc
print(f'Monte Carlo coverage on top-ranked term under known null')
print(f'  ({n_mc} replicates, n_boot=99 each, nominal 95%):')
print(f'  per-term percentile:        {cover_per_term}/{n_mc} = {100*cov_per:.0f}%')
print(f'  simultaneous (Westfall-Young): {cover_simul}/{n_mc} = {100*cov_sim:.0f}%')

Monte Carlo coverage on top-ranked term under known null
  (30 replicates, n_boot=99 each, nominal 95%):
  per-term percentile:        19/30 = 63%
  simultaneous (Westfall-Young): 30/30 = 100%

import numpy as np

mincount_grid = [10, 20, 50, 100]
top_lists = {}
for mc_val in mincount_grid:
    r = pcd.compare(con, lab).keyness(min_count=mc_val, formula='dunning')
    top_lists[mc_val] = set(r.table['term'].head(15).tolist())
    print(f'min_count={mc_val}: top-15 = {sorted(top_lists[mc_val])[:8]} ...')

# Pairwise Jaccard against the canonical min_count=20
base = top_lists[20]
print('\nJaccard overlap against the canonical min_count=20 top-15:')
for mc_val in mincount_grid:
    j = len(top_lists[mc_val] & base) / max(1, len(top_lists[mc_val] | base))
    print(f'  min_count={mc_val}: Jaccard = {j:.2f}')

min_count=10: top-15 = ['amendment', 'data', 'european', 'gdpr', 'government', 'home', 'hon', 'illegal'] ...

min_count=20: top-15 = ['amendment', 'data', 'european', 'gdpr', 'government', 'home', 'hon', 'illegal'] ...

min_count=50: top-15 = ['amendment', 'data', 'european', 'gdpr', 'government', 'home', 'hon', 'illegal'] ...

min_count=100: top-15 = ['amendment', 'data', 'european', 'gdpr', 'government', 'home', 'hon', 'illegal'] ...

Jaccard overlap against the canonical min_count=20 top-15:
  min_count=10: Jaccard = 1.00
  min_count=20: Jaccard = 1.00
  min_count=50: Jaccard = 1.00
  min_count=100: Jaccard = 1.00

commons_cl = df[(df['house']=='Commons') & (df['party_abbrev'].isin(['Con','Lab']))].copy()
top_speakers = commons_cl['member'].value_counts().head(10)
print(f'Top-10 most prolific Commons Con+Lab speakers:')
for name, n in top_speakers.items():
    print(f'  {n:4d} contributions: {name}')

excluded_df = commons_cl[~commons_cl['member'].isin(top_speakers.index)].copy()
print(f'\nExcluding {top_speakers.sum()} of {len(commons_cl)} Commons Con+Lab contributions '
      f'({100*top_speakers.sum()/len(commons_cl):.1f}%); '
      f'{len(excluded_df)} remain.')

exc_corpus = pcd.from_dataframe(
    excluded_df, text_col='text',
    meta_cols=('member','party_abbrev','date','year','house'),
)
con_exc = exc_corpus.slice(party_abbrev='Con')
lab_exc = exc_corpus.slice(party_abbrev='Lab')

keyness_exc = pcd.compare(con_exc, lab_exc).keyness(
    min_count=20, formula='dunning',
    multiple_comparisons='bh',
)
top15_canonical = set(keyness_ci.table['term'].head(15).tolist())
top15_excluded  = set(keyness_exc.table['term'].head(15).tolist())
jaccard = len(top15_canonical & top15_excluded) / max(1, len(top15_canonical | top15_excluded))
print(f'\nTop-15 Jaccard overlap (canonical vs top-10-speakers excluded): {jaccard:.2f}')
shared = sorted(top15_canonical & top15_excluded)
print(f'Shared top-15 terms ({len(shared)}): {shared}')

Top-10 most prolific Commons Con+Lab speakers:
   170 contributions: James Brokenshire
   131 contributions: Tom Pursglove
   125 contributions: Chris Philp
   116 contributions: Priti Patel
    99 contributions: Yvette Cooper
    89 contributions: Bambos Charalambous
    82 contributions: Caroline Nokes
    78 contributions: Baroness May of Maidenhead
    69 contributions: Stephen Kinnock
    68 contributions: Lord Cameron of Chipping Norton

Excluding 1027 of 4176 Commons Con+Lab contributions (24.6%); 3149 remain.

Top-15 Jaccard overlap (canonical vs top-10-speakers excluded): 0.36
Shared top-15 terms (8): ['amendment', 'data', 'gdpr', 'government', 'home', 'minister', 'office', 'protection']

m = pcd.match(con, lab, on=['year'], seed=0)
print(m.summary())
m.imbalance

CEM match on ['year']: 5/5 strata matched, |a|: 2421 → 1712 (71%), |b|: 1755 → 1712 (98%). Mean L1 imbalance: 0.100 → 0.000.

keyness_matched = pcd.compare(m.a_matched, m.b_matched).keyness(
    min_count=10, formula='dunning', multiple_comparisons='bh',
)
top_matched = keyness_matched.table.head(12)[
    ['term', 'count_a', 'count_b', 'g2', 'log_ratio']
]
top_matched

shift = pcd.compare(con, lab).collocation_shift(
    'asylum', window=5, min_count=10, measure='logDice',
)
print(shift.summary())
shift.table.head(12)

CollocationShiftResult(target='asylum', measure=logDice, window=5, collocates=939)

shift.plot(n=12).properties(
    width=1100, title="'asylum' collocation shift: Commons Con vs Lab"
)

top_con_collocate = shift.table.iloc[0]['collocate']
print(f'KWIC evidence for collocate {top_con_collocate!r}:')
shift.explain(top_con_collocate, n=3).table[['corpus', 'left', 'keyword', 'right']]

KWIC evidence for collocate 'missing':

commons_con = commons.slice(party_abbrev='Con')
commons_lab = commons.slice(party_abbrev='Lab')

lords = corpus.slice(house='Lords')
lords_con = lords.slice(party_abbrev='Con')
lords_lab = lords.slice(party_abbrev='Lab')
print(f'Commons Con={len(commons_con)} docs, Lab={len(commons_lab)} docs')
print(f'Lords   Con={len(lords_con)} docs, Lab={len(lords_lab)} docs')

shift_commons = pcd.compare(commons_con, commons_lab).collocation_shift(
    'asylum', window=5, min_count=10, measure='logDice',
)
shift_lords = pcd.compare(lords_con, lords_lab).collocation_shift(
    'asylum', window=5, min_count=10, measure='logDice',
)
top_c = set(shift_commons.table['collocate'].head(12).tolist())
top_l = set(shift_lords.table['collocate'].head(12).tolist())
shared = sorted(top_c & top_l)
print(f'\nCommons top-12 shift collocates: {sorted(top_c)}')
print(f'Lords   top-12 shift collocates: {sorted(top_l)}')
print(f'\nOverlap: {len(shared)}/{12} terms ({sorted(shared)})')
print(f'Jaccard: {len(top_c & top_l) / max(1, len(top_c | top_l)):.2f}')

Commons Con=2421 docs, Lab=1755 docs
Lords   Con=1247 docs, Lab=751 docs

Commons top-12 shift collocates: ['14', 'chaos', 'experience', 'failure', 'fifth', 'fled', 'lgbtq', 'missing', 'pressures', 'queue', 'tamil', 'tories']
Lords   top-12 shift collocates: ['additional', 'framework', 'highest', 'inquiry', 'mainstream', 'organisations', 'parliamentary', 'pressure', 'procedures', 'recognised', 'refuge', 'young']

Overlap: 0/12 terms ([])
Jaccard: 0.00

# Run 1: SBERT paper-grade trajectory.
sbert = pcd.SBERTEmbedder()
sem_traj_sbert = pcd.semantic_trajectory(
    corpus, target='asylum', time_col='date', freq='Y',
    embedder=sbert, window=5, baseline_period='2014',
)
sem_traj_sbert

Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

sbert_plot = sem_traj_sbert.copy()
sbert_plot['period_str'] = sbert_plot['period'].astype(str)
sbert_plot = sbert_plot.drop(columns=['period'])
alt.Chart(sbert_plot).mark_line(point=True, strokeWidth=2).encode(
    x=alt.X('period_str:N', title='year', sort=None),
    y=alt.Y('distance_from_baseline:Q', title='cosine distance from 2014 (SBERT)'),
    tooltip=['period_str', 'distance_from_baseline',
             'similarity_to_baseline', 'n_contexts'],
).properties(width=1100, height=440,
             title="Semantic trajectory of 'asylum' -- SBERT contextual embeddings")

from scipy.stats import kendalltau
import numpy as np

traj = sem_traj_sbert.copy()
traj['year'] = traj['period'].astype(str).astype(int)
traj = traj[traj['year'] >= 2014]  # baseline onward
tau, p_tau = kendalltau(traj['year'], traj['distance_from_baseline'])
post2020 = traj[traj['year'] >= 2020]
slope = np.polyfit(post2020['year'], post2020['distance_from_baseline'], 1)[0]
print(f'Kendall tau (year vs distance, 2014-2023): {tau:.3f} (p={p_tau:.3f})')
print(f'Post-2020 OLS slope: {slope:+.6f} per year')
print(f'Monotonic-increase falsifier: tau > 0 AND post-2020 slope > 0 '
      f'-> {"HOLDS" if (tau > 0 and slope > 0) else "FAILS"}')

Kendall tau (year vs distance, 2014-2023): 0.556 (p=0.029)
Post-2020 OLS slope: +0.000821 per year
Monotonic-increase falsifier: tau > 0 AND post-2020 slope > 0 -> HOLDS

# Run 2: HashEmbedder reproducibility check.
hash_emb = pcd.HashEmbedder(dim=128)
sem_traj_hash = pcd.semantic_trajectory(
    corpus, target='asylum', time_col='date', freq='Y',
    embedder=hash_emb, window=5, baseline_period='2014',
)
hash_plot = sem_traj_hash.copy()
hash_plot['period_str'] = hash_plot['period'].astype(str)
hash_plot = hash_plot.drop(columns=['period'])
alt.Chart(hash_plot).mark_line(point=True, strokeWidth=2).encode(
    x=alt.X('period_str:N', title='year', sort=None),
    y=alt.Y('distance_from_baseline:Q', title='cosine distance from 2014 (Hash)'),
    tooltip=['period_str', 'distance_from_baseline',
             'similarity_to_baseline', 'n_contexts'],
).properties(width=1100, height=440,
             title="Semantic trajectory of 'asylum' -- HashEmbedder (reproducibility check)")

import numpy as np

rng_traj = np.random.default_rng(0)
n_resamples = 20
years = sorted(df['year'].unique().tolist())

distances = {y: [] for y in years}
for r in range(n_resamples):
    rows_per_year = []
    for y in years:
        year_df = df[df['year'] == y]
        if len(year_df) == 0: continue
        idx = rng_traj.integers(0, len(year_df), size=len(year_df))
        rows_per_year.append(year_df.iloc[idx])
    sampled_df = pd.concat(rows_per_year, ignore_index=True)
    sampled_corpus = pcd.from_dataframe(
        sampled_df, text_col='text',
        meta_cols=('member','party_abbrev','date','year','house'),
    )
    traj = pcd.semantic_trajectory(
        sampled_corpus, target='asylum', time_col='date', freq='Y',
        embedder=pcd.HashEmbedder(dim=128), window=5, baseline_period='2014',
    )
    for _, row in traj.iterrows():
        y = int(str(row['period'])[:4])
        if y in distances:
            distances[y].append(float(row['distance_from_baseline']))

envelope = []
for y in years:
    if not distances[y]: continue
    arr = np.array(distances[y])
    envelope.append({
        'year': y, 'n_resamples': len(arr),
        'median': round(float(np.median(arr)), 5),
        'ci_lower': round(float(np.quantile(arr, 0.025)), 5),
        'ci_upper': round(float(np.quantile(arr, 0.975)), 5),
        'width': round(float(np.quantile(arr, 0.975) - np.quantile(arr, 0.025)), 5),
    })
env_df = pd.DataFrame(envelope)
print(env_df.to_string(index=False))
non_baseline = env_df[env_df['median'] > 0]
if len(non_baseline) > 1:
    y_on_y = non_baseline['median'].diff().abs().dropna()
    max_yoy = float(y_on_y.max())
    print(f'\nMax CI width across {len(env_df)} years: {env_df["width"].max():.5f}')
    print(f'Max year-on-year change in median (post-baseline): {max_yoy:.5f}')
    print(f'Ratio max-CI-width / max-y-y: {env_df["width"].max() / max(1e-9, max_yoy):.2f}')

 year  n_resamples  median  ci_lower  ci_upper   width
 2010           20 0.99534   0.85073   1.12566 0.27493
 2011           20 0.87673   0.70201   0.97021 0.26820
 2012           20 0.90108   0.74194   1.01988 0.27794
 2013           20 1.00365   0.91906   1.13230 0.21324
 2014           20 0.00000  -0.00000   0.00000 0.00000
 2015           20 0.98119   0.91896   1.09430 0.17534
 2016           20 1.02514   0.89112   1.10904 0.21792
 2017           20 1.01428   0.89851   1.10874 0.21023
 2018           20 1.02194   0.93206   1.12291 0.19085
 2019           20 1.05056   0.89175   1.20040 0.30866
 2020           20 0.96849   0.88756   1.09999 0.21243
 2021           20 0.91788   0.76756   1.04485 0.27729
 2022           20 0.99802   0.85976   1.15019 0.29043
 2023           20 0.97996   0.84861   1.19603 0.34742

Max CI width across 14 years: 0.34742
Max year-on-year change in median (post-baseline): 0.11861
Ratio max-CI-width / max-y-y: 2.93

import numpy as np

sbert_loy = pcd.SBERTEmbedder()
years_loy = sorted(df['year'].unique().tolist())
endpoint_year = max(years_loy)

full_traj = pcd.semantic_trajectory(
    corpus, target='asylum', time_col='date', freq='Y',
    embedder=sbert_loy, window=5, baseline_period='2014',
)
full_end = float(full_traj[full_traj['period'].astype(str).str[:4] == str(endpoint_year)]
                 ['distance_from_baseline'].iloc[0])
print(f'Full-corpus {endpoint_year} distance from 2014 baseline: {full_end:.5f}')

loy_rows = []
for drop_year in years_loy:
    if drop_year == 2014:
        continue  # baseline year; dropping it removes the anchor
    df_loy = df[df['year'] != drop_year]
    corpus_loy = pcd.from_dataframe(
        df_loy, text_col='text',
        meta_cols=('member','party_abbrev','date','year','house'),
    )
    traj_loy = pcd.semantic_trajectory(
        corpus_loy, target='asylum', time_col='date', freq='Y',
        embedder=sbert_loy, window=5, baseline_period='2014',
    )
    match = traj_loy[traj_loy['period'].astype(str).str[:4] == str(endpoint_year)]
    if len(match):
        loy_rows.append({'year_dropped': drop_year,
                         'endpoint_distance': round(float(match['distance_from_baseline'].iloc[0]), 5)})

loy_df = pd.DataFrame(loy_rows)
loy_df['delta_vs_full'] = (loy_df['endpoint_distance'] - full_end).round(5)
print(loy_df.to_string(index=False))
print(f'\nMax |delta| in {endpoint_year} endpoint from dropping any single year: '
      f'{loy_df["delta_vs_full"].abs().max():.5f}')
print(f'Full endpoint distance: {full_end:.5f}; '
      f'max leverage as fraction of endpoint: '
      f'{loy_df["delta_vs_full"].abs().max() / max(1e-9, abs(full_end)):.1%}')

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

Full-corpus 2023 distance from 2014 baseline: 0.00906

 year_dropped  endpoint_distance  delta_vs_full
         2010            0.00906           -0.0
         2011            0.00906           -0.0
         2012            0.00906           -0.0
         2013            0.00906           -0.0
         2015            0.00906           -0.0
         2016            0.00906           -0.0
         2017            0.00906           -0.0
         2018            0.00906           -0.0
         2019            0.00906           -0.0
         2020            0.00906           -0.0
         2021            0.00906           -0.0
         2022            0.00906           -0.0

Max |delta| in 2023 endpoint from dropping any single year: 0.00000
Full endpoint distance: 0.00906; max leverage as fraction of endpoint: 0.0%

tr_criminal = pcd.track(corpus, 'criminal').over_time(freq='Q', time_col='date')
bursts = tr_criminal.burstiness(s=2.0, gamma=1.0, n_states=4)
print(bursts.summary())
bursts.bursts

BurstinessResult(target='criminal', 3 burst(s), max intensity = state 1, base rate p_0 = 3.33e-04)

bursts.plot(width=1100, height=400)

s_sensitivity = []
for s in [1.5, 2.0, 2.5, 3.0]:
    b = tr_criminal.burstiness(s=s, gamma=1.0, n_states=4)
    s_sensitivity.append({
        's': s,
        'n_bursts': len(b.bursts),
        'periods_at_state_ge_1': int((b.states >= 1).sum()),
        'max_state_reached': int(b.states.max()),
        'first_burst_start': str(b.bursts['start'].iloc[0]) if len(b.bursts) else 'none',
    })
pd.DataFrame(s_sensitivity)

tr_migrant = pcd.track(corpus, 'migrant').over_time(freq='Q', time_col='date')
bursts_migrant = tr_migrant.burstiness(s=2.0, gamma=1.0, n_states=4)
print(bursts_migrant.summary())
bursts_migrant.bursts

BurstinessResult(target='migrant', 4 burst(s), max intensity = state 3, base rate p_0 = 1.49e-04)

from pycorpdiff import kleinberg_bursts
import numpy as np

traj_df = tr_criminal.to_df()
counts_obs = traj_df['count'].values.astype(int)
totals_obs = traj_df['total'].values.astype(int)

obs_states = kleinberg_bursts(counts_obs, totals_obs, s=2.0, gamma=1.0, n_states=4)

def n_bursts_from_states(states):
    states = np.asarray(states)
    transitions = np.diff((states >= 1).astype(int))
    return int((transitions == 1).sum()) + (1 if states[0] >= 1 else 0)

obs_n = n_bursts_from_states(obs_states)
obs_periods_high = int((obs_states >= 1).sum())
mean_burst_len_obs = obs_periods_high / max(1, obs_n)
print(f'Observed n_bursts on criminal: {obs_n}')
print(f'Observed periods at state >= 1: {obs_periods_high} / {len(obs_states)}')
print(f'Observed mean burst length: {mean_burst_len_obs:.2f} quarters')

rng_t = np.random.default_rng(0)
null_n, null_len = [], []
for trial in range(100):
    counts_shuf = rng_t.permutation(counts_obs)
    states_shuf = kleinberg_bursts(counts_shuf, totals_obs, s=2.0, gamma=1.0, n_states=4)
    n = n_bursts_from_states(states_shuf)
    periods = int((states_shuf >= 1).sum())
    null_n.append(n)
    null_len.append(periods / max(1, n))
null_n = np.array(null_n)
null_len = np.array(null_len)
p_lower = float((null_n <= obs_n).mean())
p_len_upper = float((null_len >= mean_burst_len_obs).mean())
print(f'\nPermutation null (100 trials, same totals, shuffled counts):')
print(f'  n_bursts: median {np.median(null_n):.1f}, 5th pct {np.quantile(null_n,0.05):.1f}, min {null_n.min()}')
print(f'Empirical p-value, observed n_bursts <= null (lower tail): {p_lower:.3f}')
print(f'  mean_burst_len: observed {mean_burst_len_obs:.2f}, null median {np.median(null_len):.2f}')
print(f'Empirical p-value, observed mean_burst_len >= null (upper tail): {p_len_upper:.3f}')

Observed n_bursts on criminal: 3
Observed periods at state >= 1: 4 / 56
Observed mean burst length: 1.33 quarters

Permutation null (100 trials, same totals, shuffled counts):
  n_bursts: median 9.0, 5th pct 6.0, min 6
Empirical p-value, observed n_bursts <= null (lower tail): 0.000
  mean_burst_len: observed 1.33, null median 1.70
Empirical p-value, observed mean_burst_len >= null (upper tail): 0.880

tr_illegal = pcd.track(corpus, 'illegal').over_time(freq='Q', time_col='date')
cps = tr_illegal.changepoints(target='illegal', method='pelt')
print('PELT changepoints on rate of "illegal" per quarter:')
cps

PELT changepoints on rate of "illegal" per quarter:

penalty_grid = [1.0, 3.0, 5.0, 10.0, 20.0, 50.0]
cp_rows = []
for pen in penalty_grid:
    cps_p = tr_illegal.changepoints(target='illegal', method='pelt', penalty=pen)
    cp_rows.append({
        'penalty': pen,
        'n_changepoints': len(cps_p),
        'changepoints': ', '.join(str(p) for p in cps_p['period'].tolist()),
    })
pd.DataFrame(cp_rows)

import warnings
with warnings.catch_warnings(record=True) as _cw:
    warnings.simplefilter('always')
    impact = tr_illegal.causal_impact(
        event_date='2016-06-23', target='illegal', level=0.95, seed=0,
    )
print(impact.summary())
impact.plot(width=1100, height_per_panel=260)

CausalImpactResult(target='illegal', event=2016-06-23, pre=26, post=30)
  avg effect:        -0.0003 per period  (95% CrI [-0.0006, +0.0000])
  cumulative effect: -0.0080
  relative effect:   -40.3% vs counterfactual mean
  P(no effect):      0.100

import warnings
with warnings.catch_warnings(record=True) as _cw:
    warnings.simplefilter('always')
    impact_migrant = tr_migrant.causal_impact(
        event_date='2016-06-23', target='migrant', level=0.95, seed=0,
    )
    print(impact_migrant.summary())

CausalImpactResult(target='migrant', event=2016-06-23, pre=26, post=30)
  avg effect:        -0.0017 per period  (95% CrI [-0.0038, +0.0004])
  cumulative effect: -0.0512
  relative effect:   -90.6% vs counterfactual mean
  P(no effect):      0.126

import warnings
with warnings.catch_warnings(record=True) as _cw:
    warnings.simplefilter('always')
    placebo_dates = ['2014-06-01', '2015-03-01', '2017-09-01', '2018-04-15', '2019-04-15']
    rows = [{'date': '2016-06-23 (referendum, real)',
             'p_no_effect': impact.metrics['posterior_prob_no_effect'],
             'pre': impact.n_pre, 'post': impact.n_post}]
    for d in placebo_dates:
        imp_p = tr_illegal.causal_impact(event_date=d, target='illegal', level=0.95, seed=0)
        rows.append({'date': d + ' (placebo)',
                     'p_no_effect': imp_p.metrics['posterior_prob_no_effect'],
                     'pre': imp_p.n_pre, 'post': imp_p.n_post})
    placebo_df = pd.DataFrame(rows)
    print(placebo_df.to_string(index=False))
    real_p = placebo_df['p_no_effect'].iloc[0]
    placebo_p = placebo_df['p_no_effect'].iloc[1:]
    print(f'\nReal P(no effect):            {real_p:.3f}')
    print(f'Placebo P(no effect) median:  {placebo_p.median():.3f}')
    print(f'Placebo P(no effect) min:     {placebo_p.min():.3f}')
    print(f'Median placebo / real ratio:  {placebo_p.median() / max(1e-10, real_p):.2f}')
    print(f'Placebos with P(no effect) < real: {int((placebo_p < real_p).sum())} of {len(placebo_p)}')

                         date  p_no_effect  pre  post
2016-06-23 (referendum, real)        0.100   26    30
         2014-06-01 (placebo)        0.938   18    38
         2015-03-01 (placebo)        0.978   21    35
         2017-09-01 (placebo)        0.172   31    25
         2018-04-15 (placebo)        0.622   34    22
         2019-04-15 (placebo)        0.092   38    18

Real P(no effect):            0.100
Placebo P(no effect) median:  0.622
Placebo P(no effect) min:     0.092
Median placebo / real ratio:  6.22
Placebos with P(no effect) < real: 1 of 5

import warnings
with warnings.catch_warnings(record=True) as _cw:
    warnings.simplefilter('always')
    tr_committee = pcd.track(corpus, 'committee').over_time(freq='Q', time_col='date')
    impact_committee = tr_committee.causal_impact(
        event_date='2016-06-23', target='committee', level=0.95, seed=0,
    )
    tr_fisheries = pcd.track(corpus, 'fisheries').over_time(freq='Q', time_col='date')
    impact_fisheries = tr_fisheries.causal_impact(
        event_date='2016-06-23', target='fisheries', level=0.95, seed=0,
    )

    print('Donor-series checks at 2016-06-23 (real referendum):')
    print(f'  committee (procedural):    P(no effect) = {impact_committee.metrics["posterior_prob_no_effect"]:.3f}')
    print(f'  fisheries (policy-domain): P(no effect) = {impact_fisheries.metrics["posterior_prob_no_effect"]:.3f}')
    print(f'  illegal   (focal):         P(no effect) = {impact.metrics["posterior_prob_no_effect"]:.3f}')
    print()
    print(f'committee / illegal ratio: {impact_committee.metrics["posterior_prob_no_effect"] / max(1e-10, impact.metrics["posterior_prob_no_effect"]):.1f}')
    print(f'fisheries / illegal ratio: {impact_fisheries.metrics["posterior_prob_no_effect"] / max(1e-10, impact.metrics["posterior_prob_no_effect"]):.1f}')

Donor-series checks at 2016-06-23 (real referendum):
  committee (procedural):    P(no effect) = 0.804
  fisheries (policy-domain): P(no effect) = 0.856
  illegal   (focal):         P(no effect) = 0.100

committee / illegal ratio: 8.0
fisheries / illegal ratio: 8.6

import warnings
with warnings.catch_warnings(record=True) as _cw:
    warnings.simplefilter('always')
    loy_rows = []
    all_years = sorted(df['year'].unique().tolist())
    for drop_year in all_years:
        df_loy = df[df['year'] != drop_year].copy()
        df_loy['year'] = df_loy['date'].dt.year
        corpus_loy = pcd.from_dataframe(
            df_loy, text_col='text',
            meta_cols=('member','party_abbrev','date','year','house'),
        )
        tr_loy = pcd.track(corpus_loy, 'illegal').over_time(freq='Q', time_col='date')
        try:
            imp_loy = tr_loy.causal_impact(
                event_date='2016-06-23', target='illegal', level=0.95, seed=0,
            )
            loy_rows.append({
                'year_dropped': drop_year,
                'n_pre': imp_loy.n_pre, 'n_post': imp_loy.n_post,
                'p_no_effect': imp_loy.metrics['posterior_prob_no_effect'],
                'avg_effect': imp_loy.metrics['avg_effect'],
            })
        except Exception as e:
            print(f'  warn dropping {drop_year}: {e}')

    loy_df = pd.DataFrame(loy_rows)
    print(loy_df.to_string(index=False))
    print(f'\nFull-corpus P(no effect): {impact.metrics["posterior_prob_no_effect"]:.3f}')
    print(f'LOY range: [{loy_df["p_no_effect"].min():.3f}, {loy_df["p_no_effect"].max():.3f}]')
    print(f'LOY median: {loy_df["p_no_effect"].median():.3f}')
    print(f'Max single-year leverage (any drop changes P(no effect) by): '
          f'{(loy_df["p_no_effect"] - impact.metrics["posterior_prob_no_effect"]).abs().max():.3f}')

_nconv = sum(1 for _x in _cw if 'converge' in str(_x.message).lower())
print(f'\n({_nconv} of {len(loy_rows)} leave-one-year-out BSTS fits did not '
      f'fully converge -- non-convergence is itself consistent with the '
      f'single-year-leverage instability this section documents.)')

 year_dropped  n_pre  n_post  p_no_effect  avg_effect
         2010     22      30        0.442   -0.000247
         2011     22      30        0.416   -0.000290
         2012     22      30        0.418   -0.000295
         2013     22      30        0.294   -0.000335
         2014     22      30        0.230   -0.000347
         2015     22      30        0.946    0.000022
         2016     24      28        0.098   -0.000459
         2017     26      26        0.202   -0.000192
         2018     26      26        0.144   -0.000221
         2019     26      26        0.186   -0.000198
         2020     26      26        0.144   -0.000223
         2021     26      26        0.086   -0.000262
         2022     26      26        0.112   -0.000244
         2023     26      26        0.030   -0.000327

Full-corpus P(no effect): 0.100
LOY range: [0.030, 0.946]
LOY median: 0.194
Max single-year leverage (any drop changes P(no effect) by): 0.846

(12 of 14 leave-one-year-out BSTS fits did not fully converge -- non-convergence is itself consistent with the single-year-leverage instability this section documents.)

ld_traj = pcd.lexical_diversity(
    corpus, freq='Y', time_col='date',
    ci='bootstrap', n_boot=99, seed=0,
)
ld_traj

LexicalDiversityTrajectory(label='corpus', freq='Y', periods=14, metrics=['HD-D', 'MATTR', 'MTLD', 'TTR'])

ld_traj.plot(width=1000)

tr_asylum = pcd.track(corpus, 'asylum').over_time(freq='Q', time_col='date')
fc = tr_asylum.forecast(horizon=4, level=0.95, target='asylum')
print(fc.summary())
fc.plot(width=1100, height=440)

ForecastResult(targets=['asylum'], freq='Q', horizon=4, level=0.95, method=auto)

import numpy as np

asylum_q = tr_asylum.to_df().sort_values('period').reset_index(drop=True)
# Withhold last 4 quarters
n_periods = len(asylum_q)
holdout = 4
train_q = asylum_q.iloc[:-holdout].copy()
test_q  = asylum_q.iloc[-holdout:].copy()

# Build a TemporalTrajectory-like object from train via direct re-track
# (cheapest path: re-fit on the truncated data via a date filter on the corpus)
train_max_date = train_q['period'].iloc[-1].to_timestamp(how='end')
df_train = df[df['date'] <= train_max_date].copy()
corpus_train = pcd.from_dataframe(
    df_train, text_col='text',
    meta_cols=('member','party','party_abbrev','date','year','house','debate_title','hansard_id'),
)
tr_asylum_train = pcd.track(corpus_train, 'asylum').over_time(freq='Q', time_col='date')
fc_train = tr_asylum_train.forecast(horizon=holdout, level=0.95, target='asylum')

# ETS predictions (point forecast)
fc_table = fc_train.to_df()
ets_pred = fc_table['point'].values
test_actual = test_q['relfreq'].values

# Naive baseline: seasonal-mean (same-quarter mean over training)
train_q['q_index'] = train_q['period'].dt.quarter
seasonal_mean = train_q.groupby('q_index')['relfreq'].mean()
test_q['q_index'] = test_q['period'].dt.quarter
naive_pred = test_q['q_index'].map(seasonal_mean).values

rmse = lambda y, yhat: float(np.sqrt(np.mean((y - yhat) ** 2)))
mae  = lambda y, yhat: float(np.mean(np.abs(y - yhat)))

print(f'Held-out: {holdout} quarters ({test_q["period"].iloc[0]} to {test_q["period"].iloc[-1]})')
print(f'\n                ETS         naive seasonal-mean')
print(f'  RMSE         {rmse(test_actual, ets_pred):.6f}    {rmse(test_actual, naive_pred):.6f}')
print(f'  MAE          {mae(test_actual,  ets_pred):.6f}    {mae(test_actual,  naive_pred):.6f}')
print(f'\nETS beats naive on RMSE: {rmse(test_actual, ets_pred) < rmse(test_actual, naive_pred)}')

# Interval coverage
ets_lower = fc_table['ci_lower'].values
ets_upper = fc_table['ci_upper'].values
in_interval = ((test_actual >= ets_lower) & (test_actual <= ets_upper)).sum()
print(f'\n95% PI coverage: {in_interval}/{holdout} actuals fall inside the predicted band')

Held-out: 4 quarters (2023Q1 to 2023Q4)

                ETS         naive seasonal-mean
  RMSE         0.000953    0.001313
  MAE          0.000565    0.001267

ETS beats naive on RMSE: True

95% PI coverage: 4/4 actuals fall inside the predicted band

parties = ['Con', 'Lab', 'LD', 'SNP']
slices = [commons.slice(party_abbrev=p) for p in parties]
nway = pcd.keyness_multi(slices, labels=parties, min_count=20)
print(f'{len(nway):,} terms across {len(parties)} parties (Commons-only)')
nway.head(12)

5,834 terms across 4 parties (Commons-only)

top_n = 18
# Keep the term-indexed frame so the rate divisions stay aligned;
# resetting and re-indexing via Series triggers pandas index alignment
# on the count columns (term-string label vs int-position label = no
# overlap = silent all-NaN). See pandas docs on DataFrame-from-dict
# index semantics.
top_terms_df = nway.head(top_n)  # index = term
totals = pd.Series({p: commons.slice(party_abbrev=p).total_tokens() for p in parties})
rates = pd.DataFrame({
    p: top_terms_df[f'count_{p}'] / totals[p] * 1_000_000 for p in parties
})  # index inherits from top_terms_df = term
z = rates.sub(rates.mean(axis=1), axis=0).div(rates.std(axis=1).replace(0, 1), axis=0)

heat = z.reset_index().melt('term', var_name='party', value_name='z')
alt.Chart(heat).mark_rect().encode(
    x=alt.X('party:N', title=None, sort=parties),
    y=alt.Y('term:N', sort=top_terms_df.index.tolist()),
    color=alt.Color(
        'z:Q',
        scale=alt.Scale(scheme='redblue', domainMid=0, reverse=True),
        title='Row z-score (rate / million)',
        legend=alt.Legend(labelExpr=r"replace(datum.label, '\u2212', '-')"),
    ),
    tooltip=['party', 'term', alt.Tooltip('z:Q', format='.2f')],
).properties(
    width=620, height=740,
    title='Party-distinctive asylum vocabulary (Commons-only, z-scored per term)',
)

# Without a stop-word filter, ``cooccurrence_network`` ranks vocabulary
# by raw frequency -- on any English corpus the top-30 is entirely
# closed-class function words (the, and, of, to, ...). For an
# interpretable discourse graph we need to exclude those.
ENGLISH_STOPWORDS = {
    'the', 'and', 'of', 'to', 'a', 'in', 'that', 'is', 'it', 'for',
    'on', 'with', 'as', 'by', 'this', 'be', 'are', 'was', 'were', 'has',
    'have', 'had', 'or', 'but', 'not', 'if', 'so', 'we', 'i', 'they',
    'he', 'she', 'his', 'her', 'their', 'our', 'my', 'me', 'us', 'them',
    'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can',
    'may', 'might', 'must', 'shall', 'an', 'at', 'from', 'up', 'down',
    'out', 'into', 'about', 'over', 'under', 'than', 'then', 'when',
    'where', 'why', 'how', 'what', 'who', 'which', 'no', 'yes', 'one',
    'two', 'three', 'first', 'second', 'last', 'all', 'any', 'some',
    'many', 'much', 'more', 'most', 'few', 'less', 'least', 'very',
    'just', 'only', 'also', 'too', 'still', 'now', 'here', 'there',
    'been', 'being', 'am', 'you', 'your', 'its', "it's", 'because',
    'though', 'although', 'while', 'after', 'before', 'between', 'such',
    'these', 'those', 'each', 'every', 'other', 'another', 'same',
    'own', 'said', 'say', 'says', 'get', 'got', 'go', 'going', 'see',
    'make', 'made', 'take', 'taken', 'come', 'came', 'know', 'known',
    'think', 'thought', 'want', 'wanted', 'find', 'found', 'give',
    'given', 'put', 'use', 'used', 'work', 'works', 'point', 'place',
    'way', 'time', 'year', 'years', 'day', 'man', 'people', 'new', 'old',
    'good', 'great', 'right', 'long', 'little', 'big', 'high', 'large',
    'small', 'different', 'important', 'sure', 'able', 'mr', 'mrs', 'ms',
}
net = pcd.cooccurrence_network(
    corpus, top_n=30, window=5, measure='PMI', min_count=10,
    stop_words=ENGLISH_STOPWORDS,
)
# Filter to top-60 strongest edges by weight (PMI) to break the
# complete-graph problem and reveal community structure. The full
# edge list is preserved on net.edges; the filtered version is what
# we use for modularity + plotting.
top_edges = net.edges.nlargest(60, 'weight').reset_index(drop=True)
print(f'Network: {net.edges.shape[0]} edges total; filtered to top {len(top_edges)} by PMI.')
net.plot(width=1100, height=820, max_edges=60, label_top_n=30).properties(
    title='Asylum-discourse co-occurrence (PMI, top-30 content terms, top-60 PMI edges)'
)

Network: 435 edges total; filtered to top 60 by PMI.

import networkx as nx
import numpy as np

# Build observed graph from net.edges, restricted to top-60 by weight.
top_edges = net.edges.nlargest(60, 'weight').reset_index(drop=True)
g_obs = nx.Graph()
for _, row in top_edges.iterrows():
    g_obs.add_edge(row['source'], row['target'], weight=float(row.get('weight', 1.0)))
n_nodes = g_obs.number_of_nodes()
n_edges = g_obs.number_of_edges()
print(f'Filtered network: {n_nodes} nodes, {n_edges} edges (top {n_edges} by PMI)')
comms_obs = list(nx.community.greedy_modularity_communities(g_obs))
mod_obs = nx.community.modularity(g_obs, comms_obs)
print(f'Observed modularity (greedy): {mod_obs:.3f}')
print(f'Observed community count:     {len(comms_obs)}')
for i, c in enumerate(comms_obs[:5]):
    print(f'  community {i}: {sorted(c)[:8]}')

rng_g = np.random.default_rng(0)
null_mods = []
for trial in range(50):
    g_rand = nx.gnm_random_graph(n_nodes, n_edges, seed=int(rng_g.integers(2**31 - 1)))
    if g_rand.number_of_edges() == 0:
        continue
    try:
        comms_r = list(nx.community.greedy_modularity_communities(g_rand))
        null_mods.append(nx.community.modularity(g_rand, comms_r))
    except Exception:
        pass
null_mods = np.array(null_mods)
p_above = float((null_mods >= mod_obs).mean())
print(f'\nErdos-Renyi null (50 trials, matched n, m):')
print(f'  mean modularity {null_mods.mean():.3f}, 95th pct {np.quantile(null_mods, 0.95):.3f}')
print(f'Empirical p-value (one-sided, upper tail): {p_above:.3f}')

Filtered network: 30 nodes, 60 edges (top 60 by PMI)
Observed modularity (greedy): 0.486
Observed community count:     5
  community 0: ['asylum', 'children', 'country', 'government', 'local', 'refugees', 'seekers', 'support']
  community 1: ['amendment', 'friend', 'hon', 'house', 'lord', 'lords', 'member', 'noble']
  community 2: ['home', 'minister', 'office', 's', 'secretary']
  community 3: ['act', 'bill', 'human', 'rights']
  community 4: ['immigration', 'legal', 'need', 'system']

Erdos-Renyi null (50 trials, matched n, m):
  mean modularity 0.362, 95th pct 0.402
Empirical p-value (one-sided, upper tail): 0.000

import networkx as nx
import numpy as np

edge_grid = [30, 60, 100, 200]
results = []
prev_nodes = None
for k in edge_grid:
    k_actual = min(k, len(net.edges))
    edges_k = net.edges.nlargest(k_actual, 'weight').reset_index(drop=True)
    g = nx.Graph()
    for _, r in edges_k.iterrows():
        g.add_edge(r['source'], r['target'], weight=float(r.get('weight', 1.0)))
    if g.number_of_edges() == 0:
        continue
    comms = list(nx.community.greedy_modularity_communities(g))
    mod = nx.community.modularity(g, comms)
    node_set = set(g.nodes())
    j = len(node_set & prev_nodes) / max(1, len(node_set | prev_nodes)) if prev_nodes else np.nan
    results.append({'top_edges': k_actual, 'nodes': g.number_of_nodes(),
                    'edges': g.number_of_edges(), 'modularity': round(mod, 3),
                    'communities': len(comms), 'jaccard_vs_prev_nodes': round(j, 2) if not np.isnan(j) else '--'})
    prev_nodes = node_set

import pandas as pd
print(pd.DataFrame(results).to_string(index=False))

 top_edges  nodes  edges  modularity  communities jaccard_vs_prev_nodes
        30     26     30       0.645            5                    --
        60     30     60       0.486            5                  0.87
       100     30    100       0.374            4                   1.0
       200     30    200       0.242            2                   1.0

import numpy as np

def inject(base_df, term, year_set, rate, rng):
    out = base_df.copy()
    mask = out['date'].dt.year.isin(year_set)
    target = out.index[mask].to_numpy()
    n_inj = int(len(target) * rate)
    if n_inj == 0:
        return out
    chosen = rng.choice(target, size=n_inj, replace=False)
    out.loc[chosen, 'text'] = out.loc[chosen, 'text'] + ' synthcampaign'
    return out

rng_inj = np.random.default_rng(0)
inject_years = {2021, 2022}
results = []
for rate in [0.0, 0.002, 0.005, 0.01, 0.02, 0.05]:
    df_inj = inject(df, 'synthcampaign', inject_years, rate, rng_inj)
    corpus_inj = pcd.from_dataframe(
        df_inj, text_col='text',
        meta_cols=('member','party_abbrev','date','year','house'),
    )
    tr_s = pcd.track(corpus_inj, 'synthcampaign').over_time(freq='Q', time_col='date')
    try:
        b = tr_s.burstiness(s=2.0, gamma=1.0, n_states=4)
        burst_in_window = any(
            (str(s)[:4] in ('2021', '2022')) for s in b.bursts['start'].astype(str)
        ) if len(b.bursts) else False
        n_bursts = len(b.bursts)
    except Exception:
        burst_in_window, n_bursts = False, 0
    n_injected = int(df['date'].dt.year.isin(inject_years).sum() * rate)
    results.append({'inject_rate': rate, 'n_docs_injected': n_injected,
                    'n_bursts': n_bursts, 'detected_in_2021_2022': burst_in_window})

import pandas as pd
mde = pd.DataFrame(results)
print(mde.to_string(index=False))
detected = mde[mde['detected_in_2021_2022']]
if len(detected):
    print(f'\nMinimum detectable injection rate: {detected["inject_rate"].min():.3f} '
          f'({detected["n_docs_injected"].min()} injected docs)')
print(f'False positive at rate 0.0 (no injection): '
      f'{bool(mde[mde["inject_rate"]==0.0]["detected_in_2021_2022"].iloc[0])}')

 inject_rate  n_docs_injected  n_bursts  detected_in_2021_2022
       0.000                0         0                  False
       0.002                5         0                  False
       0.005               14         1                   True
       0.010               28         1                   True
       0.020               56         1                   True
       0.050              141         1                   True

Minimum detectable injection rate: 0.005 (14 injected docs)
False positive at rate 0.0 (no injection): False

	term	count_a	count_b	g2	log_ratio
10	asylum	22680	18	4176.656048	6.958828
11	government	20022	7	3778.018044	8.081561
13	people	22817	143	3354.705585	4.012063
16	are	45302	1513	2610.234891	1.602747
17	uk	13087	0	2529.589226	11.375019
21	the	321748	24096	2258.020120	0.438155
23	is	76103	3962	2060.996862	0.962597
26	minister	10327	3	1955.551752	8.225965
28	noble	12178	58	1893.985777	4.400801
29	immigration	9725	0	1879.747477	10.946673
30	bill	10854	28	1841.675257	5.272232
34	that	112921	7167	1706.721744	0.676823
35	children	11829	84	1687.276294	3.828339
42	support	9387	21	1616.215981	5.469377
43	hon	8466	4	1586.210180	7.576742

	term	count_a	count_b	g2	g2_ci_lower	g2_ci_upper	p_adjusted
0	data	986	201	504.961821	-4318.385260	5483.793157	4.047150e-108
1	home	1595	2891	-492.144650	-1417.326595	399.905648	1.244209e-105
2	minister	1438	2643	-465.390569	-1534.010416	550.276684	5.501289e-100
3	we	11791	8022	455.381601	-986.505813	1884.232799	6.217937e-98
4	government	2966	4446	-429.839630	-1295.913055	390.429773	1.801289e-92
5	office	1044	1903	-328.222229	-1228.698306	540.474418	1.998025e-70
6	protection	1072	351	327.470807	-2338.759059	3081.300045	2.496422e-70
7	hon	3822	2193	326.677266	-477.977210	1114.644529	3.252108e-70
8	she	787	1509	-292.912903	-1192.674794	581.098507	6.552243e-63
9	gdpr	236	2	288.469385	-2242.022109	2853.508152	5.480589e-62
10	european	1156	462	254.663286	-581.886660	1097.422784	1.162000e-54
11	illegal	774	252	238.300268	-254.488766	738.823362	3.934666e-51
12	lady	461	94	235.974566	-236.638904	706.831221	1.167556e-50
13	women	395	885	-233.983317	-1085.154850	594.457662	2.946538e-50
14	amendment	1742	868	232.772266	-1782.793157	2352.428469	5.051757e-50

	l1_pre	l1_post
covariate
year	0.099974	0.0

	term	count_a	count_b	g2	log_ratio
0	we	8359	7858	465.741626	0.487035
1	minister	1000	2580	-354.794762	-0.969057
2	home	1177	2833	-324.854168	-0.868986
3	government	2066	4326	-315.695679	-0.668138
4	hon	2698	2154	304.035797	0.722677
5	lady	366	94	255.636973	2.353300
6	illegal	560	244	230.113706	1.594753
7	office	777	1869	-213.752008	-0.867865
8	under	1739	1414	183.313362	0.696253
9	women	277	872	-181.426560	-1.254795
10	trent	152	13	179.202827	3.895651
11	make	1376	1053	178.468142	0.783687

	collocate	count_a	count_b	score_a	score_b	shift
0	missing	0	16	1.716333	6.973938	-5.257605
1	lgbtq	0	15	1.730581	6.883405	-5.152823
2	chaos	0	14	1.728829	6.784174	-5.055345
3	fled	0	13	1.719519	6.667425	-4.947905
4	tamil	0	11	1.727370	6.471673	-4.744303
5	failure	0	11	1.709405	6.404621	-4.695216
6	14	0	10	1.672447	6.313166	-4.640719
7	tories	0	10	1.731166	6.333311	-4.602145
8	fifth	12	0	6.367150	1.959710	4.407440
9	pressures	12	0	6.342932	1.943701	4.399231
10	experience	1	24	3.263598	7.497716	-4.234118
11	queue	10	0	6.112127	1.959025	4.153101

	Section	Predicted outcome	Falsifier
0	5.0 Corpus conditioning caveat	Notebook scope is asylum-conditioned discourse...	methodological framing, not a quantitative test
1	5.1a Duplicate speech detection	No mass duplication in the cached corpus	< 1% of contributions in exact-duplicate groups
2	5.2 Reference-baseline keyness	Hansard-distinctive vocabulary should reflect ...	asylum, refugees, Home Office, Government, Sec...
3	5.3 Con-vs-Lab keyness with CI	Direction-confident divergence on enforcement ...	Con: borders, illegal, criminal; Lab: rights, ...
4	5.3b Shuffled-label null (sum-top-10)	Sum of top-10 \|G^2\| exceeds 95th pct of label-...	p_perm < 0.05
5	5.3c Bootstrap CI / BH alignment	BH-sig with CI excluding zero >= 70%; BH-non-s...	either alignment falls below its threshold
6	5.3d Formal CI coverage MC	Simultaneous-CI top-G^2 coverage within +/-10%...	simultaneous coverage outside [85%, 100%]
7	5.3e min_count sweep	Top-15 keyness terms stable across min_count {...	Jaccard overlap < 0.6 vs canonical min_count=20
8	5.3f Top-K-speaker leverage	Top-15 keyness stable after excluding top-10 p...	Jaccard overlap < 0.5 vs canonical full-corpus...
9	5.4 CEM-matched keyness	L1 imbalance on year drops sharply post-match;...	L1 imbalance reduction > 0.05 absolute
10	5.5 Collocation shift on asylum	Distinct collocate clusters by party (enforcem...	KWIC evidence retrievable via .explain()
11	5.5b Chamber-stratified collocation	Commons-only and Lords-only top-12 collocates ...	Jaccard < 0.4 or directional reversal between ...
12	5.6 Semantic trajectory of asylum	Distance from 2014 baseline increases over tim...	cosine distance monotonic-ish increase post-2020
13	5.6c Bootstrap stability of trajectory	CI envelope widths small relative to year-on-y...	max CI width > max year-on-year median change
14	5.6d Leave-year-out SBERT trajectory	Endpoint distance robust to dropping any singl...	non-zero leverage on a middle-year drop
15	5.7 Burstiness of criminal	Bursts coincide with referendum 2016, Channel ...	state >= 1 in 2016Q2-Q3 OR 2021-Q3 OR 2023-Q4
16	5.7c Permuted-time null (revised direction)	Observed n_bursts in lower tail of permutation...	p_lower > 0.05 on n_bursts
17	5.8 Changepoint + causal-impact on illegal (26...	2016-06-23 referendum effect on the rate of "i...	PELT changepoint in 2016Q2-Q3 OR causal_impact...
18	5.8c Placebo-date sweep	Median placebo P(no effect) > 2x real-referend...	placebo / real ratio <= 2 (model fires on arbi...
19	5.8d Donor-series check (2 controls)	Both committee and fisheries P(no effect) >> i...	either control returns P(no effect) within 0.1...
20	5.8e Leave-one-year-out for causal_impact	Max single-year leverage on P(no effect) < 0.30	dropping any year shifts P(no effect) by >= 0.30
21	5.9 Lexical diversity over time	Diversity (MTLD/HD-D) trend within +/-10% acro...	no abrupt year-on-year shifts > 2 sigma
22	5.10 Forecast of asylum rate	ETS prediction-interval widens monotonically w...	PI band width(t+4) > PI band width(t+1)
23	5.10b Rolling-origin backtest	ETS beats naive seasonal-mean on RMSE; 95% PI ...	ETS RMSE >= naive OR 95% PI coverage < 50%
24	5.11 N-way keyness by party	SNP signature: scotland, glasgow, scottish	top-5 SNP terms include at least 2 Scotland-sp...
25	5.12 Co-occurrence network (top-60 PMI edges)	Distinct enforcement / process / humanitarian ...	>= 3 visible community structures from greedy ...
26	5.12b Random-graph modularity null	Observed network modularity > Erdos-Renyi 95th...	observed modularity <= ER 95th pct
27	5.12c Edge-threshold sensitivity	Modularity strictly positive across sweep; nod...	modularity drops to 0 OR Jaccard < 0.7 between...

party_abbrev	Con	Lab	LD	SNP		XB
year
2010	93	92	38	2	10	27
2011	115	61	55	0	17	17
2012	67	47	59	3	56	12
2013	95	75	60	0	22	19
2014	169	103	83	1	80	48
2015	323	177	44	63	86	49
2016	388	200	71	76	170	82
2017	146	92	33	49	9	10
2018	170	136	39	39	6	19
2019	194	127	36	60	6	9
2020	330	162	57	81	6	44
2021	508	339	33	193	23	28
2022	511	416	102	106	97	127
2023	559	479	161	163	45	95

	corpus	left	keyword	right
0	house='Commons', party_abbrev='Lab'	checks on 100 000 missing	asylum	and immigration cases have been
1	house='Commons', party_abbrev='Lab'	hunting for 10 000 missing	asylum	seekers that hmrc had lost
2	house='Commons', party_abbrev='Lab'	just in 2015 593 unaccompanied	asylum	seeking children went missing from

	period	target	n_contexts	similarity_to_baseline	distance_from_baseline
0	2010	asylum	523	0.995467	4.532780e-03
1	2011	asylum	474	0.994972	5.027790e-03
2	2012	asylum	573	0.986678	1.332249e-02
3	2013	asylum	569	0.993668	6.331703e-03
4	2014	asylum	1113	1.000000	2.220446e-16
5	2015	asylum	1735	0.997766	2.234452e-03
6	2016	asylum	2313	0.995231	4.769155e-03
7	2017	asylum	825	0.992665	7.335073e-03
8	2018	asylum	864	0.992497	7.503316e-03
9	2019	asylum	935	0.996479	3.521314e-03
10	2020	asylum	1674	0.993947	6.052784e-03
11	2021	asylum	3325	0.993062	6.937967e-03
12	2022	asylum	3874	0.993886	6.114127e-03
13	2023	asylum	3883	0.990937	9.063429e-03

	start	end	n_periods	max_state	total_count	mean_relfreq
0	2010Q3	2010Q3	1	1	12	0.001478
1	2021Q3	2021Q4	2	1	203	0.000514
2	2023Q4	2023Q4	1	1	151	0.000852

	start	end	n_periods	max_state	total_count	mean_relfreq
0	2014Q1	2014Q1	1	1	33	0.000271
1	2016Q1	2016Q2	2	2	126	0.000399
2	2019Q3	2019Q4	2	3	41	0.000828
3	2020Q2	2020Q3	2	1	44	0.000287

	penalty	n_changepoints	changepoints
0	1.0	5	2011Q2, 2015Q1, 2016Q2, 2020Q1, 2022Q3
1	3.0	1	2020Q1
2	5.0	1	2020Q1
3	10.0	0
4	20.0	0
5	50.0	0

	count_Con	count_Lab	count_LD	count_SNP	g2	p_value	p_adjusted
term
scotland	207	127	12	812	1629.268975	0.000000e+00	0.000000e+00
scottish	127	96	4	519	1028.807812	1.013330e-222	2.955883e-219
glasgow	150	121	15	544	979.177023	5.918677e-212	1.150985e-208
data	986	201	55	76	652.704290	3.775951e-141	5.507224e-138
government	2966	4446	532	2224	636.528892	1.213493e-137	1.415903e-134
home	1595	2891	241	1351	610.685825	4.862017e-132	4.727501e-129
we	11791	8022	1271	3486	601.733306	4.242626e-130	3.535926e-127
act	1997	1046	37	299	552.472078	2.023447e-119	1.475599e-116
insert	750	316	0	32	479.437718	1.363426e-103	8.838031e-101
page	693	265	1	22	478.309183	2.394309e-103	1.394275e-100
section	1368	660	46	133	478.121848	2.628903e-103	1.394275e-100
minister	1438	2643	264	944	474.951572	1.278653e-102	6.216386e-100

§	Audit seam (method exercised)	Pre-registered expectation	Observed	Executed	Prediction held
0c	`keyness` G² vs Rayson LL Wizard	agreement < 0.01 absolute on 6 reference cases	worst abs error 2.85 × 10⁻³ across 6 cases	✔	✔
5.0	scope and corpus conditioning framing	explicit caveat in place before §5.1	added; the case study is asylum-conditioned not migration-broad	✔	✔
5.1	corpus ingestion + profile	Hansard contributions 2010-2023, four-party mix	10 124 docs, 2010-01-05 → 2023-12-19, 4.95 M tokens	✔	✔
5.1a	duplicate speech detection	< 1 % in exact-duplicate groups	0.23 %	✔	✔
5.1b	institutional-drift audit	contribution-length CV < 0.3 AND chamber balance stable	length CV 0.13 ✓; Commons-share range 48 pp across years	✔	✗ chamber-composition drift — reinforces §5.5b
5.2	`against_baseline` vs Gutenberg fiction	Hansard policy/procedure vocabulary dominates	top: asylum, government, people, uk, minister	✔	✔
5.3	`compare.keyness(ci="bootstrap")` Con vs Lab	direction-confident enforcement-vs-rights divergence	Con+: data, we, protection; Lab+: government, home, minister	✔	✔
5.3b	shuffled-label sum-top-10 \|G²\|	p_perm < 0.05	observed 3 911.5, null 95th pct 2 499, p = 0.000	✔	✔
5.3c	BH / bootstrap-CI alignment	non-sig ≥ 95 %, sig ≥ 70 %	non-sig 96.3 %, sig 78.0 %	✔	✔
5.3d	bootstrap-CI coverage MC (per-term vs simultaneous)	both modes within ±10 % of nominal 95 % on top-ranked	per-term 63 %, simultaneous 100 %	✔	✔ closed by 0.1.0a20 simultaneous_ci=True
5.3e	`min_count` sweep on §5.3 keyness	Jaccard ≥ 0.6 across {10, 20, 50, 100}	Jaccard = 1.00 across sweep	✔	✔
5.3f	top-K-speaker leverage check	Jaccard ≥ 0.5 after excluding top-10 prolific speakers	Jaccard = 0.36	✔	✗ small-set leverage on "this corpus" structure
5.3g	speaker-clustered bootstrap (0.1.0a23 feature)	clustered CI wider than IID (ratio > 1)	mean widening 1.53×	✔	✔ IID was understating precision on nested data
5.4	`match()` CEM on year	L1 imbalance reduction > 0.05	0.13 → 0.00	✔	✔
5.5	`collocation_shift("asylum")` + `.explain()`	KWIC evidence retrievable	3 concordances tagged with party + house	✔	✔
5.5b	chamber-stratified collocation shift	Commons / Lords Jaccard ≥ 0.4	Jaccard = 0.00 — zero overlap on top-12 collocates	✔	✗ structural — see §5.5 caveat
5.6	`semantic_trajectory("asylum", SBERTEmbedder())`	2014 baseline distance rises, peaks 2022-2023	monotonic-ish; peak 2023 (d ≈ 0.009)	✔	✔
5.6b	quantified monotonic-trend (replaces "monotonic-ish")	Kendall τ > 0 AND post-2020 slope > 0	τ = 0.556 (p=0.029), slope +0.0008	✔	✔
5.6c	bootstrap stability of trajectory (HashEmbedder)	max CI width < max year-on-year change in median	ratio 2.93 — CI wider than y-y	✔	✗ HashEmbedder property; SBERT bootstrap is queued for future release
5.6d	leave-year-out SBERT trajectory	endpoint robust to dropping any single year	0.0 % leverage (per-period-independent by construction)	✔	✔ structural; weaker test than §5.8e — see prose
5.7	`TemporalTrajectory.burstiness()` on criminal	state ≥ 1 in 2016Q2-Q3 OR 2021Q3 OR 2023Q4	bursts in 2010Q3, 2021Q3-Q4, 2023Q4	✔	✔
5.7a	sensitivity to burst factor `s`	bursts stable across `s`	≥ 2 bursts at every `s`	✔	✔
5.7b	multi-term replication on migrant	bursts present	4 bursts; 2016Q1-Q2 aligns with referendum quarter	✔	✔
5.7c	permuted-time null on n_bursts (lower tail)	n_bursts in lower 5 % of null	observed 3, null median 9, p_lower = 0.000	✔	✔
5.8	PELT + `causal_impact` on illegal	PELT in 2016Q2-Q3 OR CrI excludes zero	PELT at penalty=1 picks 2016Q2; P(no effect) = 0.10	✔	✔ attribution heavily qualified by §5.8e — see prose
5.8a	PELT penalty sweep	location stable	2020Q1 robust; 2016Q2 emerges at low penalty	✔	✔
5.8b	`causal_impact` on migrant	agrees with §5.8	P(no effect) = 0.126	✔	✔
5.8c	placebo intervention-date sweep (eligible-only, redesigned 0.1.0a22)	median placebo P(no effect) > 2× real	median ratio = 6.22 (0.622 vs 0.100); 4/5 clean, 2019 still trips (0.092)	✔	✔ median passes; 2019 outlier reported, see prose
5.8d	donor-series check (2 controls)	both controls P(no effect) >> illegal	committee 0.804, fisheries 0.856 vs illegal 0.100; 8×	✔	✔
5.8e	leave-one-year-out for causal_impact	single-year leverage on P(no effect) < 0.30	max leverage = 0.846 (drop 2015 → 0.10 → 0.946)	✔	✗ severe; §5.8 is 2015-leveraged, not Brexit-leveraged
5.9	`lexical_diversity(freq="Y")`	no abrupt > 2σ year-on-year shifts	4-metric trajectory across 14 years, smooth	✔	✔
5.10	`forecast_trajectory(horizon=4)` on asylum	PI band widens with horizon	4-quarter ETS with widening 95 % PI	✔	✔
5.10b	rolling-origin backtest vs naive baseline	ETS beats naive RMSE; PI covers ≥ 50 %	ETS 0.000953 < naive 0.001313; 4 / 4 in 95 % PI	✔	✔
5.11	`keyness_multi(parties, Commons-only)`	SNP top-5 ≥ 2 Scotland-specific tokens	scotland (1 629), scottish (1 029), glasgow (979) — 3 of 3	✔	✔
5.12	`cooccurrence_network(top-60 PMI edges)`	≥ 3 visible community structures from greedy modularity	5 communities (humanitarian, Lords procedural, Home Office, legislation, policy/legal)	✔	✔
5.12b	modularity vs Erdős-Rényi null	observed > 95th pct	0.486 vs 0.402	✔	✔
5.12c	edge-threshold sensitivity	modularity > 0, Jaccard ≥ 0.7	mod 0.645 → 0.242; Jaccard 0.87 → 1.00	✔	✔
5.13	synthetic-signal injection + MDE	no false positive at rate 0; detect at realistic rate	MDE = 0.005 (14 docs); FP at rate 0 = False	✔	✔

	s	n_bursts	periods_at_state_ge_1	max_state_reached	first_burst_start
0	1.5	3	7	2	2012Q1
1	2.0	3	4	1	2010Q3
2	2.5	2	2	1	2010Q3
3	3.0	2	2	1	2010Q3

pycorpdiff asylum case study: lexicalising asylum in the UK Parliament, 2010-2023¶

How to read this notebook¶

0. Setup¶

0a. Reproducibility manifest¶

0b. Pre-registered expectations¶

0c. Cross-package validation: agreement with Rayson's LL Wizard¶

§5.0 Scope and corpus conditioning¶

5.1 Corpus ingestion + profile¶

5.1a Duplicate speech detection¶

5.1b Institutional-drift audit¶

5.2 Reference-baseline keyness¶

5.3 Two-corpus keyness with bootstrap confidence intervals¶

5.3g Speaker-clustered bootstrap CIs¶

5.3b Shuffled-label null distribution¶

5.3c Bootstrap CI / BH-significance alignment¶

5.3d Formal bootstrap-CI coverage under a known null¶

5.3e Sensitivity to min_count threshold¶

5.3f Top-K-speaker leverage check¶

5.4 Confound-adjusted keyness via Coarsened Exact Matching¶

5.5 Collocation shift on 'asylum'¶

5.5b Chamber-stratified collocation shift¶

5.6 Semantic shift of 'asylum' over time¶

5.6b Quantified monotonic-trend test¶

5.6c Bootstrap stability of the semantic trajectory (HashEmbedder)¶

5.6d Leave-year-out stability of the SBERT trajectory¶

5.7 Burstiness of 'criminal' in asylum debates¶

5.7a Sensitivity to burst factor s¶

5.7b Multi-term robustness: burstiness of 'migrant'¶

5.7c Permuted-time null distribution of n_bursts¶

5.8 Changepoint detection + causal impact around June 2016¶

5.8a Sensitivity to PELT penalty¶

5.8b Multi-term robustness: causal impact on 'migrant'¶

5.8c Placebo intervention-date sweep (eligible dates only)¶

5.8d Donor-series check on TWO non-migration control terms¶

5.8e Leave-one-year-out robustness for causal_impact¶

5.9 Lexical diversity over time¶

5.10 Forecasting trajectories¶

5.10b Rolling-origin forecast backtest with naive baseline¶

5.11 N-way keyness across four parties (Commons-only)¶

5.12 The asylum discourse as a co-occurrence network¶

5.12b Random-graph modularity baseline¶

5.12c Edge-threshold sensitivity¶

5.13 Synthetic-signal injection + minimum detectable effect¶

§6. Audit scoreboard¶

Reproducibility receipts¶

Limitations stated explicitly¶

What this notebook is not¶

6.5 Where this notebook (and pycorpdiff) hit their limits¶

5.3e Sensitivity to `min_count` threshold¶

5.7a Sensitivity to burst factor `s`¶

6.5 Where this notebook (and `pycorpdiff`) hit their limits¶