In [1]:
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd

def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    word             : the word being tested
    counts_recent    : raw count in week 5
    counts_reference : raw count in weeks 1-4
    total_recent     : total tokens in week 5
    total_reference  : total tokens in weeks 1-4
    """
    a = counts_recent      # word in recent
    b = counts_reference   # word in reference
    c = total_recent - a   # non-word in recent
    d = total_reference - b  # non-word in reference

    contingency = np.array([[a, b],
                             [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent    = a / total_recent
    freq_reference = b / total_reference

    pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word":           word,
        "freq_recent":    freq_recent,
        "freq_reference": freq_reference,
        "pct_diff":       pct_diff,
        "log_ratio":      log_ratio,
        "chi2":           chi2_stat,
        "p_chi2":         p_chi2,
        "g2":             g2_stat,
        "p_g2":           p_g2,
    }
In [2]:
# Example data
counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}
counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())
In [3]:
# Run tests on whole vocabulary, including correction for multiple tests
# (false discovery rate).

results = [
    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                 total_recent, total_reference)
    for word in counts_recent]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj
In [4]:
results = pd.DataFrame(results)
results
Out[4]:
word freq_recent freq_reference pct_diff log_ratio chi2 p_chi2 g2 p_g2 p_g2_adjusted
0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 5.067080e-07 24.764140 6.479173e-07 0.000002
1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 4.392004e-01 0.474701 4.908322e-01 0.490832
2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 2.618028e-05 17.051468 3.638025e-05 0.000055
In [5]:
# Significant according to Chi2
results[results['p_chi2'] < 0.05]
Out[5]:
word freq_recent freq_reference pct_diff log_ratio chi2 p_chi2 g2 p_g2 p_g2_adjusted
0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 5.067080e-07 24.764140 6.479173e-07 0.000002
2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 2.618028e-05 17.051468 3.638025e-05 0.000055
In [6]:
# Significant according to G2 (LLR)
results[results['p_g2_adjusted'] < 0.05]
Out[6]:
word freq_recent freq_reference pct_diff log_ratio chi2 p_chi2 g2 p_g2 p_g2_adjusted
0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 5.067080e-07 24.764140 6.479173e-07 0.000002
2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 2.618028e-05 17.051468 3.638025e-05 0.000055
In [ ]: