In [1]:
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
"""
word : the word being tested
counts_recent : raw count in week 5
counts_reference : raw count in weeks 1-4
total_recent : total tokens in week 5
total_reference : total tokens in weeks 1-4
"""
a = counts_recent # word in recent
b = counts_reference # word in reference
c = total_recent - a # non-word in recent
d = total_reference - b # non-word in reference
contingency = np.array([[a, b],
[c, d]])
# --- Chi-Squared ---
chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
# --- Log-Likelihood (G²) ---
# G² = 2 * sum(observed * log(observed / expected))
# scipy's chi2_contingency with lambda_="log-likelihood" computes this
g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
# --- Effect sizes ---
freq_recent = a / total_recent
freq_reference = b / total_reference
pct_diff = (freq_recent - freq_reference) / freq_reference * 100
# Avoid log(0) with a small epsilon
eps = 1e-9
log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
return {
"word": word,
"freq_recent": freq_recent,
"freq_reference": freq_reference,
"pct_diff": pct_diff,
"log_ratio": log_ratio,
"chi2": chi2_stat,
"p_chi2": p_chi2,
"g2": g2_stat,
"p_g2": p_g2,
}
In [2]:
# Example data
counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}
counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())
In [3]:
# Run tests on whole vocabulary, including correction for multiple tests
# (false discovery rate).
results = [
corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
total_recent, total_reference)
for word in counts_recent]
# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
r["p_g2_adjusted"] = p_adj
In [4]:
results = pd.DataFrame(results)
results
Out[4]:
| word | freq_recent | freq_reference | pct_diff | log_ratio | chi2 | p_chi2 | g2 | p_g2 | p_g2_adjusted | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | eend | 0.424929 | 0.241379 | 76.042088 | 0.815920 | 25.238117 | 5.067080e-07 | 24.764140 | 6.479173e-07 | 0.000002 |
| 1 | tafel | 0.286119 | 0.313480 | -8.728045 | -0.131756 | 0.598371 | 4.392004e-01 | 0.474701 | 4.908322e-01 | 0.490832 |
| 2 | fiets | 0.288952 | 0.445141 | -35.087579 | -0.623434 | 17.676782 | 2.618028e-05 | 17.051468 | 3.638025e-05 | 0.000055 |
In [5]:
# Significant according to Chi2
results[results['p_chi2'] < 0.05]
Out[5]:
| word | freq_recent | freq_reference | pct_diff | log_ratio | chi2 | p_chi2 | g2 | p_g2 | p_g2_adjusted | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | eend | 0.424929 | 0.241379 | 76.042088 | 0.815920 | 25.238117 | 5.067080e-07 | 24.764140 | 6.479173e-07 | 0.000002 |
| 2 | fiets | 0.288952 | 0.445141 | -35.087579 | -0.623434 | 17.676782 | 2.618028e-05 | 17.051468 | 3.638025e-05 | 0.000055 |
In [6]:
# Significant according to G2 (LLR)
results[results['p_g2_adjusted'] < 0.05]
Out[6]:
| word | freq_recent | freq_reference | pct_diff | log_ratio | chi2 | p_chi2 | g2 | p_g2 | p_g2_adjusted | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | eend | 0.424929 | 0.241379 | 76.042088 | 0.815920 | 25.238117 | 5.067080e-07 | 24.764140 | 6.479173e-07 | 0.000002 |
| 2 | fiets | 0.288952 | 0.445141 | -35.087579 | -0.623434 | 17.676782 | 2.618028e-05 | 17.051468 | 3.638025e-05 | 0.000055 |
In [ ]: