Files
nlnieuws/python/namen.py
Peter Kleiweg 01e6d48665 stijgers
2026-06-18 12:52:40 +02:00

83 lines
2.6 KiB
Python
Executable File

#!/net/corpora/nlnieuws/notebook/bin/python3
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
# waarom werkt dit niet?
pd.set_option('display.max_rows', 40)
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
"""
word : the word being tested
counts_recent : raw count in week 5
counts_reference : raw count in weeks 1-4
total_recent : total tokens in week 5
total_reference : total tokens in weeks 1-4
"""
a = counts_recent # word in recent
b = counts_reference # word in reference
c = total_recent - a # non-word in recent
d = total_reference - b # non-word in reference
contingency = np.array([[a, b],
[c, d]])
# --- Chi-Squared ---
chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
# --- Log-Likelihood (G²) ---
# G² = 2 * sum(observed * log(observed / expected))
# scipy's chi2_contingency with lambda_="log-likelihood" computes this
g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
# --- Effect sizes ---
freq_recent = a / total_recent
freq_reference = b / total_reference
pct_diff = (freq_recent - freq_reference) / freq_reference * 100
# Avoid log(0) with a small epsilon
eps = 1e-9
log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
return {
"word": word,
"freq_recent": freq_recent,
"freq_reference": freq_reference,
"pct_diff": pct_diff,
"log_ratio": log_ratio,
"chi2": chi2_stat,
"p_chi2": p_chi2,
"g2": g2_stat,
"p_g2": p_g2,
}
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
for line in fp:
aa = line.split("\t")
counts_reference[aa[0]] = max(int(aa[1]), 0.5)
counts_recent[aa[0]] = max(int(aa[2]), 0.5)
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())
results = [
corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
total_recent, total_reference)
for word in counts_recent]
# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
r["p_g2_adjusted"] = p_adj
results = pd.DataFrame(results)
print(results)
print(results.sort_values('g2'))
print(results.sort_values('pct_diff'))