meer stijgers

2026-06-18 14:49:00 +02:00
parent 01e6d48665
commit e0550197e3
2 changed files with 9224 additions and 0 deletions
--- a/python/namen.html
+++ b/python/namen.html
--- a/python/stijgers.py
+++ b/python/stijgers.py
@@ -0,0 +1,101 @@
+#!/net/corpora/nlnieuws/notebook/bin/python3
+
+import sys
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+import pandas as pd
+
+# Display limits for pandas output: at most 40 rows, no limit on columns.
+# The original note here asked why these settings seem to have no effect.
+# NOTE(review): probably because the tables in this script are rendered with
+# DataFrame.to_string(), which takes its own max_rows/max_cols arguments
+# instead of consulting the display.* options -- confirm.
+for option_name, option_value in (
+    ('display.max_rows', 40),
+    ('display.max_columns', None),
+):
+    pd.set_option(option_name, option_value)
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    Compare how often a word occurs in a recent corpus versus a reference corpus.
+
+    A 2x2 contingency table (word / non-word by recent / reference) is built
+    from the arguments; two association tests and two effect sizes are
+    derived from it.
+
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+
+    Returns a dict with the keys "word", "freq_recent", "freq_reference",
+    "pct_diff", "log_ratio", "chi2", "p_chi2", "g2" and "p_g2".
+    "pct_diff" is +inf when the word does not occur in the reference corpus.
+
+    Raises ValueError (from scipy) when the table has an expected frequency
+    of zero, e.g. when both counts are 0.
+    """
+    a = counts_recent        # word in recent
+    b = counts_reference     # word in reference
+    c = total_recent - a     # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+    contingency = np.array([[a, b],
+                            [c, d]])
+
+    # --- Chi-Squared (Pearson, no continuity correction) ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # correction=False is required here: scipy applies Yates' continuity
+    # correction to 2x2 tables by default, which would make the result
+    # deviate from the formula above and from the uncorrected chi-squared.
+    g2_stat, p_g2, _, _ = chi2_contingency(
+        contingency, correction=False, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent = a / total_recent
+    freq_reference = b / total_reference
+
+    if freq_reference:
+        pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+    else:
+        # The word is absent from the reference corpus, so the relative
+        # increase is unbounded. (The all-zero case never gets this far:
+        # chi2_contingency has already rejected the table.)
+        pct_diff = float("inf")
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+def _read_counts(path):
+    """Read a tab-separated "count<TAB>word" file into a {word: count} dict.
+
+    Blank lines are skipped. When a word occurs on more than one line, the
+    last count wins. Fields after the second column are ignored.
+
+    Raises ValueError naming the file and line number when a line has no
+    tab-separated word column or its count is not an integer.
+    """
+    counts = {}
+    with open(path, "rt", encoding="utf-8") as fp:
+        for lineno, line in enumerate(fp, start=1):
+            if not line.strip():
+                continue
+            fields = line.split("\t")
+            if len(fields) < 2:
+                raise ValueError(
+                    f"{path}:{lineno}: expected 'count<TAB>word', got {line!r}")
+            try:
+                count = int(fields[0])
+            except ValueError as err:
+                raise ValueError(
+                    f"{path}:{lineno}: invalid count {fields[0]!r}") from err
+            counts[fields[1].strip()] = count
+    return counts
+
+
+# Command line: first argument is the reference (older) counts file, second
+# argument is the recent (newer) counts file. Extra arguments are ignored.
+if len(sys.argv) < 3:
+    sys.exit(f"usage: {sys.argv[0]} REFERENCE_COUNTS RECENT_COUNTS")
+
+counts_reference = _read_counts(sys.argv[1])
+counts_recent = _read_counts(sys.argv[2])
+
+# Smoothing: a word seen in only one of the two files gets a pseudo-count of
+# 0.5 in the other, so every word has a non-zero count on both sides.
+for word in counts_recent:
+    counts_reference.setdefault(word, 0.5)
+for word in counts_reference:
+    counts_recent.setdefault(word, 0.5)
+
+# The totals are taken after smoothing, so they include the pseudo-counts.
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+# After smoothing both dicts hold the same set of words, so direct indexing
+# of counts_reference is safe.
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference[word],
+                 total_recent, total_reference)
+    for word in counts_recent]
+
+if not results:
+    sys.exit("no words found in the input files")
+
+# FDR correction across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+results = pd.DataFrame(results)
+
+# Report the 40 strongest risers and the 40 strongest fallers, ranked by G².
+print("STIJGERS")
+print(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
+print("\nDALERS")
+print(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())