From c2389c65af54989aecc6adf9a655179813405747 Mon Sep 17 00:00:00 2001
From: Peter Kleiweg <kleiweg@ziggo.nl>
Date: Fri, 19 Jun 2026 17:53:03 +0200
Subject: [PATCH] stijgers2json.py

---
 collect.sh              | 26 ++++++-----
 python/stijgers.py      | 11 +++--
 python/stijgers2json.py | 99 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+), 15 deletions(-)
 create mode 100755 python/stijgers2json.py

diff --git a/collect.sh b/collect.sh
index 6f5f3a1..1faeef9 100755
--- a/collect.sh
+++ b/collect.sh
@@ -181,21 +181,25 @@ do
     done
 
     # score
-    say "$part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14"
-    stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14
+    say $part-allewoorden-$ds.score14
+    stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14
+    stijgers2json.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14.json
 
-    say "$part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14"
-    stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14
+    say $part-locaties-$ds.score14
+    stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14
+    stijgers2json.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14.json
 
-    say "$part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14"
-    stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14
+    say $part-personen-$ds.score14
+    stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14
+    stijgers2json.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14.json
 
-    say "$part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14"
-    stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14
-
-    say "$part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14"
-    stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14
+    say $part-organisaties-$ds.score14
+    stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14
+    stijgers2json.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14.json
 
+    say $part-overige-namen-$ds.score14
+    stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14
+    stijgers2json.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14.json
 
 done
 
diff --git a/python/stijgers.py b/python/stijgers.py
index efe5b18..b5eb62d 100755
--- a/python/stijgers.py
+++ b/python/stijgers.py
@@ -58,6 +58,7 @@ def corpus_stats(word, counts_recent, counts_reference, total_recent, total_refe
 
 oudfile = sys.argv[1]
 nieuwfile = sys.argv[2]
+textfile = sys.argv[3]
 
 counts_recent = {}
 counts_reference = {}
@@ -95,7 +96,9 @@ for r, p_adj in zip(results, p_adjusted):
 
 results = pd.DataFrame(results)
 
-print("STIJGERS")
-print(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
-print("\nDALERS")
-print(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())
+with open(textfile, "wt", encoding="utf-8") as fp:
+    fp.write("STIJGERS\n")
+    fp.write(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
+    fp.write("\n\nDALERS\n")
+    fp.write(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())
+    fp.write("\n")
diff --git a/python/stijgers2json.py b/python/stijgers2json.py
new file mode 100755
index 0000000..44bfbd1
--- /dev/null
+++ b/python/stijgers2json.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """Compare one word's relative frequency in a recent and a reference corpus.
+
+    word             : the word being tested (copied into the result unchanged)
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+
+    Returns a dict with the word, both relative frequencies, pct_diff,
+    log_ratio, and the chi-squared and G² statistics with their p-values.
+    """
+    a = counts_recent        # word in recent
+    b = counts_reference     # word in reference
+    c = total_recent - a     # non-word in recent
+    d = total_reference - b  # non-word in reference
+    contingency = np.array([[a, b], [c, d]])
+
+    # Pearson chi-squared, without Yates' continuity correction.
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # Log-likelihood ratio (G²). NOTE(review): `correction` is left at its default
+    # here, unlike the chi-squared call above -- confirm that this is intended.
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+    freq_recent = a / total_recent
+    freq_reference = b / total_reference
+    # A word with zero reference frequency has no finite percentage change:
+    # report infinity rather than raising ZeroDivisionError.
+    pct_diff = ((freq_recent - freq_reference) / freq_reference * 100
+                if freq_reference else float("inf"))
+    eps = 1e-9  # keeps the log ratio finite when a frequency is zero
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+oudfile = sys.argv[1]    # input: counts that are read into counts_reference
+nieuwfile = sys.argv[2]  # input: counts that are read into counts_recent
+jsonfile = sys.argv[3]   # output: JSON file to write
+
+
+def _read_counts(filename):
+    """Read "count<TAB>word" lines from filename into a dict word -> count."""
+    counts = {}
+    with open(filename, "rt", encoding="utf-8") as fp:
+        for line in fp:
+            if line.strip():  # tolerate blank lines instead of raising IndexError
+                aa = line.split("\t")
+                counts[aa[1].strip()] = int(aa[0])
+    return counts
+
+
+counts_reference = _read_counts(oudfile)
+counts_recent = _read_counts(nieuwfile)
+
+# Give a word that is missing from one input a pseudo-count of 0.5 there, so
+# both dicts hold the same words and no count is exactly zero.
+for key in counts_recent:
+    counts_reference.setdefault(key, 0.5)
+for key in counts_reference:
+    counts_recent.setdefault(key, 0.5)
+
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+results = [corpus_stats(word, counts_recent[word], counts_reference[word],
+                        total_recent, total_reference)
+           for word in counts_recent]
+
+# FDR (Benjamini-Hochberg) correction of the G² p-values across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+# Top 40 risers ("up") and fallers ("dn") by G²; no p-value threshold is applied.
+o = {}
+o['up'] = sorted([x for x in results if x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
+o['dn'] = sorted([x for x in results if x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
+with open(jsonfile, "wt", encoding="utf-8") as fp:
+    json.dump(o, fp)