From e0550197e368aa78526aff8f933ec9243f18090c Mon Sep 17 00:00:00 2001 From: Peter Kleiweg Date: Thu, 18 Jun 2026 14:49:00 +0200 Subject: [PATCH] meer stijgers --- python/namen.html | 9123 ++++++++++++++++++++++++++++++++++++++++++++ python/stijgers.py | 101 + 2 files changed, 9224 insertions(+) create mode 100644 python/namen.html create mode 100755 python/stijgers.py diff --git a/python/namen.html b/python/namen.html new file mode 100644 index 0000000..7f3d79a --- /dev/null +++ b/python/namen.html @@ -0,0 +1,9123 @@ + + + + + +namen + + + + + + + + + + + + +
+
#!/net/corpora/nlnieuws/notebook/bin/python3
"""Report the words that rose or fell most between two word-count files.

Usage: stijgers.py OLD_COUNTS NEW_COUNTS

Both files contain one ``count<TAB>word`` entry per line.  OLD_COUNTS is the
reference period and NEW_COUNTS the recent period.  For every word a 2x2
contingency test (chi-squared and log-likelihood G²) is computed, the G²
p-values are FDR-corrected, and the 40 strongest risers ("STIJGERS") and
fallers ("DALERS") are printed.
"""

import sys

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Pseudo-count assigned to a word that occurs in only one of the two files,
# so that its relative frequency in the other file is never exactly zero.
MISSING_COUNT = 0.5

# Number of rows printed in each of the two tables.
TOP_N = 40


def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """Compare the frequency of one word between a recent and a reference corpus.

    Parameters
    ----------
    word : the word being tested (copied into the result unchanged)
    counts_recent : count of the word in the recent corpus
    counts_reference : count of the word in the reference corpus
    total_recent : total number of tokens in the recent corpus
    total_reference : total number of tokens in the reference corpus

    Returns
    -------
    dict with the keys ``word``, ``freq_recent``, ``freq_reference``,
    ``pct_diff``, ``log_ratio``, ``chi2``, ``p_chi2``, ``g2`` and ``p_g2``.
    ``pct_diff`` is ``inf`` when the word is absent from the reference corpus
    but present in the recent one.
    """
    a = counts_recent                # word in recent
    b = counts_reference             # word in reference
    c = total_recent - a             # non-word in recent
    d = total_reference - b          # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-squared (Pearson), without continuity correction ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected)).
    # correction=False is required: the default applies Yates' continuity
    # correction to 2x2 tables, which would shift the observed counts and
    # yield a value that no longer matches the formula above.
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference

    if freq_reference == 0:
        # A relative change from zero is undefined; report it as unbounded
        # growth (or no change when both frequencies are zero).
        pct_diff = float("inf") if freq_recent > 0 else 0.0
    else:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }


def _read_counts(path):
    """Read a ``count<TAB>word`` file and return a ``{word: count}`` dict."""
    counts = {}
    with open(path, "rt", encoding="utf-8") as fp:
        for line in fp:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split("\t")
            counts[fields[1].strip()] = int(fields[0])
    return counts


def _fill_missing(counts, other):
    """Give every word that only occurs in ``other`` a pseudo-count in ``counts``."""
    for key in other:
        if key not in counts:
            counts[key] = MISSING_COUNT


def main(argv=None):
    """Run the comparison for ``argv[1]`` (old counts) and ``argv[2]`` (new counts).

    Returns the process exit status: 0 on success, 2 on a usage error, 1 when
    the input files contain no words.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        prog = argv[0] if argv else "stijgers.py"
        print(f"usage: {prog} OLD_COUNTS NEW_COUNTS", file=sys.stderr)
        return 2

    # Imported here rather than at module level so that corpus_stats() can be
    # imported and reused without statsmodels being installed.
    from statsmodels.stats.multitest import multipletests

    # Original author's note: "why doesn't this work?"  These options only
    # influence the DataFrame repr; DataFrame.to_string() renders every row
    # and column it is given, so the output below is limited by slicing.
    pd.set_option('display.max_rows', 40)
    pd.set_option('display.max_columns', None)

    counts_reference = _read_counts(argv[1])
    counts_recent = _read_counts(argv[2])

    # Make both dicts cover the same vocabulary before computing totals.
    _fill_missing(counts_reference, counts_recent)
    _fill_missing(counts_recent, counts_reference)

    total_recent = sum(counts_recent.values())
    total_reference = sum(counts_reference.values())

    results = [
        corpus_stats(word, counts_recent[word], counts_reference[word],
                     total_recent, total_reference)
        for word in counts_recent]

    if not results:
        print("no words found in the input files", file=sys.stderr)
        return 1

    # FDR correction across all words
    p_values = [r["p_g2"] for r in results]
    _, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

    for r, p_adj in zip(results, p_adjusted):
        r["p_g2_adjusted"] = p_adj

    results = pd.DataFrame(results)

    print("STIJGERS")
    print(results[results.pct_diff > 0]
          .sort_values('g2', ascending=False)
          .head(TOP_N)
          .to_string())
    print("\nDALERS")
    print(results[results.pct_diff < 0]
          .sort_values('g2', ascending=False)
          .head(TOP_N)
          .to_string())
    return 0


if __name__ == "__main__":
    sys.exit(main())