From c2389c65af54989aecc6adf9a655179813405747 Mon Sep 17 00:00:00 2001
From: Peter Kleiweg
Date: Fri, 19 Jun 2026 17:53:03 +0200
Subject: [PATCH] stijgers2json.py

---
Review notes (this section is not part of the commit message):

* Line structure was restored from a flattened copy of this patch. The
  Python layout follows from the syntax and matches the hunk counts
  (99 added lines in the new file). In the collect.sh hunk the indentation
  and the positions of blank context lines are inferred from the hunk
  counts only -- verify against collect.sh before applying.
* stijgers.py now takes the output path as a third argument and the four
  print() calls are replaced by writes to that file; collect.sh is updated
  here, any other caller that reads its stdout is not.
* corpus_stats() in stijgers2json.py: the chi-squared call passes
  correction=False, the G2 call leaves `correction` at its default.
  Per the SciPy documentation the default applies Yates' continuity
  correction when there is one degree of freedom (a 2x2 table), so `g2`
  is not the plain 2 * sum(O * log(O / E)) described in the comment.
  Confirm which is intended, and keep stijgers.py in step.
* stijgers2json.py computes p_g2_adjusted but does not use it for
  selection; the "up"/"dn" lists are the 40 largest g2 values per
  direction, and the p-value filter is commented out.
* stijgers2json.py: counts_reference.get(word, 0) -- the default cannot be
  reached, because both dictionaries are filled to the same key set (0.5)
  just above.
* collect.sh runs stijgers.py and stijgers2json.py on the same two inputs,
  so the statistics are computed twice per category; the new script
  appears to duplicate the loading and scoring code of stijgers.py.
  Consider one script that writes both outputs.

 collect.sh              | 26 ++++++-----
 python/stijgers.py      | 11 +++--
 python/stijgers2json.py | 99 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+), 15 deletions(-)
 create mode 100755 python/stijgers2json.py

diff --git a/collect.sh b/collect.sh
index 6f5f3a1..1faeef9 100755
--- a/collect.sh
+++ b/collect.sh
@@ -181,21 +181,25 @@ do
     done
 
     # score
-    say "$part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14"
-    stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 > $part-allewoorden-$ds.score14
+    say $part-allewoorden-$ds.score14
+    stijgers.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14
+    stijgers2json.py $part-allewoorden-$ds1-4 $part-allewoorden-$ds-1 $part-allewoorden-$ds.score14.json
 
-    say "$part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14"
-    stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 > $part-locaties-$ds.score14
+    say $part-locaties-$ds.score14
+    stijgers.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14
+    stijgers2json.py $part-locaties-$ds1-4 $part-locaties-$ds-1 $part-locaties-$ds.score14.json
 
-    say "$part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14"
-    stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 > $part-personen-$ds.score14
+    say $part-personen-$ds.score14
+    stijgers.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14
+    stijgers2json.py $part-personen-$ds1-4 $part-personen-$ds-1 $part-personen-$ds.score14.json
 
-    say "$part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14"
-    stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 > $part-organisaties-$ds.score14
-
-    say "$part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14"
-    stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 > $part-overige-namen-$ds.score14
+    say $part-organisaties-$ds.score14
+    stijgers.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14
+    stijgers2json.py $part-organisaties-$ds1-4 $part-organisaties-$ds-1 $part-organisaties-$ds.score14.json
 
+    say $part-overige-namen-$ds.score14
+    stijgers.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14
+    stijgers2json.py $part-overige-namen-$ds1-4 $part-overige-namen-$ds-1 $part-overige-namen-$ds.score14.json
 
 done
 
diff --git a/python/stijgers.py b/python/stijgers.py
index efe5b18..b5eb62d 100755
--- a/python/stijgers.py
+++ b/python/stijgers.py
@@ -58,6 +58,7 @@ def corpus_stats(word, counts_recent, counts_reference, total_recent, total_refe
 
 oudfile = sys.argv[1]
 nieuwfile = sys.argv[2]
+textfile = sys.argv[3]
 
 counts_recent = {}
 counts_reference = {}
@@ -95,7 +96,9 @@ for r, p_adj in zip(results, p_adjusted):
 
 results = pd.DataFrame(results)
 
-print("STIJGERS")
-print(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
-print("\nDALERS")
-print(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())
+with open(textfile, "wt", encoding="utf-8") as fp:
+    fp.write("STIJGERS\n")
+    fp.write(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string())
+    fp.write("\n\nDALERS\n")
+    fp.write(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string())
+    fp.write("\n")
diff --git a/python/stijgers2json.py b/python/stijgers2json.py
new file mode 100755
index 0000000..44bfbd1
--- /dev/null
+++ b/python/stijgers2json.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+    """
+    a = counts_recent                  # word in recent
+    b = counts_reference               # word in reference
+    c = total_recent - a               # non-word in recent
+    d = total_reference - b            # non-word in reference
+
+    contingency = np.array([[a, b],
+                            [c, d]])
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent = a / total_recent
+    freq_reference = b / total_reference
+
+    pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word": word,
+        "freq_recent": freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff": pct_diff,
+        "log_ratio": log_ratio,
+        "chi2": chi2_stat,
+        "p_chi2": p_chi2,
+        "g2": g2_stat,
+        "p_g2": p_g2,
+    }
+
+oudfile = sys.argv[1]
+nieuwfile = sys.argv[2]
+jsonfile = sys.argv[3]
+
+counts_recent = {}
+counts_reference = {}
+
+with open(oudfile, "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_reference[aa[1].strip()] = int(aa[0])
+with open(nieuwfile, "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_recent[aa[1].strip()] = int(aa[0])
+
+for key in counts_recent:
+    if not key in counts_reference:
+        counts_reference[key] = 0.5
+for key in counts_reference:
+    if not key in counts_recent:
+        counts_recent[key] = 0.5
+
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
+                 total_recent, total_reference)
+    for word in counts_recent]
+
+# FDR correction across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+o = {}
+#o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
+#o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
+o['up'] = sorted([x for x in results if x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
+o['dn'] = sorted([x for x in results if x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
+with open(jsonfile, "wt", encoding="utf-8") as fp:
+    json.dump(o, fp)