#!/usr/bin/env python3 import sys import numpy as np from scipy.stats import chi2_contingency from statsmodels.stats.multitest import multipletests import pandas as pd # waarom werkt dit niet? pd.set_option('display.max_rows', 40) pd.set_option('display.max_columns', None) def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference): """ word : the word being tested counts_recent : raw count in week 5 counts_reference : raw count in weeks 1-4 total_recent : total tokens in week 5 total_reference : total tokens in weeks 1-4 """ a = counts_recent # word in recent b = counts_reference # word in reference c = total_recent - a # non-word in recent d = total_reference - b # non-word in reference contingency = np.array([[a, b], [c, d]]) # --- Chi-Squared --- chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False) # --- Log-Likelihood (G²) --- # G² = 2 * sum(observed * log(observed / expected)) # scipy's chi2_contingency with lambda_="log-likelihood" computes this g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood") # --- Effect sizes --- freq_recent = a / total_recent freq_reference = b / total_reference pct_diff = (freq_recent - freq_reference) / freq_reference * 100 # Avoid log(0) with a small epsilon eps = 1e-9 log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps)) return { "word": word, "freq_recent": freq_recent, "freq_reference": freq_reference, "pct_diff": pct_diff, "log_ratio": log_ratio, "chi2": chi2_stat, "p_chi2": p_chi2, "g2": g2_stat, "p_g2": p_g2, } oudfile = sys.argv[1] nieuwfile = sys.argv[2] textfile = sys.argv[3] counts_recent = {} counts_reference = {} with open(oudfile, "rt", encoding="utf-8") as fp: for line in fp: aa = line.split("\t") counts_reference[aa[1].strip()] = int(aa[0]) with open(nieuwfile, "rt", encoding="utf-8") as fp: for line in fp: aa = line.split("\t") counts_recent[aa[1].strip()] = int(aa[0]) for key in counts_recent: if not key in counts_reference: counts_reference[key] = 0.5 for key in counts_reference: if not key in counts_recent: counts_recent[key] = 0.5 total_recent = sum(counts_recent.values()) total_reference = sum(counts_reference.values()) results = [ corpus_stats(word, counts_recent[word], counts_reference.get(word, 0), total_recent, total_reference) for word in counts_recent] # FDR correction across all words p_values = [r["p_g2"] for r in results] _, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh") for r, p_adj in zip(results, p_adjusted): r["p_g2_adjusted"] = p_adj results = pd.DataFrame(results) with open(textfile, "wt", encoding="utf-8") as fp: fp.write("STIJGERS\n") fp.write(results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40].to_string()) fp.write("\n\nDALERS\n") fp.write(results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40].to_string()) fp.write("\n")