100 lines
3.3 KiB
Python
Executable File
100 lines
3.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import json
|
|
import sys
|
|
import numpy as np
|
|
from scipy.stats import chi2_contingency
|
|
from statsmodels.stats.multitest import multipletests
|
|
|
|
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
|
|
"""
|
|
word : the word being tested
|
|
counts_recent : raw count in week 5
|
|
counts_reference : raw count in weeks 1-4
|
|
total_recent : total tokens in week 5
|
|
total_reference : total tokens in weeks 1-4
|
|
"""
|
|
a = counts_recent # word in recent
|
|
b = counts_reference # word in reference
|
|
c = total_recent - a # non-word in recent
|
|
d = total_reference - b # non-word in reference
|
|
|
|
contingency = np.array([[a, b],
|
|
[c, d]])
|
|
|
|
# --- Chi-Squared ---
|
|
chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
|
|
|
|
# --- Log-Likelihood (G²) ---
|
|
# G² = 2 * sum(observed * log(observed / expected))
|
|
# scipy's chi2_contingency with lambda_="log-likelihood" computes this
|
|
g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
|
|
|
|
# --- Effect sizes ---
|
|
freq_recent = a / total_recent
|
|
freq_reference = b / total_reference
|
|
|
|
pct_diff = (freq_recent - freq_reference) / freq_reference * 100
|
|
|
|
# Avoid log(0) with a small epsilon
|
|
eps = 1e-9
|
|
log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
|
|
|
|
return {
|
|
"word": word,
|
|
"freq_recent": freq_recent,
|
|
"freq_reference": freq_reference,
|
|
"pct_diff": pct_diff,
|
|
"log_ratio": log_ratio,
|
|
"chi2": chi2_stat,
|
|
"p_chi2": p_chi2,
|
|
"g2": g2_stat,
|
|
"p_g2": p_g2,
|
|
}
|
|
|
|
oudfile = sys.argv[1]
|
|
nieuwfile = sys.argv[2]
|
|
jsonfile = sys.argv[3]
|
|
|
|
counts_recent = {}
|
|
counts_reference = {}
|
|
|
|
with open(oudfile, "rt", encoding="utf-8") as fp:
|
|
for line in fp:
|
|
aa = line.split("\t")
|
|
counts_reference[aa[1].strip()] = int(aa[0])
|
|
with open(nieuwfile, "rt", encoding="utf-8") as fp:
|
|
for line in fp:
|
|
aa = line.split("\t")
|
|
counts_recent[aa[1].strip()] = int(aa[0])
|
|
|
|
for key in counts_recent:
|
|
if not key in counts_reference:
|
|
counts_reference[key] = 0.5
|
|
for key in counts_reference:
|
|
if not key in counts_recent:
|
|
counts_recent[key] = 0.5
|
|
|
|
total_recent = sum(counts_recent.values())
|
|
total_reference = sum(counts_reference.values())
|
|
|
|
results = [
|
|
corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
|
|
total_recent, total_reference)
|
|
for word in counts_recent]
|
|
|
|
# FDR correction across all words
|
|
p_values = [r["p_g2"] for r in results]
|
|
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
|
|
|
|
for r, p_adj in zip(results, p_adjusted):
|
|
r["p_g2_adjusted"] = p_adj
|
|
|
|
o = {}
|
|
#o['up'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
|
|
#o['dn'] = sorted([x for x in results if x['p_g2'] < .05 and x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
|
|
o['up'] = sorted([x for x in results if x['pct_diff'] > 0], key=lambda x: x['g2'], reverse=True)[:40]
|
|
o['dn'] = sorted([x for x in results if x['pct_diff'] < 0], key=lambda x: x['g2'], reverse=True)[:40]
|
|
with open(jsonfile, "wt", encoding="utf-8") as fp:
|
|
json.dump(o, fp)
|