In [1]:
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
In [2]:
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
"""
word : the word being tested
counts_recent : raw count in week 5
counts_reference : raw count in weeks 1-4
total_recent : total tokens in week 5
total_reference : total tokens in weeks 1-4
"""
a = counts_recent # word in recent
b = counts_reference # word in reference
c = total_recent - a # non-word in recent
d = total_reference - b # non-word in reference
contingency = np.array([[a, b],
[c, d]])
# --- Chi-Squared ---
chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
# --- Log-Likelihood (G²) ---
# G² = 2 * sum(observed * log(observed / expected))
# scipy's chi2_contingency with lambda_="log-likelihood" computes this
g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
# --- Effect sizes ---
freq_recent = a / total_recent
freq_reference = b / total_reference
pct_diff = (freq_recent - freq_reference) / freq_reference * 100
# Avoid log(0) with a small epsilon
eps = 1e-9
log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
return {
"word": word,
"freq_recent": freq_recent,
"freq_reference": freq_reference,
"pct_diff": pct_diff,
"log_ratio": log_ratio,
"chi2": chi2_stat,
"p_chi2": p_chi2,
"g2": g2_stat,
"p_g2": p_g2,
}
In [3]:
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
for line in fp:
aa = line.split("\t")
counts_reference[aa[0]] = max(int(aa[1]), 0.5)
counts_recent[aa[0]] = max(int(aa[2]), 0.5)
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())
In [4]:
results = [
corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
total_recent, total_reference)
for word in counts_recent]
In [5]:
# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
r["p_g2_adjusted"] = p_adj
In [6]:
results = pd.DataFrame(results)
In [7]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
In [8]:
# stijgers
results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40]
Out[8]:
| word | freq_recent | freq_reference | pct_diff | log_ratio | chi2 | p_chi2 | g2 | p_g2 | p_g2_adjusted | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4666 | Ye | 0.009191 | 0.001412 | 551.030639 | 2.702725 | 88.873013 | 4.209875e-21 | 73.611094 | 9.512852e-18 | 2.265010e-14 |
| 2233 | Kanye West | 0.006175 | 0.001150 | 436.822849 | 2.424445 | 51.860997 | 5.957208e-13 | 42.512795 | 7.022134e-11 | 1.114647e-07 |
| 2521 | Lieke Marsman | 0.002298 | 0.000026 | 8688.913621 | 6.457558 | 41.675092 | 1.077733e-10 | 34.046050 | 5.382305e-09 | 6.407634e-06 |
| 55 | Ahmed Marcouch | 0.003016 | 0.000261 | 1053.544913 | 3.527997 | 38.897401 | 4.466750e-10 | 30.774198 | 2.898665e-08 | 2.760689e-05 |
| 1343 | Flavio Cobolli | 0.002010 | 0.000026 | 7590.299418 | 6.264913 | 36.186456 | 1.793125e-09 | 29.023926 | 7.148986e-08 | 5.673912e-05 |
| 4362 | Van Dissel | 0.002585 | 0.000209 | 1135.940978 | 3.627532 | 34.197157 | 4.980167e-09 | 26.757903 | 2.306067e-07 | 1.546231e-04 |
| 2797 | Marsman | 0.001867 | 0.000026 | 7040.992317 | 6.157998 | 33.443641 | 7.335924e-09 | 26.527883 | 2.597616e-07 | 1.546231e-04 |
| 2503 | Lewis Hamilton | 0.004021 | 0.000784 | 412.686628 | 2.358076 | 32.500200 | 1.191803e-08 | 26.104423 | 3.234422e-07 | 1.711368e-04 |
| 673 | Charles Leclerc | 0.002872 | 0.000418 | 586.633877 | 2.779538 | 28.670638 | 8.579474e-08 | 22.442992 | 2.164739e-06 | 9.371352e-04 |
| 1858 | Jaap van Dissel | 0.002872 | 0.000418 | 586.633877 | 2.779538 | 28.670638 | 8.579474e-08 | 22.442992 | 2.164739e-06 | 9.371352e-04 |
| 4516 | Vollering | 0.001867 | 0.000105 | 1685.248079 | 4.158040 | 27.590589 | 1.499064e-07 | 21.070561 | 4.426770e-06 | 1.716105e-03 |
| 2923 | Mette-Marit | 0.004595 | 0.001307 | 251.556545 | 1.813756 | 25.318681 | 4.859791e-07 | 20.713510 | 5.333836e-06 | 1.716105e-03 |
| 1801 | Ingrid Alexandra | 0.002010 | 0.000157 | 1181.716570 | 3.679997 | 26.935070 | 2.104062e-07 | 20.564349 | 5.765997e-06 | 1.716105e-03 |
| 3466 | Pols | 0.002010 | 0.000157 | 1181.716570 | 3.679997 | 26.935070 | 2.104062e-07 | 20.564349 | 5.765997e-06 | 1.716105e-03 |
| 2717 | Marcouch | 0.002441 | 0.000314 | 678.185060 | 2.960109 | 26.234266 | 3.024086e-07 | 20.256468 | 6.772409e-06 | 1.897071e-03 |
| 1053 | Donald Pols | 0.002154 | 0.000261 | 723.960652 | 3.042571 | 23.871944 | 1.029615e-06 | 18.209227 | 1.979176e-05 | 5.236021e-03 |
| 3971 | Sjoerdsma | 0.001867 | 0.000209 | 792.624040 | 3.158047 | 21.542862 | 3.460080e-06 | 16.178300 | 5.765077e-05 | 1.372665e-02 |
| 2734 | Marianne Thieme | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 2583 | Lotte van Kruistum | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 3003 | Mirra Andreeva | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 2351 | Kluytmans | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 249 | Antonia Niedermaier | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 1791 | Ilse Kuijt | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 742 | Cobolli | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 4724 | Zoë Slagter | 0.001149 | 0.000026 | 4294.456811 | 5.457559 | 19.755766 | 8.799584e-06 | 14.281240 | 1.574261e-04 | 2.404300e-02 |
| 278 | Arjan Veurink | 0.001580 | 0.000157 | 907.063019 | 3.332074 | 19.268029 | 1.135926e-05 | 14.174484 | 1.666147e-04 | 2.404300e-02 |
| 3098 | Naomi Mestrum | 0.001292 | 0.000105 | 1135.940978 | 3.627525 | 17.091366 | 3.562389e-05 | 12.221543 | 4.724085e-04 | 6.103910e-02 |
| 2665 | Maja Chwalinska | 0.001580 | 0.000209 | 655.297264 | 2.917039 | 16.686682 | 4.408944e-05 | 12.214228 | 4.742643e-04 | 6.103910e-02 |
| 4420 | Van der Breggen | 0.001436 | 0.000157 | 815.511836 | 3.194570 | 16.772235 | 4.214548e-05 | 12.131568 | 4.957543e-04 | 6.212584e-02 |
| 1731 | Hitler | 0.001005 | 0.000026 | 3745.149709 | 5.264914 | 17.027730 | 3.683788e-05 | 11.905641 | 5.596421e-04 | 6.345276e-02 |
| 2519 | Lieke | 0.001005 | 0.000026 | 3745.149709 | 5.264914 | 17.027730 | 3.683788e-05 | 11.905641 | 5.596421e-04 | 6.345276e-02 |
| 4176 | Teunissen | 0.001005 | 0.000026 | 3745.149709 | 5.264914 | 17.027730 | 3.683788e-05 | 11.905641 | 5.596421e-04 | 6.345276e-02 |
| 4464 | Veurink | 0.001005 | 0.000026 | 3745.149709 | 5.264914 | 17.027730 | 3.683788e-05 | 11.905641 | 5.596421e-04 | 6.345276e-02 |
| 968 | Denzel Dumfries | 0.002010 | 0.000418 | 380.643714 | 2.264965 | 15.359454 | 8.887510e-05 | 11.600092 | 6.594857e-04 | 7.303421e-02 |
| 2329 | Kimi Antonelli | 0.003447 | 0.001203 | 186.595009 | 1.519013 | 14.294065 | 1.563571e-04 | 11.539896 | 6.811844e-04 | 7.372272e-02 |
| 2465 | Leclerc | 0.001292 | 0.000157 | 723.960652 | 3.042567 | 14.318772 | 1.543183e-04 | 10.145781 | 1.446319e-03 | 1.306897e-01 |
| 1250 | Esther Ouwehand | 0.001292 | 0.000157 | 723.960652 | 3.042567 | 14.318772 | 1.543183e-04 | 10.145781 | 1.446319e-03 | 1.306897e-01 |
| 713 | Christine Teunissen | 0.001149 | 0.000105 | 998.614203 | 3.457600 | 14.529034 | 1.380160e-04 | 10.122801 | 1.464463e-03 | 1.306897e-01 |
| 1917 | Jan Kluytmans | 0.001149 | 0.000105 | 998.614203 | 3.457600 | 14.529034 | 1.380160e-04 | 10.122801 | 1.464463e-03 | 1.306897e-01 |
| 2124 | Joost Luiten | 0.001149 | 0.000105 | 998.614203 | 3.457600 | 14.529034 | 1.380160e-04 | 10.122801 | 1.464463e-03 | 1.306897e-01 |
In [9]:
# dalers
results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40]
Out[9]:
| word | freq_recent | freq_reference | pct_diff | log_ratio | chi2 | p_chi2 | g2 | p_g2 | p_g2_adjusted | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1743 | Hondius | 0.000431 | 0.010457 | -95.880197 | -4.601278 | 66.470517 | 3.551664e-16 | 98.275792 | 3.639886e-23 | 1.733314e-19 |
| 4215 | Thymen Arensman | 0.000072 | 0.002353 | -96.948294 | -5.034221 | 15.256919 | 9.383239e-05 | 20.686464 | 5.409706e-06 | 1.716105e-03 |
| 1508 | Gidi Markuszower | 0.000072 | 0.002091 | -96.566831 | -4.864296 | 13.434157 | 2.470840e-04 | 17.811640 | 2.438866e-05 | 6.112569e-03 |
| 4495 | Vingegaard | 0.000072 | 0.001882 | -96.185367 | -4.712293 | 11.977136 | 5.385731e-04 | 15.534557 | 8.101072e-05 | 1.837014e-02 |
| 2782 | Markuszower | 0.000072 | 0.001830 | -96.076378 | -4.671651 | 11.613072 | 6.548989e-04 | 14.969018 | 1.092909e-04 | 2.365652e-02 |
| 2109 | Jonas Vingegaard | 0.000431 | 0.002667 | -83.843909 | -2.629847 | 12.353905 | 4.400650e-04 | 14.707179 | 1.255674e-04 | 2.404300e-02 |
| 3750 | Rubio | 0.000287 | 0.002301 | -87.515748 | -3.001814 | 11.757125 | 6.061077e-04 | 14.212300 | 1.632996e-04 | 2.404300e-02 |
| 115 | Ali B | 0.000072 | 0.001725 | -95.838583 | -4.586762 | 10.885206 | 9.693541e-04 | 13.843023 | 1.987332e-04 | 2.783434e-02 |
| 512 | Botic van de Zandschulp | 0.000072 | 0.001621 | -95.570104 | -4.496564 | 10.157724 | 1.436979e-03 | 12.724502 | 3.608960e-04 | 4.910248e-02 |
| 921 | De Jong | 0.000574 | 0.002562 | -77.579302 | -2.157095 | 9.947317 | 1.610840e-03 | 11.220837 | 8.088400e-04 | 8.559325e-02 |
| 4034 | Starmer | 0.000574 | 0.002510 | -77.112204 | -2.127348 | 9.611842 | 1.933266e-03 | 10.778088 | 1.027087e-03 | 1.060152e-01 |
| 1054 | Donald Trump | 0.010196 | 0.015529 | -34.342080 | -0.606959 | 10.440052 | 1.233118e-03 | 10.743700 | 1.046349e-03 | 1.060152e-01 |
| 4169 | Tedros Adhanom Ghebreyesus | 0.000072 | 0.001412 | -94.913823 | -4.297255 | 8.704154 | 3.174857e-03 | 10.514021 | 1.184722e-03 | 1.175343e-01 |
| 269 | Arensman | 0.000072 | 0.001359 | -94.718201 | -4.242808 | 8.341110 | 3.875791e-03 | 9.967878 | 1.592950e-03 | 1.306897e-01 |
| 3185 | Noam Bettan | 0.000072 | 0.001307 | -94.506929 | -4.186224 | 7.978231 | 4.734318e-03 | 9.424725 | 2.140792e-03 | 1.396500e-01 |
| 3082 | Máxima | 0.002010 | 0.004497 | -55.288957 | -1.161296 | 8.263354 | 4.045333e-03 | 8.639908 | 3.288780e-03 | 2.088156e-01 |
| 1099 | Dylan Groenewegen | 0.000072 | 0.001203 | -94.029271 | -4.065930 | 7.253040 | 7.078107e-03 | 8.348388 | 3.860297e-03 | 2.418781e-01 |
| 4258 | Tom Berendsen | 0.000574 | 0.002196 | -73.842519 | -1.934703 | 7.626670 | 5.751139e-03 | 8.204369 | 4.178964e-03 | 2.456818e-01 |
| 4598 | Willem-Alexander | 0.001436 | 0.003555 | -59.609772 | -1.307921 | 7.692603 | 5.544761e-03 | 8.044226 | 4.564892e-03 | 2.587859e-01 |
| 3222 | Oceanwide Expeditions | 0.000072 | 0.001150 | -93.757874 | -4.001800 | 6.890771 | 8.664187e-03 | 7.815786 | 5.179183e-03 | 2.603795e-01 |
| 1411 | Freek | 0.000072 | 0.001150 | -93.757874 | -4.001800 | 6.890771 | 8.664187e-03 | 7.815786 | 5.179183e-03 | 2.603795e-01 |
| 2226 | Kaja Kallas | 0.000072 | 0.001098 | -93.460630 | -3.934686 | 6.528755 | 1.061442e-02 | 7.287350 | 6.944183e-03 | 2.603795e-01 |
| 4454 | Verhoeven | 0.000072 | 0.001098 | -93.460630 | -3.934686 | 6.528755 | 1.061442e-02 | 7.287350 | 6.944183e-03 | 2.603795e-01 |
| 1401 | Fred Rutten | 0.000072 | 0.001098 | -93.460630 | -3.934686 | 6.528755 | 1.061442e-02 | 7.287350 | 6.944183e-03 | 2.603795e-01 |
| 1981 | Jerdy Schouten | 0.000072 | 0.001046 | -93.133661 | -3.864296 | 6.167024 | 1.301534e-02 | 6.763485 | 9.304187e-03 | 3.370342e-01 |
| 2977 | Mikel Arteta | 0.000072 | 0.001046 | -93.133661 | -3.864296 | 6.167024 | 1.301534e-02 | 6.763485 | 9.304187e-03 | 3.370342e-01 |
| 3013 | Modi | 0.000072 | 0.001046 | -93.133661 | -3.864296 | 6.167024 | 1.301534e-02 | 6.763485 | 9.304187e-03 | 3.370342e-01 |
| 3237 | Oleksandr Usyk | 0.000072 | 0.001046 | -93.133661 | -3.864296 | 6.167024 | 1.301534e-02 | 6.763485 | 9.304187e-03 | 3.370342e-01 |
| 417 | Berendsen | 0.000718 | 0.002248 | -68.063541 | -1.646722 | 6.509375 | 1.073072e-02 | 6.756171 | 9.342400e-03 | 3.370342e-01 |
| 2016 | Jetten | 0.002441 | 0.004601 | -46.941928 | -0.914356 | 5.941062 | 1.479210e-02 | 5.969003 | 1.455951e-02 | 4.952314e-01 |
| 1310 | Felix Gall | 0.000072 | 0.000941 | -92.370735 | -3.712293 | 5.444587 | 1.962906e-02 | 5.731410 | 1.666410e-02 | 5.501480e-01 |
| 1167 | Ellen ten Damme | 0.000072 | 0.000941 | -92.370735 | -3.712293 | 5.444587 | 1.962906e-02 | 5.731410 | 1.666410e-02 | 5.501480e-01 |
| 4101 | Suzan & Freek | 0.000072 | 0.000941 | -92.370735 | -3.712293 | 5.444587 | 1.962906e-02 | 5.731410 | 1.666410e-02 | 5.501480e-01 |
| 4141 | Tallon Griekspoor | 0.000431 | 0.001621 | -73.420624 | -1.911618 | 5.554715 | 1.843098e-02 | 5.650035 | 1.745502e-02 | 5.501480e-01 |
| 4655 | Xi Jinping | 0.001149 | 0.002719 | -57.745608 | -1.242826 | 5.484134 | 1.918982e-02 | 5.488524 | 1.914170e-02 | 5.501480e-01 |
| 133 | Amalia | 0.000431 | 0.001569 | -72.534645 | -1.864313 | 5.230992 | 2.218788e-02 | 5.249197 | 2.195690e-02 | 5.501480e-01 |
| 3767 | Rutten | 0.000072 | 0.000889 | -91.921954 | -3.629831 | 5.083988 | 2.414777e-02 | 5.224377 | 2.227242e-02 | 5.501480e-01 |
| 2970 | Mika Godts | 0.000072 | 0.000889 | -91.921954 | -3.629831 | 5.083988 | 2.414777e-02 | 5.224377 | 2.227242e-02 | 5.501480e-01 |
| 537 | Brekelmans | 0.000072 | 0.000889 | -91.921954 | -3.629831 | 5.083988 | 2.414777e-02 | 5.224377 | 2.227242e-02 | 5.501480e-01 |
| 2712 | Marco Rubio | 0.000862 | 0.002248 | -61.676249 | -1.383688 | 5.236100 | 2.212281e-02 | 5.213944 | 2.240645e-02 | 5.501480e-01 |