In [1]:
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
In [2]:
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    word             : the word being tested
    counts_recent    : raw count in week 5
    counts_reference : raw count in weeks 1-4
    total_recent     : total tokens in week 5
    total_reference  : total tokens in weeks 1-4
    """
    a = counts_recent      # word in recent
    b = counts_reference   # word in reference
    c = total_recent - a   # non-word in recent
    d = total_reference - b  # non-word in reference

    contingency = np.array([[a, b],
                             [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent    = a / total_recent
    freq_reference = b / total_reference

    pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word":           word,
        "freq_recent":    freq_recent,
        "freq_reference": freq_reference,
        "pct_diff":       pct_diff,
        "log_ratio":      log_ratio,
        "chi2":           chi2_stat,
        "p_chi2":         p_chi2,
        "g2":             g2_stat,
        "p_g2":           p_g2,
    }
In [3]:
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
    for line in fp:
        aa = line.split("\t")
        counts_reference[aa[0]] = max(int(aa[1]), 0.5)
        counts_recent[aa[0]] = max(int(aa[2]), 0.5)
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())
In [4]:
results = [
    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                 total_recent, total_reference)
    for word in counts_recent]
In [5]:
# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj
In [6]:
results = pd.DataFrame(results)
In [7]:
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
In [8]:
# stijgers
results[results.pct_diff > 0].sort_values('g2', ascending=False)[:40]
Out[8]:
word freq_recent freq_reference pct_diff log_ratio chi2 p_chi2 g2 p_g2 p_g2_adjusted
4666 Ye 0.009191 0.001412 551.030639 2.702725 88.873013 4.209875e-21 73.611094 9.512852e-18 2.265010e-14
2233 Kanye West 0.006175 0.001150 436.822849 2.424445 51.860997 5.957208e-13 42.512795 7.022134e-11 1.114647e-07
2521 Lieke Marsman 0.002298 0.000026 8688.913621 6.457558 41.675092 1.077733e-10 34.046050 5.382305e-09 6.407634e-06
55 Ahmed Marcouch 0.003016 0.000261 1053.544913 3.527997 38.897401 4.466750e-10 30.774198 2.898665e-08 2.760689e-05
1343 Flavio Cobolli 0.002010 0.000026 7590.299418 6.264913 36.186456 1.793125e-09 29.023926 7.148986e-08 5.673912e-05
4362 Van Dissel 0.002585 0.000209 1135.940978 3.627532 34.197157 4.980167e-09 26.757903 2.306067e-07 1.546231e-04
2797 Marsman 0.001867 0.000026 7040.992317 6.157998 33.443641 7.335924e-09 26.527883 2.597616e-07 1.546231e-04
2503 Lewis Hamilton 0.004021 0.000784 412.686628 2.358076 32.500200 1.191803e-08 26.104423 3.234422e-07 1.711368e-04
673 Charles Leclerc 0.002872 0.000418 586.633877 2.779538 28.670638 8.579474e-08 22.442992 2.164739e-06 9.371352e-04
1858 Jaap van Dissel 0.002872 0.000418 586.633877 2.779538 28.670638 8.579474e-08 22.442992 2.164739e-06 9.371352e-04
4516 Vollering 0.001867 0.000105 1685.248079 4.158040 27.590589 1.499064e-07 21.070561 4.426770e-06 1.716105e-03
2923 Mette-Marit 0.004595 0.001307 251.556545 1.813756 25.318681 4.859791e-07 20.713510 5.333836e-06 1.716105e-03
1801 Ingrid Alexandra 0.002010 0.000157 1181.716570 3.679997 26.935070 2.104062e-07 20.564349 5.765997e-06 1.716105e-03
3466 Pols 0.002010 0.000157 1181.716570 3.679997 26.935070 2.104062e-07 20.564349 5.765997e-06 1.716105e-03
2717 Marcouch 0.002441 0.000314 678.185060 2.960109 26.234266 3.024086e-07 20.256468 6.772409e-06 1.897071e-03
1053 Donald Pols 0.002154 0.000261 723.960652 3.042571 23.871944 1.029615e-06 18.209227 1.979176e-05 5.236021e-03
3971 Sjoerdsma 0.001867 0.000209 792.624040 3.158047 21.542862 3.460080e-06 16.178300 5.765077e-05 1.372665e-02
2734 Marianne Thieme 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
2583 Lotte van Kruistum 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
3003 Mirra Andreeva 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
2351 Kluytmans 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
249 Antonia Niedermaier 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
1791 Ilse Kuijt 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
742 Cobolli 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
4724 Zoë Slagter 0.001149 0.000026 4294.456811 5.457559 19.755766 8.799584e-06 14.281240 1.574261e-04 2.404300e-02
278 Arjan Veurink 0.001580 0.000157 907.063019 3.332074 19.268029 1.135926e-05 14.174484 1.666147e-04 2.404300e-02
3098 Naomi Mestrum 0.001292 0.000105 1135.940978 3.627525 17.091366 3.562389e-05 12.221543 4.724085e-04 6.103910e-02
2665 Maja Chwalinska 0.001580 0.000209 655.297264 2.917039 16.686682 4.408944e-05 12.214228 4.742643e-04 6.103910e-02
4420 Van der Breggen 0.001436 0.000157 815.511836 3.194570 16.772235 4.214548e-05 12.131568 4.957543e-04 6.212584e-02
1731 Hitler 0.001005 0.000026 3745.149709 5.264914 17.027730 3.683788e-05 11.905641 5.596421e-04 6.345276e-02
2519 Lieke 0.001005 0.000026 3745.149709 5.264914 17.027730 3.683788e-05 11.905641 5.596421e-04 6.345276e-02
4176 Teunissen 0.001005 0.000026 3745.149709 5.264914 17.027730 3.683788e-05 11.905641 5.596421e-04 6.345276e-02
4464 Veurink 0.001005 0.000026 3745.149709 5.264914 17.027730 3.683788e-05 11.905641 5.596421e-04 6.345276e-02
968 Denzel Dumfries 0.002010 0.000418 380.643714 2.264965 15.359454 8.887510e-05 11.600092 6.594857e-04 7.303421e-02
2329 Kimi Antonelli 0.003447 0.001203 186.595009 1.519013 14.294065 1.563571e-04 11.539896 6.811844e-04 7.372272e-02
2465 Leclerc 0.001292 0.000157 723.960652 3.042567 14.318772 1.543183e-04 10.145781 1.446319e-03 1.306897e-01
1250 Esther Ouwehand 0.001292 0.000157 723.960652 3.042567 14.318772 1.543183e-04 10.145781 1.446319e-03 1.306897e-01
713 Christine Teunissen 0.001149 0.000105 998.614203 3.457600 14.529034 1.380160e-04 10.122801 1.464463e-03 1.306897e-01
1917 Jan Kluytmans 0.001149 0.000105 998.614203 3.457600 14.529034 1.380160e-04 10.122801 1.464463e-03 1.306897e-01
2124 Joost Luiten 0.001149 0.000105 998.614203 3.457600 14.529034 1.380160e-04 10.122801 1.464463e-03 1.306897e-01
In [9]:
# dalers
results[results.pct_diff < 0].sort_values('g2', ascending=False)[:40]
Out[9]:
word freq_recent freq_reference pct_diff log_ratio chi2 p_chi2 g2 p_g2 p_g2_adjusted
1743 Hondius 0.000431 0.010457 -95.880197 -4.601278 66.470517 3.551664e-16 98.275792 3.639886e-23 1.733314e-19
4215 Thymen Arensman 0.000072 0.002353 -96.948294 -5.034221 15.256919 9.383239e-05 20.686464 5.409706e-06 1.716105e-03
1508 Gidi Markuszower 0.000072 0.002091 -96.566831 -4.864296 13.434157 2.470840e-04 17.811640 2.438866e-05 6.112569e-03
4495 Vingegaard 0.000072 0.001882 -96.185367 -4.712293 11.977136 5.385731e-04 15.534557 8.101072e-05 1.837014e-02
2782 Markuszower 0.000072 0.001830 -96.076378 -4.671651 11.613072 6.548989e-04 14.969018 1.092909e-04 2.365652e-02
2109 Jonas Vingegaard 0.000431 0.002667 -83.843909 -2.629847 12.353905 4.400650e-04 14.707179 1.255674e-04 2.404300e-02
3750 Rubio 0.000287 0.002301 -87.515748 -3.001814 11.757125 6.061077e-04 14.212300 1.632996e-04 2.404300e-02
115 Ali B 0.000072 0.001725 -95.838583 -4.586762 10.885206 9.693541e-04 13.843023 1.987332e-04 2.783434e-02
512 Botic van de Zandschulp 0.000072 0.001621 -95.570104 -4.496564 10.157724 1.436979e-03 12.724502 3.608960e-04 4.910248e-02
921 De Jong 0.000574 0.002562 -77.579302 -2.157095 9.947317 1.610840e-03 11.220837 8.088400e-04 8.559325e-02
4034 Starmer 0.000574 0.002510 -77.112204 -2.127348 9.611842 1.933266e-03 10.778088 1.027087e-03 1.060152e-01
1054 Donald Trump 0.010196 0.015529 -34.342080 -0.606959 10.440052 1.233118e-03 10.743700 1.046349e-03 1.060152e-01
4169 Tedros Adhanom Ghebreyesus 0.000072 0.001412 -94.913823 -4.297255 8.704154 3.174857e-03 10.514021 1.184722e-03 1.175343e-01
269 Arensman 0.000072 0.001359 -94.718201 -4.242808 8.341110 3.875791e-03 9.967878 1.592950e-03 1.306897e-01
3185 Noam Bettan 0.000072 0.001307 -94.506929 -4.186224 7.978231 4.734318e-03 9.424725 2.140792e-03 1.396500e-01
3082 Máxima 0.002010 0.004497 -55.288957 -1.161296 8.263354 4.045333e-03 8.639908 3.288780e-03 2.088156e-01
1099 Dylan Groenewegen 0.000072 0.001203 -94.029271 -4.065930 7.253040 7.078107e-03 8.348388 3.860297e-03 2.418781e-01
4258 Tom Berendsen 0.000574 0.002196 -73.842519 -1.934703 7.626670 5.751139e-03 8.204369 4.178964e-03 2.456818e-01
4598 Willem-Alexander 0.001436 0.003555 -59.609772 -1.307921 7.692603 5.544761e-03 8.044226 4.564892e-03 2.587859e-01
3222 Oceanwide Expeditions 0.000072 0.001150 -93.757874 -4.001800 6.890771 8.664187e-03 7.815786 5.179183e-03 2.603795e-01
1411 Freek 0.000072 0.001150 -93.757874 -4.001800 6.890771 8.664187e-03 7.815786 5.179183e-03 2.603795e-01
2226 Kaja Kallas 0.000072 0.001098 -93.460630 -3.934686 6.528755 1.061442e-02 7.287350 6.944183e-03 2.603795e-01
4454 Verhoeven 0.000072 0.001098 -93.460630 -3.934686 6.528755 1.061442e-02 7.287350 6.944183e-03 2.603795e-01
1401 Fred Rutten 0.000072 0.001098 -93.460630 -3.934686 6.528755 1.061442e-02 7.287350 6.944183e-03 2.603795e-01
1981 Jerdy Schouten 0.000072 0.001046 -93.133661 -3.864296 6.167024 1.301534e-02 6.763485 9.304187e-03 3.370342e-01
2977 Mikel Arteta 0.000072 0.001046 -93.133661 -3.864296 6.167024 1.301534e-02 6.763485 9.304187e-03 3.370342e-01
3013 Modi 0.000072 0.001046 -93.133661 -3.864296 6.167024 1.301534e-02 6.763485 9.304187e-03 3.370342e-01
3237 Oleksandr Usyk 0.000072 0.001046 -93.133661 -3.864296 6.167024 1.301534e-02 6.763485 9.304187e-03 3.370342e-01
417 Berendsen 0.000718 0.002248 -68.063541 -1.646722 6.509375 1.073072e-02 6.756171 9.342400e-03 3.370342e-01
2016 Jetten 0.002441 0.004601 -46.941928 -0.914356 5.941062 1.479210e-02 5.969003 1.455951e-02 4.952314e-01
1310 Felix Gall 0.000072 0.000941 -92.370735 -3.712293 5.444587 1.962906e-02 5.731410 1.666410e-02 5.501480e-01
1167 Ellen ten Damme 0.000072 0.000941 -92.370735 -3.712293 5.444587 1.962906e-02 5.731410 1.666410e-02 5.501480e-01
4101 Suzan & Freek 0.000072 0.000941 -92.370735 -3.712293 5.444587 1.962906e-02 5.731410 1.666410e-02 5.501480e-01
4141 Tallon Griekspoor 0.000431 0.001621 -73.420624 -1.911618 5.554715 1.843098e-02 5.650035 1.745502e-02 5.501480e-01
4655 Xi Jinping 0.001149 0.002719 -57.745608 -1.242826 5.484134 1.918982e-02 5.488524 1.914170e-02 5.501480e-01
133 Amalia 0.000431 0.001569 -72.534645 -1.864313 5.230992 2.218788e-02 5.249197 2.195690e-02 5.501480e-01
3767 Rutten 0.000072 0.000889 -91.921954 -3.629831 5.083988 2.414777e-02 5.224377 2.227242e-02 5.501480e-01
2970 Mika Godts 0.000072 0.000889 -91.921954 -3.629831 5.083988 2.414777e-02 5.224377 2.227242e-02 5.501480e-01
537 Brekelmans 0.000072 0.000889 -91.921954 -3.629831 5.083988 2.414777e-02 5.224377 2.227242e-02 5.501480e-01
2712 Marco Rubio 0.000862 0.002248 -61.676249 -1.383688 5.236100 2.212281e-02 5.213944 2.240645e-02 5.501480e-01