+
+
+
+
+
+
+In [1]:
+
+
+
+
+
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+import pandas as pd
+
+
+
+
+
+
+
+In [2]:
+
+
+
+
+
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """Compare one word's frequency in the recent corpus against the reference corpus.

    Builds the 2x2 contingency table (word / non-word by recent / reference)
    and computes Pearson's chi-squared and the log-likelihood ratio (G²),
    both without continuity correction, plus two effect sizes.

    Parameters
    ----------
    word : the word being tested
    counts_recent : raw count in week 5
    counts_reference : raw count in weeks 1-4
    total_recent : total tokens in week 5
    total_reference : total tokens in weeks 1-4

    Returns
    -------
    dict with keys ``word``, ``freq_recent``, ``freq_reference``,
    ``pct_diff``, ``log_ratio``, ``chi2``, ``p_chi2``, ``g2``, ``p_g2``.
    ``pct_diff`` is ``inf`` when the word is absent from the reference
    corpus but present in the recent one, and ``nan`` when absent from both.
    """
    a = counts_recent                 # word in recent
    b = counts_reference              # word in reference
    c = total_recent - a              # non-word in recent
    d = total_reference - b           # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-squared (no Yates correction) ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected)).
    # correction=False is required: for a 2x2 table scipy otherwise applies
    # Yates' continuity correction to the observed counts before computing
    # the statistic, which would no longer match the formula above nor be
    # comparable with the uncorrected chi-squared.
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood"
    )

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference

    # Relative change in percent; guard the division so a word that is new
    # in the recent corpus does not raise ZeroDivisionError.
    if freq_reference > 0:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100
    elif freq_recent > 0:
        pct_diff = float("inf")
    else:
        pct_diff = float("nan")

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
+
+
+
+
+
+
+
+In [3]:
+
+
+
+
+
# Load per-word counts from a tab-separated file. As read below, each record
# is: word <TAB> reference count <TAB> recent count.
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
    for line in fp:
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise fail with IndexError on the count columns.
            continue
        fields = line.split("\t")
        word = fields[0]
        # Floor every count at 0.5 so no word ends up with a zero count
        # (presumably to keep the frequency ratios downstream finite).
        counts_reference[word] = max(int(fields[1]), 0.5)
        counts_recent[word] = max(int(fields[2]), 0.5)
# Corpus totals are the sums of the (floored) per-word counts.
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())
+
+
+
+
+
+
+
+In [4]:
+
+
+
+
+
# One statistics record per word present in the recent counts; a word missing
# from the reference counts is passed with a reference count of 0.
results = [
    corpus_stats(
        word,
        recent_count,
        counts_reference.get(word, 0),
        total_recent,
        total_reference,
    )
    for word, recent_count in counts_recent.items()
]
+
+
+
+
+
+
+
+In [5]:
+
+
+
+
+
# Benjamini-Hochberg FDR correction of the G² p-values across all words.
p_values = [row["p_g2"] for row in results]
# multipletests returns (reject, corrected p-values, ...); only the
# corrected p-values are needed here.
p_adjusted = multipletests(p_values, method="fdr_bh")[1]

# Attach each adjusted p-value to its word's record (same order as results).
for row, adjusted in zip(results, p_adjusted):
    row["p_g2_adjusted"] = adjusted
+
+
+
+
+
+
+
+In [6]:
+
+
+
+
+
# Turn the list of per-word records into a table, one row per word.
results = pd.DataFrame(data=results)
+
+
+
+
+
+
+
+In [7]:
+
+
+
+
+
+#pd.set_option('display.max_rows', None)
+#pd.set_option('display.max_columns', None)
+
+
+
+
+
+
+
+In [8]:
+
+
+
+
+
# Risers: words whose relative frequency increased, top 40 by G².
results.loc[results["pct_diff"] > 0].sort_values(by="g2", ascending=False).head(40)
+
+
+
+
+
+
+
+
+Out[8]:
+
+
+
+
+
+
+
+
+| + | word | +freq_recent | +freq_reference | +pct_diff | +log_ratio | +chi2 | +p_chi2 | +g2 | +p_g2 | +p_g2_adjusted | +
|---|---|---|---|---|---|---|---|---|---|---|
| 4666 | +Ye | +0.009191 | +0.001412 | +551.030639 | +2.702725 | +88.873013 | +4.209875e-21 | +73.611094 | +9.512852e-18 | +2.265010e-14 | +
| 2233 | +Kanye West | +0.006175 | +0.001150 | +436.822849 | +2.424445 | +51.860997 | +5.957208e-13 | +42.512795 | +7.022134e-11 | +1.114647e-07 | +
| 2521 | +Lieke Marsman | +0.002298 | +0.000026 | +8688.913621 | +6.457558 | +41.675092 | +1.077733e-10 | +34.046050 | +5.382305e-09 | +6.407634e-06 | +
| 55 | +Ahmed Marcouch | +0.003016 | +0.000261 | +1053.544913 | +3.527997 | +38.897401 | +4.466750e-10 | +30.774198 | +2.898665e-08 | +2.760689e-05 | +
| 1343 | +Flavio Cobolli | +0.002010 | +0.000026 | +7590.299418 | +6.264913 | +36.186456 | +1.793125e-09 | +29.023926 | +7.148986e-08 | +5.673912e-05 | +
| 4362 | +Van Dissel | +0.002585 | +0.000209 | +1135.940978 | +3.627532 | +34.197157 | +4.980167e-09 | +26.757903 | +2.306067e-07 | +1.546231e-04 | +
| 2797 | +Marsman | +0.001867 | +0.000026 | +7040.992317 | +6.157998 | +33.443641 | +7.335924e-09 | +26.527883 | +2.597616e-07 | +1.546231e-04 | +
| 2503 | +Lewis Hamilton | +0.004021 | +0.000784 | +412.686628 | +2.358076 | +32.500200 | +1.191803e-08 | +26.104423 | +3.234422e-07 | +1.711368e-04 | +
| 673 | +Charles Leclerc | +0.002872 | +0.000418 | +586.633877 | +2.779538 | +28.670638 | +8.579474e-08 | +22.442992 | +2.164739e-06 | +9.371352e-04 | +
| 1858 | +Jaap van Dissel | +0.002872 | +0.000418 | +586.633877 | +2.779538 | +28.670638 | +8.579474e-08 | +22.442992 | +2.164739e-06 | +9.371352e-04 | +
| 4516 | +Vollering | +0.001867 | +0.000105 | +1685.248079 | +4.158040 | +27.590589 | +1.499064e-07 | +21.070561 | +4.426770e-06 | +1.716105e-03 | +
| 2923 | +Mette-Marit | +0.004595 | +0.001307 | +251.556545 | +1.813756 | +25.318681 | +4.859791e-07 | +20.713510 | +5.333836e-06 | +1.716105e-03 | +
| 1801 | +Ingrid Alexandra | +0.002010 | +0.000157 | +1181.716570 | +3.679997 | +26.935070 | +2.104062e-07 | +20.564349 | +5.765997e-06 | +1.716105e-03 | +
| 3466 | +Pols | +0.002010 | +0.000157 | +1181.716570 | +3.679997 | +26.935070 | +2.104062e-07 | +20.564349 | +5.765997e-06 | +1.716105e-03 | +
| 2717 | +Marcouch | +0.002441 | +0.000314 | +678.185060 | +2.960109 | +26.234266 | +3.024086e-07 | +20.256468 | +6.772409e-06 | +1.897071e-03 | +
| 1053 | +Donald Pols | +0.002154 | +0.000261 | +723.960652 | +3.042571 | +23.871944 | +1.029615e-06 | +18.209227 | +1.979176e-05 | +5.236021e-03 | +
| 3971 | +Sjoerdsma | +0.001867 | +0.000209 | +792.624040 | +3.158047 | +21.542862 | +3.460080e-06 | +16.178300 | +5.765077e-05 | +1.372665e-02 | +
| 2734 | +Marianne Thieme | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 2583 | +Lotte van Kruistum | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 3003 | +Mirra Andreeva | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 2351 | +Kluytmans | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 249 | +Antonia Niedermaier | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 1791 | +Ilse Kuijt | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 742 | +Cobolli | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 4724 | +Zoë Slagter | +0.001149 | +0.000026 | +4294.456811 | +5.457559 | +19.755766 | +8.799584e-06 | +14.281240 | +1.574261e-04 | +2.404300e-02 | +
| 278 | +Arjan Veurink | +0.001580 | +0.000157 | +907.063019 | +3.332074 | +19.268029 | +1.135926e-05 | +14.174484 | +1.666147e-04 | +2.404300e-02 | +
| 3098 | +Naomi Mestrum | +0.001292 | +0.000105 | +1135.940978 | +3.627525 | +17.091366 | +3.562389e-05 | +12.221543 | +4.724085e-04 | +6.103910e-02 | +
| 2665 | +Maja Chwalinska | +0.001580 | +0.000209 | +655.297264 | +2.917039 | +16.686682 | +4.408944e-05 | +12.214228 | +4.742643e-04 | +6.103910e-02 | +
| 4420 | +Van der Breggen | +0.001436 | +0.000157 | +815.511836 | +3.194570 | +16.772235 | +4.214548e-05 | +12.131568 | +4.957543e-04 | +6.212584e-02 | +
| 1731 | +Hitler | +0.001005 | +0.000026 | +3745.149709 | +5.264914 | +17.027730 | +3.683788e-05 | +11.905641 | +5.596421e-04 | +6.345276e-02 | +
| 2519 | +Lieke | +0.001005 | +0.000026 | +3745.149709 | +5.264914 | +17.027730 | +3.683788e-05 | +11.905641 | +5.596421e-04 | +6.345276e-02 | +
| 4176 | +Teunissen | +0.001005 | +0.000026 | +3745.149709 | +5.264914 | +17.027730 | +3.683788e-05 | +11.905641 | +5.596421e-04 | +6.345276e-02 | +
| 4464 | +Veurink | +0.001005 | +0.000026 | +3745.149709 | +5.264914 | +17.027730 | +3.683788e-05 | +11.905641 | +5.596421e-04 | +6.345276e-02 | +
| 968 | +Denzel Dumfries | +0.002010 | +0.000418 | +380.643714 | +2.264965 | +15.359454 | +8.887510e-05 | +11.600092 | +6.594857e-04 | +7.303421e-02 | +
| 2329 | +Kimi Antonelli | +0.003447 | +0.001203 | +186.595009 | +1.519013 | +14.294065 | +1.563571e-04 | +11.539896 | +6.811844e-04 | +7.372272e-02 | +
| 2465 | +Leclerc | +0.001292 | +0.000157 | +723.960652 | +3.042567 | +14.318772 | +1.543183e-04 | +10.145781 | +1.446319e-03 | +1.306897e-01 | +
| 1250 | +Esther Ouwehand | +0.001292 | +0.000157 | +723.960652 | +3.042567 | +14.318772 | +1.543183e-04 | +10.145781 | +1.446319e-03 | +1.306897e-01 | +
| 713 | +Christine Teunissen | +0.001149 | +0.000105 | +998.614203 | +3.457600 | +14.529034 | +1.380160e-04 | +10.122801 | +1.464463e-03 | +1.306897e-01 | +
| 1917 | +Jan Kluytmans | +0.001149 | +0.000105 | +998.614203 | +3.457600 | +14.529034 | +1.380160e-04 | +10.122801 | +1.464463e-03 | +1.306897e-01 | +
| 2124 | +Joost Luiten | +0.001149 | +0.000105 | +998.614203 | +3.457600 | +14.529034 | +1.380160e-04 | +10.122801 | +1.464463e-03 | +1.306897e-01 | +
+
+
+
+
+
+
+
+In [9]:
+
+
+
+
+
# Fallers: words whose relative frequency decreased, top 40 by G².
results.loc[results["pct_diff"] < 0].sort_values(by="g2", ascending=False).head(40)
+
+
+
+
+
+
+
+
+Out[9]:
+
+
+
+
+
+
+
+
+| + | word | +freq_recent | +freq_reference | +pct_diff | +log_ratio | +chi2 | +p_chi2 | +g2 | +p_g2 | +p_g2_adjusted | +
|---|---|---|---|---|---|---|---|---|---|---|
| 1743 | +Hondius | +0.000431 | +0.010457 | +-95.880197 | +-4.601278 | +66.470517 | +3.551664e-16 | +98.275792 | +3.639886e-23 | +1.733314e-19 | +
| 4215 | +Thymen Arensman | +0.000072 | +0.002353 | +-96.948294 | +-5.034221 | +15.256919 | +9.383239e-05 | +20.686464 | +5.409706e-06 | +1.716105e-03 | +
| 1508 | +Gidi Markuszower | +0.000072 | +0.002091 | +-96.566831 | +-4.864296 | +13.434157 | +2.470840e-04 | +17.811640 | +2.438866e-05 | +6.112569e-03 | +
| 4495 | +Vingegaard | +0.000072 | +0.001882 | +-96.185367 | +-4.712293 | +11.977136 | +5.385731e-04 | +15.534557 | +8.101072e-05 | +1.837014e-02 | +
| 2782 | +Markuszower | +0.000072 | +0.001830 | +-96.076378 | +-4.671651 | +11.613072 | +6.548989e-04 | +14.969018 | +1.092909e-04 | +2.365652e-02 | +
| 2109 | +Jonas Vingegaard | +0.000431 | +0.002667 | +-83.843909 | +-2.629847 | +12.353905 | +4.400650e-04 | +14.707179 | +1.255674e-04 | +2.404300e-02 | +
| 3750 | +Rubio | +0.000287 | +0.002301 | +-87.515748 | +-3.001814 | +11.757125 | +6.061077e-04 | +14.212300 | +1.632996e-04 | +2.404300e-02 | +
| 115 | +Ali B | +0.000072 | +0.001725 | +-95.838583 | +-4.586762 | +10.885206 | +9.693541e-04 | +13.843023 | +1.987332e-04 | +2.783434e-02 | +
| 512 | +Botic van de Zandschulp | +0.000072 | +0.001621 | +-95.570104 | +-4.496564 | +10.157724 | +1.436979e-03 | +12.724502 | +3.608960e-04 | +4.910248e-02 | +
| 921 | +De Jong | +0.000574 | +0.002562 | +-77.579302 | +-2.157095 | +9.947317 | +1.610840e-03 | +11.220837 | +8.088400e-04 | +8.559325e-02 | +
| 4034 | +Starmer | +0.000574 | +0.002510 | +-77.112204 | +-2.127348 | +9.611842 | +1.933266e-03 | +10.778088 | +1.027087e-03 | +1.060152e-01 | +
| 1054 | +Donald Trump | +0.010196 | +0.015529 | +-34.342080 | +-0.606959 | +10.440052 | +1.233118e-03 | +10.743700 | +1.046349e-03 | +1.060152e-01 | +
| 4169 | +Tedros Adhanom Ghebreyesus | +0.000072 | +0.001412 | +-94.913823 | +-4.297255 | +8.704154 | +3.174857e-03 | +10.514021 | +1.184722e-03 | +1.175343e-01 | +
| 269 | +Arensman | +0.000072 | +0.001359 | +-94.718201 | +-4.242808 | +8.341110 | +3.875791e-03 | +9.967878 | +1.592950e-03 | +1.306897e-01 | +
| 3185 | +Noam Bettan | +0.000072 | +0.001307 | +-94.506929 | +-4.186224 | +7.978231 | +4.734318e-03 | +9.424725 | +2.140792e-03 | +1.396500e-01 | +
| 3082 | +Máxima | +0.002010 | +0.004497 | +-55.288957 | +-1.161296 | +8.263354 | +4.045333e-03 | +8.639908 | +3.288780e-03 | +2.088156e-01 | +
| 1099 | +Dylan Groenewegen | +0.000072 | +0.001203 | +-94.029271 | +-4.065930 | +7.253040 | +7.078107e-03 | +8.348388 | +3.860297e-03 | +2.418781e-01 | +
| 4258 | +Tom Berendsen | +0.000574 | +0.002196 | +-73.842519 | +-1.934703 | +7.626670 | +5.751139e-03 | +8.204369 | +4.178964e-03 | +2.456818e-01 | +
| 4598 | +Willem-Alexander | +0.001436 | +0.003555 | +-59.609772 | +-1.307921 | +7.692603 | +5.544761e-03 | +8.044226 | +4.564892e-03 | +2.587859e-01 | +
| 3222 | +Oceanwide Expeditions | +0.000072 | +0.001150 | +-93.757874 | +-4.001800 | +6.890771 | +8.664187e-03 | +7.815786 | +5.179183e-03 | +2.603795e-01 | +
| 1411 | +Freek | +0.000072 | +0.001150 | +-93.757874 | +-4.001800 | +6.890771 | +8.664187e-03 | +7.815786 | +5.179183e-03 | +2.603795e-01 | +
| 2226 | +Kaja Kallas | +0.000072 | +0.001098 | +-93.460630 | +-3.934686 | +6.528755 | +1.061442e-02 | +7.287350 | +6.944183e-03 | +2.603795e-01 | +
| 4454 | +Verhoeven | +0.000072 | +0.001098 | +-93.460630 | +-3.934686 | +6.528755 | +1.061442e-02 | +7.287350 | +6.944183e-03 | +2.603795e-01 | +
| 1401 | +Fred Rutten | +0.000072 | +0.001098 | +-93.460630 | +-3.934686 | +6.528755 | +1.061442e-02 | +7.287350 | +6.944183e-03 | +2.603795e-01 | +
| 1981 | +Jerdy Schouten | +0.000072 | +0.001046 | +-93.133661 | +-3.864296 | +6.167024 | +1.301534e-02 | +6.763485 | +9.304187e-03 | +3.370342e-01 | +
| 2977 | +Mikel Arteta | +0.000072 | +0.001046 | +-93.133661 | +-3.864296 | +6.167024 | +1.301534e-02 | +6.763485 | +9.304187e-03 | +3.370342e-01 | +
| 3013 | +Modi | +0.000072 | +0.001046 | +-93.133661 | +-3.864296 | +6.167024 | +1.301534e-02 | +6.763485 | +9.304187e-03 | +3.370342e-01 | +
| 3237 | +Oleksandr Usyk | +0.000072 | +0.001046 | +-93.133661 | +-3.864296 | +6.167024 | +1.301534e-02 | +6.763485 | +9.304187e-03 | +3.370342e-01 | +
| 417 | +Berendsen | +0.000718 | +0.002248 | +-68.063541 | +-1.646722 | +6.509375 | +1.073072e-02 | +6.756171 | +9.342400e-03 | +3.370342e-01 | +
| 2016 | +Jetten | +0.002441 | +0.004601 | +-46.941928 | +-0.914356 | +5.941062 | +1.479210e-02 | +5.969003 | +1.455951e-02 | +4.952314e-01 | +
| 1310 | +Felix Gall | +0.000072 | +0.000941 | +-92.370735 | +-3.712293 | +5.444587 | +1.962906e-02 | +5.731410 | +1.666410e-02 | +5.501480e-01 | +
| 1167 | +Ellen ten Damme | +0.000072 | +0.000941 | +-92.370735 | +-3.712293 | +5.444587 | +1.962906e-02 | +5.731410 | +1.666410e-02 | +5.501480e-01 | +
| 4101 | +Suzan & Freek | +0.000072 | +0.000941 | +-92.370735 | +-3.712293 | +5.444587 | +1.962906e-02 | +5.731410 | +1.666410e-02 | +5.501480e-01 | +
| 4141 | +Tallon Griekspoor | +0.000431 | +0.001621 | +-73.420624 | +-1.911618 | +5.554715 | +1.843098e-02 | +5.650035 | +1.745502e-02 | +5.501480e-01 | +
| 4655 | +Xi Jinping | +0.001149 | +0.002719 | +-57.745608 | +-1.242826 | +5.484134 | +1.918982e-02 | +5.488524 | +1.914170e-02 | +5.501480e-01 | +
| 133 | +Amalia | +0.000431 | +0.001569 | +-72.534645 | +-1.864313 | +5.230992 | +2.218788e-02 | +5.249197 | +2.195690e-02 | +5.501480e-01 | +
| 3767 | +Rutten | +0.000072 | +0.000889 | +-91.921954 | +-3.629831 | +5.083988 | +2.414777e-02 | +5.224377 | +2.227242e-02 | +5.501480e-01 | +
| 2970 | +Mika Godts | +0.000072 | +0.000889 | +-91.921954 | +-3.629831 | +5.083988 | +2.414777e-02 | +5.224377 | +2.227242e-02 | +5.501480e-01 | +
| 537 | +Brekelmans | +0.000072 | +0.000889 | +-91.921954 | +-3.629831 | +5.083988 | +2.414777e-02 | +5.224377 | +2.227242e-02 | +5.501480e-01 | +
| 2712 | +Marco Rubio | +0.000862 | +0.002248 | +-61.676249 | +-1.383688 | +5.236100 | +2.212281e-02 | +5.213944 | +2.240645e-02 | +5.501480e-01 | +