stijgers

2026-06-18 12:52:40 +02:00
parent a8bea0ab44
commit 01e6d48665
13 changed files with 15363 additions and 8 deletions
--- a/python/TODO.txt
+++ b/python/TODO.txt
@@ -0,0 +1 @@
+python: notebook en pakketten installeren
--- a/python/data.txt
+++ b/python/data.txt
--- a/python/namen.ipynb
+++ b/python/namen.ipynb
--- a/python/namen.py
+++ b/python/namen.py
@@ -0,0 +1,82 @@
+#!/net/corpora/nlnieuws/notebook/bin/python3
+
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+import pandas as pd
+
+# waarom werkt dit niet?
+pd.set_option('display.max_rows', 40)
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+    """
+    a = counts_recent      # word in recent
+    b = counts_reference   # word in reference
+    c = total_recent - a   # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+    contingency = np.array([[a, b],
+                             [c, d]])
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this
+    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent    = a / total_recent
+    freq_reference = b / total_reference
+
+    pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+counts_recent = {}
+counts_reference = {}
+with open("data.txt", "rt", encoding="utf-8") as fp:
+    for line in fp:
+        aa = line.split("\t")
+        counts_reference[aa[0]] = max(int(aa[1]), 0.5)
+        counts_recent[aa[0]] = max(int(aa[2]), 0.5)
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+results = [
+    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
+                 total_recent, total_reference)
+    for word in counts_recent]
+
+# FDR correction across all words
+p_values = [r["p_g2"] for r in results]
+_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
+
+for r, p_adj in zip(results, p_adjusted):
+    r["p_g2_adjusted"] = p_adj
+
+results = pd.DataFrame(results)
+print(results)
+print(results.sort_values('g2'))
+print(results.sort_values('pct_diff'))
--- a/python/score.txt
+++ b/python/score.txt
@@ -0,0 +1,30 @@
+Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
+relatieve frequenties te rapporteren:
+ *  %DIFF = (freq_B  - freq_A) / freq_A * 100
+    Percentage verschil in relatieve frequenties, makkelijk te interpreteren,
+    maar niet symmetrisch.
+ *  Log Ratio: log2(freq_B / freq_A)
+    Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
+    een halvering
+Twee populaire methodes om de significantie van frequentieverschillen te testen
+(ook wel keyword extraction):
+ *  Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
+    Vergelijkt observed vs expected frequency.
+ *  Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
+    bij sparse data, werkt niet goed met lage frequenties.
+Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
+zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
+elkaar gezet met een demonstratie van deze methodes:
+→ word_freq_comparison.ipynb
+
+
+Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
+implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
+is Burrows' Zeta populair, deze is bijv. beschikbaar in Stylo
+https://github.com/computationalstylistics/stylo onder de oppose()
+functie
+https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
+Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
+frequenties uit een achtergrondcorpus, de Fightin' Words methode van
+Monroe et al: https://github.com/jmhessel/FightingWords
+
--- a/python/word_freq_comparison.html
+++ b/python/word_freq_comparison.html
--- a/python/word_freq_comparison.ipynb
+++ b/python/word_freq_comparison.ipynb
@@ -0,0 +1,500 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 1341,
+     "status": "ok",
+     "timestamp": 1781100698726,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "YMifluhW2rZp"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from scipy.stats import chi2_contingency\n",
+    "from statsmodels.stats.multitest import multipletests\n",
+    "import pandas as pd\n",
+    "\n",
+    "def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
+    "    \"\"\"\n",
+    "    word             : the word being tested\n",
+    "    counts_recent    : raw count in week 5\n",
+    "    counts_reference : raw count in weeks 1-4\n",
+    "    total_recent     : total tokens in week 5\n",
+    "    total_reference  : total tokens in weeks 1-4\n",
+    "    \"\"\"\n",
+    "    a = counts_recent      # word in recent\n",
+    "    b = counts_reference   # word in reference\n",
+    "    c = total_recent - a   # non-word in recent\n",
+    "    d = total_reference - b  # non-word in reference\n",
+    "\n",
+    "    contingency = np.array([[a, b],\n",
+    "                             [c, d]])\n",
+    "\n",
+    "    # --- Chi-Squared ---\n",
+    "    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
+    "\n",
+    "    # --- Log-Likelihood (G²) ---\n",
+    "    # G² = 2 * sum(observed * log(observed / expected))\n",
+    "    # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
+    "    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
+    "\n",
+    "    # --- Effect sizes ---\n",
+    "    freq_recent    = a / total_recent\n",
+    "    freq_reference = b / total_reference\n",
+    "\n",
+    "    pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
+    "\n",
+    "    # Avoid log(0) with a small epsilon\n",
+    "    eps = 1e-9\n",
+    "    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
+    "\n",
+    "    return {\n",
+    "        \"word\":           word,\n",
+    "        \"freq_recent\":    freq_recent,\n",
+    "        \"freq_reference\": freq_reference,\n",
+    "        \"pct_diff\":       pct_diff,\n",
+    "        \"log_ratio\":      log_ratio,\n",
+    "        \"chi2\":           chi2_stat,\n",
+    "        \"p_chi2\":         p_chi2,\n",
+    "        \"g2\":             g2_stat,\n",
+    "        \"p_g2\":           p_g2,\n",
+    "    }\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 38,
+     "status": "ok",
+     "timestamp": 1781100880331,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "mHH718-222BM"
+   },
+   "outputs": [],
+   "source": [
+    "# Example data\n",
+    "counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
+    "counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
+    "total_recent = sum(counts_recent.values())\n",
+    "total_reference = sum(counts_reference.values())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 7,
+     "status": "ok",
+     "timestamp": 1781100881153,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "urBml1212wxb"
+   },
+   "outputs": [],
+   "source": [
+    "# Run tests on whole vocabulary, including correction for multiple tests\n",
+    "# (false discovery rate).\n",
+    "\n",
+    "results = [\n",
+    "    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n",
+    "                 total_recent, total_reference)\n",
+    "    for word in counts_recent]\n",
+    "\n",
+    "# FDR correction across all words\n",
+    "p_values = [r[\"p_g2\"] for r in results]\n",
+    "_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
+    "\n",
+    "for r, p_adj in zip(results, p_adjusted):\n",
+    "    r[\"p_g2_adjusted\"] = p_adj"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 163
+    },
+    "executionInfo": {
+     "elapsed": 12,
+     "status": "ok",
+     "timestamp": 1781100882491,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "-y3MUOgI3PFn",
+    "outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>freq_recent</th>\n",
+       "      <th>freq_reference</th>\n",
+       "      <th>pct_diff</th>\n",
+       "      <th>log_ratio</th>\n",
+       "      <th>chi2</th>\n",
+       "      <th>p_chi2</th>\n",
+       "      <th>g2</th>\n",
+       "      <th>p_g2</th>\n",
+       "      <th>p_g2_adjusted</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>eend</td>\n",
+       "      <td>0.424929</td>\n",
+       "      <td>0.241379</td>\n",
+       "      <td>76.042088</td>\n",
+       "      <td>0.815920</td>\n",
+       "      <td>25.238117</td>\n",
+       "      <td>5.067080e-07</td>\n",
+       "      <td>24.764140</td>\n",
+       "      <td>6.479173e-07</td>\n",
+       "      <td>0.000002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>tafel</td>\n",
+       "      <td>0.286119</td>\n",
+       "      <td>0.313480</td>\n",
+       "      <td>-8.728045</td>\n",
+       "      <td>-0.131756</td>\n",
+       "      <td>0.598371</td>\n",
+       "      <td>4.392004e-01</td>\n",
+       "      <td>0.474701</td>\n",
+       "      <td>4.908322e-01</td>\n",
+       "      <td>0.490832</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>fiets</td>\n",
+       "      <td>0.288952</td>\n",
+       "      <td>0.445141</td>\n",
+       "      <td>-35.087579</td>\n",
+       "      <td>-0.623434</td>\n",
+       "      <td>17.676782</td>\n",
+       "      <td>2.618028e-05</td>\n",
+       "      <td>17.051468</td>\n",
+       "      <td>3.638025e-05</td>\n",
+       "      <td>0.000055</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
+       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
+       "1  tafel     0.286119        0.313480  -8.728045  -0.131756   0.598371   \n",
+       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
+       "\n",
+       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
+       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
+       "1  4.392004e-01   0.474701  4.908322e-01       0.490832  \n",
+       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results = pd.DataFrame(results)\n",
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 132
+    },
+    "executionInfo": {
+     "elapsed": 65,
+     "status": "ok",
+     "timestamp": 1781100883685,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "nTpOtOka3ViF",
+    "outputId": "2430f959-eeb9-4670-da76-613406cbf473"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>freq_recent</th>\n",
+       "      <th>freq_reference</th>\n",
+       "      <th>pct_diff</th>\n",
+       "      <th>log_ratio</th>\n",
+       "      <th>chi2</th>\n",
+       "      <th>p_chi2</th>\n",
+       "      <th>g2</th>\n",
+       "      <th>p_g2</th>\n",
+       "      <th>p_g2_adjusted</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>eend</td>\n",
+       "      <td>0.424929</td>\n",
+       "      <td>0.241379</td>\n",
+       "      <td>76.042088</td>\n",
+       "      <td>0.815920</td>\n",
+       "      <td>25.238117</td>\n",
+       "      <td>5.067080e-07</td>\n",
+       "      <td>24.764140</td>\n",
+       "      <td>6.479173e-07</td>\n",
+       "      <td>0.000002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>fiets</td>\n",
+       "      <td>0.288952</td>\n",
+       "      <td>0.445141</td>\n",
+       "      <td>-35.087579</td>\n",
+       "      <td>-0.623434</td>\n",
+       "      <td>17.676782</td>\n",
+       "      <td>2.618028e-05</td>\n",
+       "      <td>17.051468</td>\n",
+       "      <td>3.638025e-05</td>\n",
+       "      <td>0.000055</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
+       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
+       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
+       "\n",
+       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
+       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
+       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Significant according to Chi2\n",
+    "results[results['p_chi2'] < 0.05]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 132
+    },
+    "executionInfo": {
+     "elapsed": 166,
+     "status": "ok",
+     "timestamp": 1781100928540,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "Mz4zAphE4dBY",
+    "outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>freq_recent</th>\n",
+       "      <th>freq_reference</th>\n",
+       "      <th>pct_diff</th>\n",
+       "      <th>log_ratio</th>\n",
+       "      <th>chi2</th>\n",
+       "      <th>p_chi2</th>\n",
+       "      <th>g2</th>\n",
+       "      <th>p_g2</th>\n",
+       "      <th>p_g2_adjusted</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>eend</td>\n",
+       "      <td>0.424929</td>\n",
+       "      <td>0.241379</td>\n",
+       "      <td>76.042088</td>\n",
+       "      <td>0.815920</td>\n",
+       "      <td>25.238117</td>\n",
+       "      <td>5.067080e-07</td>\n",
+       "      <td>24.764140</td>\n",
+       "      <td>6.479173e-07</td>\n",
+       "      <td>0.000002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>fiets</td>\n",
+       "      <td>0.288952</td>\n",
+       "      <td>0.445141</td>\n",
+       "      <td>-35.087579</td>\n",
+       "      <td>-0.623434</td>\n",
+       "      <td>17.676782</td>\n",
+       "      <td>2.618028e-05</td>\n",
+       "      <td>17.051468</td>\n",
+       "      <td>3.638025e-05</td>\n",
+       "      <td>0.000055</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
+       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
+       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
+       "\n",
+       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
+       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
+       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Significant according to G2 (LLR)\n",
+    "results[results['p_g2_adjusted'] < 0.05]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "JNCCUpdC4jK5"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/python/word_freq_comparison.ipynb.ori
+++ b/python/word_freq_comparison.ipynb.ori