stijgers

2026-06-18 12:52:40 +02:00
parent a8bea0ab44
commit 01e6d48665
13 changed files with 15363 additions and 8 deletions
--- a/RO/cmd/ro/ro.go
+++ b/RO/cmd/ro/ro.go
@@ -29,8 +29,9 @@ type ItemT struct {
 }

 var (
-	p     = e.PanicErr
-	agent = "AhrefsBot/7.0"
+	// p is a short alias for e.PanicErr.
+	p = e.PanicErr
+	// agent used to be the bot identifier kept in the commented-out line
+	// below; it now holds a desktop Chrome-style browser string.
+	// NOTE(review): presumably sent as the HTTP User-Agent header; confirm
+	// at the call site.
+	//agent = "AhrefsBot/7.0"
+	agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36"
 )

 func exists(filename string) bool {
--- a/collect.sh
+++ b/collect.sh
@@ -163,14 +163,57 @@ do
            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
            > $part-nieuwe-adjww-extra-$ds-$i

-        # ranglijsten
+        # kale tellingen

-        say $part-rang-$ds-$i
+        say $part-count-word-$ds-$i
        alto \
-            'fp://node[((@pt="n" or @neclass) and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass])]' \
-            'tt:%w\t%I' $files \
-            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
-            > $part-rang-$ds-$i
+            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+            | sed -e 's/\t.*//' | uniq -c \
+            | grep -v '^ *1 ' \
+            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+            > $part-count-word-$ds-$i
+
+        say $part-count-loc-$ds-$i
+        alto \
+            'fp://node[(@neclass="LOC"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+            | sed -e 's/\t.*//' | uniq -c \
+            | grep -v '^ *1 ' \
+            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+            > $part-count-loc-$ds-$i
+
+        say $part-count-per-$ds-$i
+        alto \
+            'fp://node[(@neclass="PER"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+            | sed -e 's/\t.*//' | uniq -c \
+            | grep -v '^ *1 ' \
+            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+            > $part-count-per-$ds-$i
+
+        say $part-count-org-$ds-$i
+        alto \
+            'fp://node[(@neclass="ORG"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+            | sed -e 's/\t.*//' | uniq -c \
+            | grep -v '^ *1 ' \
+            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+            > $part-count-org-$ds-$i
+
+        say $part-count-misc-$ds-$i
+        alto \
+            'fp://node[(@neclass="MISC"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
+            | sed -e 's/\t.*//' | uniq -c \
+            | grep -v '^ *1 ' \
+            | sed -e 's/\([0-9]\)  */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
+            > $part-count-misc-$ds-$i

    done
 done
--- a/oud/rang/collect-rang.sh
+++ b/oud/rang/collect-rang.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+
+# Build per-week ranking files ($part-rang-<class>-<week>-<i>) for ordinary
+# words and for the named-entity classes LOC, PER, ORG and MISC.
+#
+# Usage: collect-rang.sh [-v] [YYYY.WW]
+#   -v       print the name of each output file before it is generated
+#   YYYY.WW  week to process; defaults to the ISO year/week of 7 days ago
+#
+# Relies on the external commands week2files, alto and rang, which are looked
+# up through the PATH set below and are not defined in this script.
+
+# Abort as soon as any command fails.
+set -e
+
+# Keep CDPATH from influencing the cd further down.
+unset CDPATH
+PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
+export TZ=Europe/Amsterdam
+
+# Optional leading -v switches on verbose output.
+verbose=0
+if [ "$1" = "-v" ]
+then
+    shift
+    verbose=1
+fi
+
+# Print the arguments, but only in verbose mode.
+say () {
+    if [ "$verbose" = "1" ]
+    then
+        echo "$*"
+    fi
+}
+
+# Determine the week stamp ds (ISO year, a dot, ISO week number).
+if [ "$1" = "" ]
+then
+    # No argument: use the ISO year and week of the date seven days ago.
+    ds=`date -d -7days +%G.%V`
+else
+    # An explicit argument must look like 2NNN.WW with WW in 00-59.
+    case "$1" in
+        2[0-9][0-9][0-9].[0-5][0-9])
+            ds=$1
+            ;;
+        *)
+            echo INVALID
+            exit 1
+            ;;
+    esac
+fi
+
+# Everything before the first dot of ds is the year.
+year=${ds%%.*}
+
+# Make sure the per-year output directories exist; output files are written
+# relative to the data directory of that year.
+mkdir -p /net/corpora/nlnieuws/data/$year
+mkdir -p /net/corpora/nlnieuws/data/json/$year
+cd /net/corpora/nlnieuws/data/$year
+
+# parts maps an output prefix to an extended regular expression that selects
+# input files by path. Only "algemeen" is enabled; the other selections are
+# kept below as commented-out entries.
+declare -A parts
+#parts[alles]='.'
+parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
+#parts[amsterdam]='AT5|BuurtAdam|Parool'
+#parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
+#parts[literatuur]='LitNL|Tzum'
+#parts[vlaanderen]='HLN|VRT'
+#parts[AT5]='AT5'
+#parts[BuurtAdam]='BuurtAdam'
+#parts[BuurtGrn]='BuurtGrn'
+#parts[GG]='GG'
+#parts[HLN]='HLN'
+#parts[LitNL]='LitNL'
+#parts[NOS]='NOS'
+#parts[NU]='NU'
+#parts[NieuwsNL]='NieuwsNL'
+#parts[Oog]='Oog'
+#parts[Parool]='Parool'
+#parts[RO]='RO'
+#parts[RTVNoord]='RTVNoord'
+#parts[Sargasso]='Sargasso'
+#parts[Sikkom]='Sikkom'
+#parts[Tzum]='Tzum'
+#parts[Volkskrant]='Volkskrant'
+#parts[VRT]='VRT'
+
+for part in ${!parts[@]}
+do
+    regex=${parts[$part]}
+
+    # NOTE(review): i is passed to week2files and ends up in the output file
+    # name; presumably it is the length in weeks of the window ending at $ds
+    # (1 week and 4 weeks) -- confirm against week2files.
+    for i in 1 4
+    do
+        # List everything under the paths reported by week2files and keep the
+        # entries matching this part's regex. "|| true" stops set -e from
+        # aborting the script when the pipeline fails, e.g. when grep matches
+        # nothing.
+        files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
+        if [ -z "$files" ]
+        then
+            continue
+        fi
+
+        # Each section below runs alto with an XPath filter ('fp:') and the
+        # output template 'tt:%l\t%I', strips a trailing ".<digits>" from every
+        # line, removes duplicate lines and pipes the result through rang.
+        # NOTE(review): %l/%I presumably stand for lemma and sentence id, so
+        # that after the sed step each lemma is listed once per document --
+        # confirm against the alto documentation.
+
+        # Ordinary words: nodes with a @pt other than "let" that are not part
+        # of a multi-word unit and carry no @neclass, plus multi-word units
+        # without any @neclass node inside.
+        say $part-rang-word-$ds-$i
+        alto \
+            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-word-$ds-$i
+
+        # Named entities of class LOC: single nodes with that @neclass that
+        # are not multi-word-unit parts, plus multi-word units containing one.
+        say $part-rang-loc-$ds-$i
+        alto \
+            'fp://node[(@neclass="LOC"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-loc-$ds-$i
+
+        # Same selection for class PER.
+        say $part-rang-per-$ds-$i
+        alto \
+            'fp://node[(@neclass="PER"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-per-$ds-$i
+
+        # Same selection for class ORG.
+        say $part-rang-org-$ds-$i
+        alto \
+            'fp://node[(@neclass="ORG"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-org-$ds-$i
+
+        # Same selection for class MISC.
+        say $part-rang-misc-$ds-$i
+        alto \
+            'fp://node[(@neclass="MISC"  and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
+            'tt:%l\t%I' $files \
+            | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
+            > $part-rang-misc-$ds-$i
+
+    done
+done
--- a/python/TODO.txt
+++ b/python/TODO.txt
@@ -0,0 +1 @@
+python: notebook en pakketten installeren
--- a/python/data.txt
+++ b/python/data.txt
--- a/python/namen.ipynb
+++ b/python/namen.ipynb
--- a/python/namen.py
+++ b/python/namen.py
@@ -0,0 +1,82 @@
+#!/net/corpora/nlnieuws/notebook/bin/python3
+
+import numpy as np
+from scipy.stats import chi2_contingency
+from statsmodels.stats.multitest import multipletests
+import pandas as pd
+
+# Setting display.max_rows alone has no visible effect: as soon as a frame has
+# more rows than max_rows, pandas truncates its repr to display.min_rows
+# (default 10). Raise both options to actually get 40 rows printed.
+pd.set_option('display.max_rows', 40)
+pd.set_option('display.min_rows', 40)
+
+def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
+    """
+    Compare the frequency of one word in a recent and a reference corpus.
+
+    word             : the word being tested
+    counts_recent    : raw count in week 5
+    counts_reference : raw count in weeks 1-4
+    total_recent     : total tokens in week 5
+    total_reference  : total tokens in weeks 1-4
+
+    Returns a dict with the word, both relative frequencies, two effect
+    sizes (pct_diff, log_ratio) and the statistic and p-value of the
+    chi-squared test (chi2, p_chi2) and of the log-likelihood ratio test
+    (g2, p_g2). Both tests are computed without continuity correction.
+
+    pct_diff is +inf when the word does not occur in the reference corpus.
+    scipy raises ValueError when the word occurs in neither corpus, because
+    the expected frequencies then contain a zero.
+    """
+    a = counts_recent      # word in recent
+    b = counts_reference   # word in reference
+    c = total_recent - a   # non-word in recent
+    d = total_reference - b  # non-word in reference
+
+    contingency = np.array([[a, b],
+                             [c, d]])
+
+    # --- Chi-Squared ---
+    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)
+
+    # --- Log-Likelihood (G²) ---
+    # G² = 2 * sum(observed * log(observed / expected))
+    # scipy's chi2_contingency with lambda_="log-likelihood" computes this,
+    # but only with correction=False: the default applies Yates' continuity
+    # correction to 2x2 tables, which changes the statistic.
+    g2_stat, p_g2, _, _ = chi2_contingency(
+        contingency, correction=False, lambda_="log-likelihood")
+
+    # --- Effect sizes ---
+    freq_recent    = a / total_recent
+    freq_reference = b / total_reference
+
+    if freq_reference == 0:
+        # Word is absent from the reference corpus: the relative increase
+        # is unbounded, so report infinity instead of dividing by zero.
+        pct_diff = float("inf")
+    else:
+        pct_diff = (freq_recent - freq_reference) / freq_reference * 100
+
+    # Avoid log(0) with a small epsilon
+    eps = 1e-9
+    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))
+
+    return {
+        "word":           word,
+        "freq_recent":    freq_recent,
+        "freq_reference": freq_reference,
+        "pct_diff":       pct_diff,
+        "log_ratio":      log_ratio,
+        "chi2":           chi2_stat,
+        "p_chi2":         p_chi2,
+        "g2":             g2_stat,
+        "p_g2":           p_g2,
+    }
+
+# Load the per-word counts from data.txt: one tab-separated record per line
+# with the word in field 0, the reference count in field 1 and the recent
+# count in field 2. Every count is raised to at least 0.5.
+counts_reference = {}
+counts_recent = {}
+with open("data.txt", "rt", encoding="utf-8") as handle:
+    for record in handle:
+        fields = record.split("\t")
+        key = fields[0]
+        counts_reference[key] = max(int(fields[1]), 0.5)
+        counts_recent[key] = max(int(fields[2]), 0.5)
+
+# The corpus sizes are the sums of the counts that were just loaded.
+total_recent = sum(counts_recent.values())
+total_reference = sum(counts_reference.values())
+
+# Score every word of the recent corpus against the reference corpus.
+results = []
+for word, recent_count in counts_recent.items():
+    reference_count = counts_reference.get(word, 0)
+    results.append(
+        corpus_stats(word, recent_count, reference_count,
+                     total_recent, total_reference))
+
+# FDR correction of the G2 p-values across all words
+p_adjusted = multipletests([entry["p_g2"] for entry in results],
+                           method="fdr_bh")[1]
+for entry, p_adj in zip(results, p_adjusted):
+    entry["p_g2_adjusted"] = p_adj
+
+# Print the table in input order, then sorted by G2 and by %DIFF.
+results = pd.DataFrame(results)
+for column in (None, 'g2', 'pct_diff'):
+    print(results if column is None else results.sort_values(column))
--- a/python/score.txt
+++ b/python/score.txt
@@ -0,0 +1,30 @@
+Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
+relatieve frequenties te rapporteren:
+ *  %DIFF = (freq_B - freq_A) / freq_A * 100
+    Procentueel verschil in relatieve frequenties (B ten opzichte van A),
+    makkelijk te interpreteren, maar niet symmetrisch.
+ *  Log Ratio = log2(freq_B / freq_A)
+    Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling,
+    -1 een halvering (van B ten opzichte van A).
+Twee populaire methodes om de significantie van frequentieverschillen te testen
+(ook wel keyword extraction genoemd):
+ *  Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
+    Vergelijkt observed vs expected frequency.
+ *  Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
+    bij sparse data, werkt niet goed met lage frequenties.
+Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
+zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
+elkaar gezet met een demonstratie van deze methodes:
+→ Word freq comparison.ipynb
+
+
+Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
+implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
+is Burrows's Zeta populair; deze is bijv. beschikbaar in Stylo
+https://github.com/computationalstylistics/stylo onder de oppose()
+functie
+https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
+Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
+frequenties uit een achtergrondcorpus, de Fightin' Words methode van
+Monroe et al: https://github.com/jmhessel/FightingWords
+
--- a/python/word_freq_comparison.html
+++ b/python/word_freq_comparison.html
--- a/python/word_freq_comparison.ipynb
+++ b/python/word_freq_comparison.ipynb
@@ -0,0 +1,500 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 1341,
+     "status": "ok",
+     "timestamp": 1781100698726,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "YMifluhW2rZp"
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from scipy.stats import chi2_contingency\n",
+    "from statsmodels.stats.multitest import multipletests\n",
+    "import pandas as pd\n",
+    "\n",
+    "def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
+    "    \"\"\"\n",
+    "    word             : the word being tested\n",
+    "    counts_recent    : raw count in week 5\n",
+    "    counts_reference : raw count in weeks 1-4\n",
+    "    total_recent     : total tokens in week 5\n",
+    "    total_reference  : total tokens in weeks 1-4\n",
+    "    \"\"\"\n",
+    "    a = counts_recent      # word in recent\n",
+    "    b = counts_reference   # word in reference\n",
+    "    c = total_recent - a   # non-word in recent\n",
+    "    d = total_reference - b  # non-word in reference\n",
+    "\n",
+    "    contingency = np.array([[a, b],\n",
+    "                             [c, d]])\n",
+    "\n",
+    "    # --- Chi-Squared ---\n",
+    "    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
+    "\n",
+    "    # --- Log-Likelihood (G²) ---\n",
+    "    # G² = 2 * sum(observed * log(observed / expected))\n",
+    "    # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
+    "    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
+    "\n",
+    "    # --- Effect sizes ---\n",
+    "    freq_recent    = a / total_recent\n",
+    "    freq_reference = b / total_reference\n",
+    "\n",
+    "    pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
+    "\n",
+    "    # Avoid log(0) with a small epsilon\n",
+    "    eps = 1e-9\n",
+    "    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
+    "\n",
+    "    return {\n",
+    "        \"word\":           word,\n",
+    "        \"freq_recent\":    freq_recent,\n",
+    "        \"freq_reference\": freq_reference,\n",
+    "        \"pct_diff\":       pct_diff,\n",
+    "        \"log_ratio\":      log_ratio,\n",
+    "        \"chi2\":           chi2_stat,\n",
+    "        \"p_chi2\":         p_chi2,\n",
+    "        \"g2\":             g2_stat,\n",
+    "        \"p_g2\":           p_g2,\n",
+    "    }\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 38,
+     "status": "ok",
+     "timestamp": 1781100880331,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "mHH718-222BM"
+   },
+   "outputs": [],
+   "source": [
+    "# Example data\n",
+    "counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
+    "counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
+    "total_recent = sum(counts_recent.values())\n",
+    "total_reference = sum(counts_reference.values())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "executionInfo": {
+     "elapsed": 7,
+     "status": "ok",
+     "timestamp": 1781100881153,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "urBml1212wxb"
+   },
+   "outputs": [],
+   "source": [
+    "# Run tests on whole vocabulary, including correction for multiple tests\n",
+    "# (false discovery rate).\n",
+    "\n",
+    "results = [\n",
+    "    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n",
+    "                 total_recent, total_reference)\n",
+    "    for word in counts_recent]\n",
+    "\n",
+    "# FDR correction across all words\n",
+    "p_values = [r[\"p_g2\"] for r in results]\n",
+    "_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
+    "\n",
+    "for r, p_adj in zip(results, p_adjusted):\n",
+    "    r[\"p_g2_adjusted\"] = p_adj"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 163
+    },
+    "executionInfo": {
+     "elapsed": 12,
+     "status": "ok",
+     "timestamp": 1781100882491,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "-y3MUOgI3PFn",
+    "outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>freq_recent</th>\n",
+       "      <th>freq_reference</th>\n",
+       "      <th>pct_diff</th>\n",
+       "      <th>log_ratio</th>\n",
+       "      <th>chi2</th>\n",
+       "      <th>p_chi2</th>\n",
+       "      <th>g2</th>\n",
+       "      <th>p_g2</th>\n",
+       "      <th>p_g2_adjusted</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>eend</td>\n",
+       "      <td>0.424929</td>\n",
+       "      <td>0.241379</td>\n",
+       "      <td>76.042088</td>\n",
+       "      <td>0.815920</td>\n",
+       "      <td>25.238117</td>\n",
+       "      <td>5.067080e-07</td>\n",
+       "      <td>24.764140</td>\n",
+       "      <td>6.479173e-07</td>\n",
+       "      <td>0.000002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>tafel</td>\n",
+       "      <td>0.286119</td>\n",
+       "      <td>0.313480</td>\n",
+       "      <td>-8.728045</td>\n",
+       "      <td>-0.131756</td>\n",
+       "      <td>0.598371</td>\n",
+       "      <td>4.392004e-01</td>\n",
+       "      <td>0.474701</td>\n",
+       "      <td>4.908322e-01</td>\n",
+       "      <td>0.490832</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>fiets</td>\n",
+       "      <td>0.288952</td>\n",
+       "      <td>0.445141</td>\n",
+       "      <td>-35.087579</td>\n",
+       "      <td>-0.623434</td>\n",
+       "      <td>17.676782</td>\n",
+       "      <td>2.618028e-05</td>\n",
+       "      <td>17.051468</td>\n",
+       "      <td>3.638025e-05</td>\n",
+       "      <td>0.000055</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
+       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
+       "1  tafel     0.286119        0.313480  -8.728045  -0.131756   0.598371   \n",
+       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
+       "\n",
+       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
+       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
+       "1  4.392004e-01   0.474701  4.908322e-01       0.490832  \n",
+       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "results = pd.DataFrame(results)\n",
+    "results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 132
+    },
+    "executionInfo": {
+     "elapsed": 65,
+     "status": "ok",
+     "timestamp": 1781100883685,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "nTpOtOka3ViF",
+    "outputId": "2430f959-eeb9-4670-da76-613406cbf473"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>freq_recent</th>\n",
+       "      <th>freq_reference</th>\n",
+       "      <th>pct_diff</th>\n",
+       "      <th>log_ratio</th>\n",
+       "      <th>chi2</th>\n",
+       "      <th>p_chi2</th>\n",
+       "      <th>g2</th>\n",
+       "      <th>p_g2</th>\n",
+       "      <th>p_g2_adjusted</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>eend</td>\n",
+       "      <td>0.424929</td>\n",
+       "      <td>0.241379</td>\n",
+       "      <td>76.042088</td>\n",
+       "      <td>0.815920</td>\n",
+       "      <td>25.238117</td>\n",
+       "      <td>5.067080e-07</td>\n",
+       "      <td>24.764140</td>\n",
+       "      <td>6.479173e-07</td>\n",
+       "      <td>0.000002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>fiets</td>\n",
+       "      <td>0.288952</td>\n",
+       "      <td>0.445141</td>\n",
+       "      <td>-35.087579</td>\n",
+       "      <td>-0.623434</td>\n",
+       "      <td>17.676782</td>\n",
+       "      <td>2.618028e-05</td>\n",
+       "      <td>17.051468</td>\n",
+       "      <td>3.638025e-05</td>\n",
+       "      <td>0.000055</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
+       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
+       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
+       "\n",
+       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
+       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
+       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Significant according to Chi2\n",
+    "results[results['p_chi2'] < 0.05]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 132
+    },
+    "executionInfo": {
+     "elapsed": 166,
+     "status": "ok",
+     "timestamp": 1781100928540,
+     "user": {
+      "displayName": "Andreas van Cranenburgh",
+      "userId": "13143063654677287265"
+     },
+     "user_tz": -120
+    },
+    "id": "Mz4zAphE4dBY",
+    "outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>freq_recent</th>\n",
+       "      <th>freq_reference</th>\n",
+       "      <th>pct_diff</th>\n",
+       "      <th>log_ratio</th>\n",
+       "      <th>chi2</th>\n",
+       "      <th>p_chi2</th>\n",
+       "      <th>g2</th>\n",
+       "      <th>p_g2</th>\n",
+       "      <th>p_g2_adjusted</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>eend</td>\n",
+       "      <td>0.424929</td>\n",
+       "      <td>0.241379</td>\n",
+       "      <td>76.042088</td>\n",
+       "      <td>0.815920</td>\n",
+       "      <td>25.238117</td>\n",
+       "      <td>5.067080e-07</td>\n",
+       "      <td>24.764140</td>\n",
+       "      <td>6.479173e-07</td>\n",
+       "      <td>0.000002</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>fiets</td>\n",
+       "      <td>0.288952</td>\n",
+       "      <td>0.445141</td>\n",
+       "      <td>-35.087579</td>\n",
+       "      <td>-0.623434</td>\n",
+       "      <td>17.676782</td>\n",
+       "      <td>2.618028e-05</td>\n",
+       "      <td>17.051468</td>\n",
+       "      <td>3.638025e-05</td>\n",
+       "      <td>0.000055</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
+       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
+       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
+       "\n",
+       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
+       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
+       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Significant according to G2 (LLR)\n",
+    "results[results['p_g2_adjusted'] < 0.05]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "JNCCUpdC4jK5"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/python/word_freq_comparison.ipynb.ori
+++ b/python/word_freq_comparison.ipynb.ori
--- a/r/test-count.R
+++ b/r/test-count.R
@@ -0,0 +1,28 @@
+# Merge two "count<TAB>word" tables into one table holding, for every word,
+# its count in the older file (od) and in the newer file (nw); save that
+# table and plot the old counts against the new ones on a log-log scale.
+
+# Read one count file into a data frame with columns f (count) and word.
+read_counts <- function(path) {
+  # quote="" and comment.char="" keep quote characters and '#' inside a word
+  # literal; by default read.table treats '#' as the start of a comment and
+  # would silently cut such a word short.
+  read.table(path, sep="\t", quote="", comment.char="", encoding="utf-8",
+             col.names=c("f", "word"), stringsAsFactors=FALSE)
+}
+
+nw <- read_counts('data/2026/algemeen-count-per-2026.23-1')
+od <- read_counts('data/2026/algemeen-count-per-2026.22-4')
+
+# Sorted union of the words of both files.
+words <- unique(c(od$word, nw$word))
+words <- words[order(words)]
+
+# Count of every word in `tab`, in the order of `words`; 0 when the word does
+# not occur in `tab`. match() does the lookup in one vectorised step instead
+# of scanning the whole table once per word.
+count_for <- function(tab) {
+  f <- tab$f[match(words, tab$word)]
+  f[is.na(f)] <- 0
+  f
+}
+
+d <- data.frame(word=words, od=count_for(od), nw=count_for(nw))
+
+write.table(d, file="tmp.csv", quote=FALSE, sep="\t", row.names=FALSE, fileEncoding="utf-8")
+
+####
+
+oud <- d$od
+nieuw <- d$nw
+
+# Replace zero counts by 0.5 so that the logarithm is defined.
+oud[oud == 0] <- 0.5
+nieuw[nieuw == 0] <- 0.5
+
+plot(log(oud), log(nieuw))
+# Reference line from the (min, min) corner to the (max, max) corner.
+lines(log(range(oud)), log(range(nieuw)))
+# Interactive: click points in the plot to label them with their word.
+identify(log(oud), log(nieuw), labels=words)
--- a/r/test-rang.R
+++ b/r/test-rang.R
@@ -0,0 +1,32 @@
+# Compare the rank of every word in an older ranking file (od) with its rank
+# in a newer one (nw) and plot the words whose normalised rank improved most.
+
+# Read one ranking file into a data frame with columns f (rank) and word.
+read_ranks <- function(path) {
+  # quote="" and comment.char="" keep quote characters and '#' inside a word
+  # literal; by default read.table treats '#' as the start of a comment and
+  # would silently cut such a word short.
+  read.table(path, sep="\t", quote="", comment.char="", encoding="utf-8",
+             col.names=c("f", "word"), stringsAsFactors=FALSE)
+}
+
+nw <- read_ranks('data/2026/algemeen-rang-per-2026.23-1')
+od <- read_ranks('data/2026/algemeen-rang-per-2026.22-4')
+
+# Sorted union of the words of both files.
+words <- unique(c(od$word, nw$word))
+words <- words[order(words)]
+
+# Rank of every word in `tab`, in the order of `words`. A word that is missing
+# from `tab` gets a rank two past the worst rank in that table. match() does
+# the lookup in one vectorised step instead of one table scan per word.
+rank_for <- function(tab) {
+  r <- tab$f[match(words, tab$word)]
+  r[is.na(r)] <- max(tab$f) + 2
+  r
+}
+
+d <- data.frame(word=words, od=rank_for(od), nw=rank_for(nw))
+
+# Plot the values against their index and write each label to the right of
+# its point; the x axis is widened by a fifth to leave room for the labels.
+myplot <- function(values, labels, titel="", sub ="") {
+    y <- 1:length(values)
+    xx <- range(values)
+    plot(values, y, xlim=c(xx[1], xx[2] + (xx[2]-xx[1]) / 5), xlab="score", ylab="index", main=titel, sub=sub)
+    text(values, y, labels, pos=4)
+}
+
+# Normalise the ranks by the worst rank in each table.
+nwn <- (d$nw - 1) / (max(nw$f) + 1)
+odn <- (d$od - 1) / (max(od$f) + 1)
+
+# A positive score means the word ranks better in the newer file.
+v <- odn - nwn
+o <- order(-v)
+# head() keeps at most 40 entries and, unlike [1:40], does not pad the
+# selection with NA when there are fewer than 40 words.
+top <- head(o, 40)
+myplot(v[top], words[top], titel="score op basis van genormaliseerde rang", sub="week 23 t.o.v. week 19 t/m 22")
+