This commit is contained in:
Peter Kleiweg
2026-06-18 12:52:40 +02:00
parent a8bea0ab44
commit 01e6d48665
13 changed files with 15363 additions and 8 deletions

1
python/TODO.txt Normal file
View File

@@ -0,0 +1 @@
python: notebook en pakketten installeren

4762
python/data.txt Normal file

File diff suppressed because it is too large Load Diff

1563
python/namen.ipynb Normal file

File diff suppressed because it is too large Load Diff

82
python/namen.py Executable file
View File

@@ -0,0 +1,82 @@
#!/net/corpora/nlnieuws/notebook/bin/python3
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
# Why did setting max_rows alone not work?  As soon as a frame has more rows
# than display.max_rows, pandas switches to a truncated repr that shows only
# display.min_rows rows (default 10).  Both options have to be raised to
# actually get 40 rows printed for a long frame.
pd.set_option('display.max_rows', 40)
pd.set_option('display.min_rows', 40)
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """Test whether a word's frequency differs between two corpora.

    Builds the 2x2 contingency table (word / non-word by recent / reference),
    runs a Pearson chi-squared test and a log-likelihood ratio (G²) test on
    it, and reports two effect sizes.

    word             : the word being tested
    counts_recent    : raw count in week 5
    counts_reference : raw count in weeks 1-4
    total_recent     : total tokens in week 5
    total_reference  : total tokens in weeks 1-4

    Returns a dict with the keys "word", "freq_recent", "freq_reference",
    "pct_diff", "log_ratio", "chi2", "p_chi2", "g2" and "p_g2".
    "pct_diff" is +inf when the word does not occur in the reference corpus.
    """
    a = counts_recent                # word in recent
    b = counts_reference             # word in reference
    c = total_recent - a             # non-word in recent
    d = total_reference - b          # non-word in reference
    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # correction=False is required here as well: for a 2x2 table scipy
    # otherwise applies Yates' continuity correction, and the statistic is
    # then no longer the plain G² formula above (nor comparable to chi2).
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference

    if freq_reference == 0:
        # A word absent from the reference corpus has no finite percentage
        # increase; report inf (or nan for 0/0) instead of dividing by zero.
        pct_diff = np.inf if freq_recent > 0 else np.nan
    else:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
# Per-word counts for the two periods, keyed by word.
counts_recent = {}
counts_reference = {}

# data.txt is tab-separated: word, reference count, recent count.
with open("data.txt", "rt", encoding="utf-8") as fp:
    for line in fp:
        # A blank line (e.g. a trailing empty line) has no count columns and
        # would otherwise fail with an IndexError.
        if not line.strip():
            continue
        fields = line.split("\t")
        # Zero counts are raised to 0.5 so that no relative frequency is
        # exactly zero in the statistics computed below.
        counts_reference[fields[0]] = max(int(fields[1]), 0.5)
        counts_recent[fields[0]] = max(int(fields[2]), 0.5)

total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# Run the tests on the whole vocabulary.
results = [
    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                 total_recent, total_reference)
    for word in counts_recent]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")
for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj

results = pd.DataFrame(results)
print(results)
print(results.sort_values('g2'))
print(results.sort_values('pct_diff'))

30
python/score.txt Normal file
View File

@@ -0,0 +1,30 @@
Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
relatieve frequenties te rapporteren:
* %DIFF = (freq_B  - freq_A) / freq_A * 100
Percentage verschil in relatieve frequenties, makkelijk te interpreteren,
maar niet symmetrisch.
* Log Ratio: log2(freq_B / freq_A)
Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
een halvering.
Twee populaire methodes om de significantie van frequentieverschillen te testen
(ook wel keyword extraction):
* Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
Vergelijkt observed vs expected frequency.
* Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
bij sparse data, werkt niet goed met lage frequenties.
Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
elkaar gezet met een demonstratie van deze methodes:
→ Word freq comparison.ipynb
Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
is Burrows' Zeta populair; deze is bijv. beschikbaar in Stylo
https://github.com/computationalstylistics/stylo onder de oppose()
functie
https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
frequenties uit een achtergrondcorpus, de Fightin' Words methode van
Monroe et al.: https://github.com/jmhessel/FightingWords

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,500 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"executionInfo": {
"elapsed": 1341,
"status": "ok",
"timestamp": 1781100698726,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "YMifluhW2rZp"
},
"outputs": [],
"source": [
"import numpy as np\n",
"from scipy.stats import chi2_contingency\n",
"from statsmodels.stats.multitest import multipletests\n",
"import pandas as pd\n",
"\n",
"def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
" \"\"\"\n",
" word : the word being tested\n",
" counts_recent : raw count in week 5\n",
" counts_reference : raw count in weeks 1-4\n",
" total_recent : total tokens in week 5\n",
" total_reference : total tokens in weeks 1-4\n",
" \"\"\"\n",
" a = counts_recent # word in recent\n",
" b = counts_reference # word in reference\n",
" c = total_recent - a # non-word in recent\n",
" d = total_reference - b # non-word in reference\n",
"\n",
" contingency = np.array([[a, b],\n",
" [c, d]])\n",
"\n",
" # --- Chi-Squared ---\n",
" chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
"\n",
" # --- Log-Likelihood (G²) ---\n",
" # G² = 2 * sum(observed * log(observed / expected))\n",
" # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
" g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
"\n",
" # --- Effect sizes ---\n",
" freq_recent = a / total_recent\n",
" freq_reference = b / total_reference\n",
"\n",
" pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
"\n",
" # Avoid log(0) with a small epsilon\n",
" eps = 1e-9\n",
" log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
"\n",
" return {\n",
" \"word\": word,\n",
" \"freq_recent\": freq_recent,\n",
" \"freq_reference\": freq_reference,\n",
" \"pct_diff\": pct_diff,\n",
" \"log_ratio\": log_ratio,\n",
" \"chi2\": chi2_stat,\n",
" \"p_chi2\": p_chi2,\n",
" \"g2\": g2_stat,\n",
" \"p_g2\": p_g2,\n",
" }\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"executionInfo": {
"elapsed": 38,
"status": "ok",
"timestamp": 1781100880331,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "mHH718-222BM"
},
"outputs": [],
"source": [
"# Example data\n",
"counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
"counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
"total_recent = sum(counts_recent.values())\n",
"total_reference = sum(counts_reference.values())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"executionInfo": {
"elapsed": 7,
"status": "ok",
"timestamp": 1781100881153,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "urBml1212wxb"
},
"outputs": [],
"source": [
"# Run tests on whole vocabulary, including correction for multiple tests\n",
"# (false discovery rate).\n",
"\n",
"results = [\n",
" corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n",
" total_recent, total_reference)\n",
" for word in counts_recent]\n",
"\n",
"# FDR correction across all words\n",
"p_values = [r[\"p_g2\"] for r in results]\n",
"_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
"\n",
"for r, p_adj in zip(results, p_adjusted):\n",
" r[\"p_g2_adjusted\"] = p_adj"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 163
},
"executionInfo": {
"elapsed": 12,
"status": "ok",
"timestamp": 1781100882491,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "-y3MUOgI3PFn",
"outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>tafel</td>\n",
" <td>0.286119</td>\n",
" <td>0.313480</td>\n",
" <td>-8.728045</td>\n",
" <td>-0.131756</td>\n",
" <td>0.598371</td>\n",
" <td>4.392004e-01</td>\n",
" <td>0.474701</td>\n",
" <td>4.908322e-01</td>\n",
" <td>0.490832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = pd.DataFrame(results)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132
},
"executionInfo": {
"elapsed": 65,
"status": "ok",
"timestamp": 1781100883685,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "nTpOtOka3ViF",
"outputId": "2430f959-eeb9-4670-da76-613406cbf473"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Significant according to Chi2\n",
"results[results['p_chi2'] < 0.05]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132
},
"executionInfo": {
"elapsed": 166,
"status": "ok",
"timestamp": 1781100928540,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "Mz4zAphE4dBY",
"outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Significant according to G2 (LLR)\n",
"results[results['p_g2_adjusted'] < 0.05]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JNCCUpdC4jK5"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because one or more lines are too long