This commit is contained in:
Peter Kleiweg
2026-06-18 12:52:40 +02:00
parent a8bea0ab44
commit 01e6d48665
13 changed files with 15363 additions and 8 deletions

View File

@@ -29,8 +29,9 @@ type ItemT struct {
}
var (
p = e.PanicErr
agent = "AhrefsBot/7.0"
p = e.PanicErr
//agent = "AhrefsBot/7.0"
agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36"
)
func exists(filename string) bool {

View File

@@ -163,14 +163,57 @@ do
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-nieuwe-adjww-extra-$ds-$i
# ranglijsten
# kale tellingen
say $part-rang-$ds-$i
say $part-count-word-$ds-$i
alto \
'fp://node[((@pt="n" or @neclass) and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass])]' \
'tt:%w\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
> $part-rang-$ds-$i
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| sed -e 's/\t.*//' | uniq -c \
| grep -v '^ *1 ' \
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-count-word-$ds-$i
say $part-count-loc-$ds-$i
alto \
'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| sed -e 's/\t.*//' | uniq -c \
| grep -v '^ *1 ' \
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-count-loc-$ds-$i
say $part-count-per-$ds-$i
alto \
'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| sed -e 's/\t.*//' | uniq -c \
| grep -v '^ *1 ' \
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-count-per-$ds-$i
say $part-count-org-$ds-$i
alto \
'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| sed -e 's/\t.*//' | uniq -c \
| grep -v '^ *1 ' \
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-count-org-$ds-$i
say $part-count-misc-$ds-$i
alto \
'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
'tt:%l\t%I' $files \
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
| sed -e 's/\t.*//' | uniq -c \
| grep -v '^ *1 ' \
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
> $part-count-misc-$ds-$i
done
done

118
oud/rang/collect-rang.sh Normal file
View File

@@ -0,0 +1,118 @@
#!/bin/bash

# Stop at the first failing command and drop any inherited CDPATH.
set -e
unset CDPATH

# Put the project and site tool directories in front of the existing PATH,
# and fix the time zone used by the commands started from this script.
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

# An optional leading -v turns on progress messages; it is removed from the
# argument list so that the next argument becomes $1.
verbose=0
case "$1" in
    -v)
        shift
        verbose=1
        ;;
esac
# say ARGS...
# Print ARGS, joined into a single line, on standard output -- but only when
# $verbose is "1". In quiet mode it prints nothing and still succeeds, so it
# is safe to call under `set -e`.
say () {
    if [ "$verbose" != "1" ]
    then
        return 0
    fi
    echo "$*"
}
# Decide which week to process. The week is written as <year>.<week>, e.g.
# 2026.23. Without an argument, take the ISO year and ISO week number of the
# date seven days ago; anything that does not look like <year>.<week> is
# rejected.
case "$1" in
    "")
        ds=$(date -d -7days +%G.%V)
        ;;
    2[0-9][0-9][0-9].[0-5][0-9])
        ds=$1
        ;;
    *)
        echo INVALID
        exit 1
        ;;
esac

# The part of $ds before the first dot is the year; make sure the per-year
# directories exist and continue inside the per-year data directory.
year=${ds%%.*}
datadir=/net/corpora/nlnieuws/data
mkdir -p $datadir/$year
mkdir -p $datadir/json/$year
cd $datadir/$year
# Table of subsets to process. The key is the name of the subset and is used
# as the prefix of the output file names; the value is an extended regular
# expression (it is passed to `grep -E`) that selects the input file paths
# belonging to that subset.
# Only "algemeen" is enabled; the commented-out entries are other groupings
# and single-source subsets that are currently switched off.
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
#parts[amsterdam]='AT5|BuurtAdam|Parool'
#parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
#parts[literatuur]='LitNL|Tzum'
#parts[vlaanderen]='HLN|VRT'
#parts[AT5]='AT5'
#parts[BuurtAdam]='BuurtAdam'
#parts[BuurtGrn]='BuurtGrn'
#parts[GG]='GG'
#parts[HLN]='HLN'
#parts[LitNL]='LitNL'
#parts[NOS]='NOS'
#parts[NU]='NU'
#parts[NieuwsNL]='NieuwsNL'
#parts[Oog]='Oog'
#parts[Parool]='Parool'
#parts[RO]='RO'
#parts[RTVNoord]='RTVNoord'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'
# For every enabled subset and for i = 1 and i = 4 (i is handed to
# week2files together with the week), write five files named
#     <part>-rang-<kind>-<ds>-<i>
# with <kind> one of word, loc, per, org, misc, in that order.
# Each file is produced by the same pipeline: alto prints one
# "<lemma>\t<id>" line per matching node, the trailing ".<digits>" is cut
# from the id, duplicate lines are removed and the result is fed to rang.
# NOTE(review): alto, rang and week2files are external commands, presumably
# found through PATH; their exact output format is not visible here.
for part in ${!parts[@]}
do
    regex=${parts[$part]}
    for i in 1 4
    do
        # grep exits non-zero when nothing matches; "|| true" keeps set -e
        # from aborting, and an empty selection is simply skipped.
        files=$(find ../.. $(week2files $ds $i) | grep -E "$regex") || true
        [ -n "$files" ] || continue

        # Each spec is <kind>:<neclass>; an empty neclass means "ordinary
        # words", i.e. everything that is not punctuation or a named entity.
        for spec in word: loc:LOC per:PER org:ORG misc:MISC
        do
            kind=${spec%%:*}
            ne=${spec#*:}
            if [ -z "$ne" ]
            then
                query='fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]'
            else
                query="fp://node[(@neclass=\"$ne\" and not(@rel=\"mwp\")) or (@cat=\"mwu\" and .//node[@neclass=\"$ne\" ])]"
            fi

            out=$part-rang-$kind-$ds-$i
            say $out
            # $files is deliberately unquoted: it holds one path per line.
            alto \
                "$query" \
                'tt:%l\t%I' $files \
                | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
                > $out
        done
    done
done

1
python/TODO.txt Normal file
View File

@@ -0,0 +1 @@
python: notebook en pakketten installeren

4762
python/data.txt Normal file

File diff suppressed because it is too large Load Diff

1563
python/namen.ipynb Normal file

File diff suppressed because it is too large Load Diff

82
python/namen.py Executable file
View File

@@ -0,0 +1,82 @@
#!/net/corpora/nlnieuws/notebook/bin/python3
import numpy as np
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
import pandas as pd
# Show up to 40 rows when a DataFrame is printed.
# Setting display.max_rows on its own appears to "not work": as soon as a
# frame has more rows than max_rows, pandas switches to a truncated view that
# shows only display.min_rows rows (10 by default). Raising min_rows as well
# makes the truncated view 40 rows long.
pd.set_option('display.max_rows', 40)
pd.set_option('display.min_rows', 40)
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    Test whether one word is used more or less often in the recent period
    than in the reference period.

    word             : the word being tested
    counts_recent    : raw count in week 5
    counts_reference : raw count in weeks 1-4
    total_recent     : total tokens in week 5
    total_reference  : total tokens in weeks 1-4

    Returns a dict with the keys "word", "freq_recent", "freq_reference"
    (relative frequencies), "pct_diff" and "log_ratio" (effect sizes of
    recent relative to reference), "chi2" and "p_chi2" (Pearson chi-squared
    test) and "g2" and "p_g2" (log-likelihood ratio test).

    "pct_diff" is +inf when the word does not occur in the reference period.
    """
    a = counts_recent               # word in recent
    b = counts_reference            # word in reference
    c = total_recent - a            # non-word in recent
    d = total_reference - b         # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this.
    # correction=False is required: for a 2x2 table scipy otherwise applies
    # Yates' continuity correction, which moves every observed count 0.5
    # towards its expected value and so no longer matches the formula above
    # (nor the uncorrected chi-squared it is reported next to).
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference

    if freq_reference == 0:
        # No finite percentage change exists relative to a zero baseline;
        # report +inf instead of failing with a division by zero.
        pct_diff = float("inf") if freq_recent > 0 else float("nan")
    else:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
# Load the counts from data.txt. Every line is tab-separated: field 0 is the
# word, field 1 its count in the reference period and field 2 its count in
# the recent period. Counts lower than 0.5 are raised to 0.5.
counts_reference = {}
counts_recent = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
    for line in fp:
        fields = line.split("\t")
        key = fields[0]
        counts_reference[key] = max(int(fields[1]), 0.5)
        counts_recent[key] = max(int(fields[2]), 0.5)

# Corpus sizes are the sums of the (adjusted) per-word counts.
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# One statistics record per word of the recent period.
results = []
for word, recent_count in counts_recent.items():
    results.append(
        corpus_stats(word, recent_count, counts_reference.get(word, 0),
                     total_recent, total_reference))

# FDR correction across all words (Benjamini-Hochberg) on the G² p-values;
# the adjusted value is stored next to the raw one.
p_values = [record["p_g2"] for record in results]
p_adjusted = multipletests(p_values, method="fdr_bh")[1]
for position, p_adj in enumerate(p_adjusted):
    results[position]["p_g2_adjusted"] = p_adj

# Print the table three times: in input order, ordered by G² and ordered by
# percentage difference.
results = pd.DataFrame(results)
print(results)
for column in ('g2', 'pct_diff'):
    print(results.sort_values(column))

30
python/score.txt Normal file
View File

@@ -0,0 +1,30 @@
Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
relatieve frequenties te rapporteren:
* %DIFF = (freq_B  - freq_A) / freq_A * 100
Percentage verschil in relatieve frequenties, makkelijk te interpreteren,
maar niet symmetrisch.
* Log Ratio: log2(freq_B / freq_A)
Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
een halvering
Twee populaire methodes om de significantie van frequentieverschillen te testen
(ook wel keyword extraction):
* Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
Vergelijkt observed vs expected frequency.
* Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
bij sparse data, werkt niet goed met lage frequenties.
Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
elkaar gezet met een demonstratie van deze methodes:
→ Word freq comparison.ipynb
Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
is Burrows' Zeta populair, deze is bijv. beschikbaar in Stylo
https://github.com/computationalstylistics/stylo onder de oppose()
functie
https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
frequenties uit een achtergrondcorpus, de Fightin' Words methode van
Monroe et al: https://github.com/jmhessel/FightingWords

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,500 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"executionInfo": {
"elapsed": 1341,
"status": "ok",
"timestamp": 1781100698726,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "YMifluhW2rZp"
},
"outputs": [],
"source": [
"import numpy as np\n",
"from scipy.stats import chi2_contingency\n",
"from statsmodels.stats.multitest import multipletests\n",
"import pandas as pd\n",
"\n",
"def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
" \"\"\"\n",
" word : the word being tested\n",
" counts_recent : raw count in week 5\n",
" counts_reference : raw count in weeks 1-4\n",
" total_recent : total tokens in week 5\n",
" total_reference : total tokens in weeks 1-4\n",
" \"\"\"\n",
" a = counts_recent # word in recent\n",
" b = counts_reference # word in reference\n",
" c = total_recent - a # non-word in recent\n",
" d = total_reference - b # non-word in reference\n",
"\n",
" contingency = np.array([[a, b],\n",
" [c, d]])\n",
"\n",
" # --- Chi-Squared ---\n",
" chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
"\n",
" # --- Log-Likelihood (G²) ---\n",
" # G² = 2 * sum(observed * log(observed / expected))\n",
" # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
" g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
"\n",
" # --- Effect sizes ---\n",
" freq_recent = a / total_recent\n",
" freq_reference = b / total_reference\n",
"\n",
" pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
"\n",
" # Avoid log(0) with a small epsilon\n",
" eps = 1e-9\n",
" log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
"\n",
" return {\n",
" \"word\": word,\n",
" \"freq_recent\": freq_recent,\n",
" \"freq_reference\": freq_reference,\n",
" \"pct_diff\": pct_diff,\n",
" \"log_ratio\": log_ratio,\n",
" \"chi2\": chi2_stat,\n",
" \"p_chi2\": p_chi2,\n",
" \"g2\": g2_stat,\n",
" \"p_g2\": p_g2,\n",
" }\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"executionInfo": {
"elapsed": 38,
"status": "ok",
"timestamp": 1781100880331,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "mHH718-222BM"
},
"outputs": [],
"source": [
"# Example data\n",
"counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
"counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
"total_recent = sum(counts_recent.values())\n",
"total_reference = sum(counts_reference.values())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"executionInfo": {
"elapsed": 7,
"status": "ok",
"timestamp": 1781100881153,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "urBml1212wxb"
},
"outputs": [],
"source": [
"# Run tests on whole vocabulary, including correction for multiple tests\n",
"# (false discovery rate).\n",
"\n",
"results = [\n",
" corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n",
" total_recent, total_reference)\n",
" for word in counts_recent]\n",
"\n",
"# FDR correction across all words\n",
"p_values = [r[\"p_g2\"] for r in results]\n",
"_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
"\n",
"for r, p_adj in zip(results, p_adjusted):\n",
" r[\"p_g2_adjusted\"] = p_adj"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 163
},
"executionInfo": {
"elapsed": 12,
"status": "ok",
"timestamp": 1781100882491,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "-y3MUOgI3PFn",
"outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>tafel</td>\n",
" <td>0.286119</td>\n",
" <td>0.313480</td>\n",
" <td>-8.728045</td>\n",
" <td>-0.131756</td>\n",
" <td>0.598371</td>\n",
" <td>4.392004e-01</td>\n",
" <td>0.474701</td>\n",
" <td>4.908322e-01</td>\n",
" <td>0.490832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results = pd.DataFrame(results)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132
},
"executionInfo": {
"elapsed": 65,
"status": "ok",
"timestamp": 1781100883685,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "nTpOtOka3ViF",
"outputId": "2430f959-eeb9-4670-da76-613406cbf473"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Significant according to Chi2\n",
"results[results['p_chi2'] < 0.05]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132
},
"executionInfo": {
"elapsed": 166,
"status": "ok",
"timestamp": 1781100928540,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "Mz4zAphE4dBY",
"outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Significant according to G2 (LLR)\n",
"results[results['p_g2_adjusted'] < 0.05]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JNCCUpdC4jK5"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because one or more lines are too long

28
r/test-count.R Normal file
View File

@@ -0,0 +1,28 @@
# Compare the per-word counts in two count files: "nw" (2026.23-1) and
# "od" (2026.22-4). The script builds one table with a row per word, saves
# it as tmp.csv and shows an interactive log-log scatter plot.

# Read a two-column, tab-separated file: column "f" and column "word".
read_counts <- function(path) {
    read.table(path, sep="\t", quote="", encoding="utf-8", col.names=c("f", "word"))
}

nw <- read_counts('data/2026/algemeen-count-per-2026.23-1')
od <- read_counts('data/2026/algemeen-count-per-2026.22-4')

# All words that occur in either file, in sorted order.
words <- unique(c(od$word, nw$word))
o <- order(words)
words <- words[o]
n <- length(words)

# One row per word; a word that is missing from a file keeps the count 0.
d <- data.frame(word=words, od=numeric(n), nw=numeric(n))

for (w in nw$word) {
    row <- d$word == w
    d$nw[row] <- nw$f[nw$word == w]
}

for (w in od$word) {
    row <- d$word == w
    d$od[row] <- od$f[od$word == w]
}

write.table(d, file="tmp.csv", quote=FALSE, sep="\t", row.names=FALSE, fileEncoding="utf-8")

####

# Zero counts become 0.5 so that their logarithm is defined.
oud <- replace(d$od, d$od == 0, 0.5)
nieuw <- replace(d$nw, d$nw == 0, 0.5)

# The expressions passed to plot() are also used as the axis labels.
plot(log(oud), log(nieuw))
# Line from the smallest to the largest value on both axes.
lines(log(range(oud)), log(range(nieuw)))
# Click on points in the plot to label them with their word.
identify(log(oud), log(nieuw), labels=words)

32
r/test-rang.R Normal file
View File

@@ -0,0 +1,32 @@
# Load two "rang" files -- "nw" (2026.23-1) and "od" (2026.22-4) -- and put
# them side by side in one table with a row per word.
# NOTE(review): column "f" presumably holds the rank of the word; confirm
# against the output of the rang tool.

# Read a two-column, tab-separated file: column "f" and column "word".
read_ranks <- function(path) {
    read.table(path, sep="\t", quote="", encoding="utf-8", col.names=c("f", "word"))
}

nw <- read_ranks('data/2026/algemeen-rang-per-2026.23-1')
od <- read_ranks('data/2026/algemeen-rang-per-2026.22-4')

# All words that occur in either file, in sorted order.
words <- unique(c(od$word, nw$word))
o <- order(words)
words <- words[o]
n <- length(words)

# Start with NA everywhere so that missing words can be recognised below.
d <- data.frame(word=words, od=rep(NA, n), nw=rep(NA, n))

for (w in nw$word) {
    row <- d$word == w
    d$nw[row] <- nw$f[nw$word == w]
}

for (w in od$word) {
    row <- d$word == w
    d$od[row] <- od$f[od$word == w]
}

# A word that is missing from a file gets that file's largest value plus 2.
d$nw <- replace(d$nw, is.na(d$nw), max(nw$f) + 2)
d$od <- replace(d$od, is.na(d$od), max(od$f) + 2)
# Plot each value against its position (1, 2, 3, ...) and write the matching
# label to the right of every point.
#   values : numeric scores, drawn on the x axis
#   labels : one label per value
#   titel  : main title of the plot
#   sub    : subtitle of the plot
# The x axis is extended on the right by one fifth of the range of the
# values, which leaves room for the labels.
myplot <- function(values, labels, titel="", sub ="") {
    index <- 1:length(values)
    lowest <- min(values)
    highest <- max(values)
    right_edge <- highest + (highest - lowest) / 5
    plot(values, index, xlim=c(lowest, right_edge), xlab="score", ylab="index", main=titel, sub=sub)
    text(values, index, labels, pos=4)
}
# Rescale both columns: subtract 1 and divide by the largest value of the
# file plus 1.
nwn <- (d$nw - 1) / (max(nw$f) + 1)
odn <- (d$od - 1) / (max(od$f) + 1)

# Score = old minus new; the largest scores come first.
v <- odn - nwn
o <- order(-v)

# Plot the 40 highest-scoring words.
top <- o[1:40]
myplot(v[top], words[top], titel="score op basis van genormaliseerde rang", sub="week 23 t.o.v. week 19 t/m 22")