stijgers
This commit is contained in:
@@ -29,8 +29,9 @@ type ItemT struct {
|
||||
}
|
||||
|
||||
var (
|
||||
p = e.PanicErr
|
||||
agent = "AhrefsBot/7.0"
|
||||
p = e.PanicErr
|
||||
//agent = "AhrefsBot/7.0"
|
||||
agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/149.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
func exists(filename string) bool {
|
||||
|
||||
55
collect.sh
55
collect.sh
@@ -163,14 +163,57 @@ do
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-nieuwe-adjww-extra-$ds-$i
|
||||
|
||||
# ranglijsten
|
||||
# kale tellingen
|
||||
|
||||
say $part-rang-$ds-$i
|
||||
say $part-count-word-$ds-$i
|
||||
alto \
|
||||
'fp://node[((@pt="n" or @neclass) and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass])]' \
|
||||
'tt:%w\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
|
||||
> $part-rang-$ds-$i
|
||||
'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]' \
|
||||
'tt:%l\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||
| sed -e 's/\t.*//' | uniq -c \
|
||||
| grep -v '^ *1 ' \
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-count-word-$ds-$i
|
||||
|
||||
say $part-count-loc-$ds-$i
|
||||
alto \
|
||||
'fp://node[(@neclass="LOC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="LOC" ])]' \
|
||||
'tt:%l\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||
| sed -e 's/\t.*//' | uniq -c \
|
||||
| grep -v '^ *1 ' \
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-count-loc-$ds-$i
|
||||
|
||||
say $part-count-per-$ds-$i
|
||||
alto \
|
||||
'fp://node[(@neclass="PER" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="PER" ])]' \
|
||||
'tt:%l\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||
| sed -e 's/\t.*//' | uniq -c \
|
||||
| grep -v '^ *1 ' \
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-count-per-$ds-$i
|
||||
|
||||
say $part-count-org-$ds-$i
|
||||
alto \
|
||||
'fp://node[(@neclass="ORG" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="ORG" ])]' \
|
||||
'tt:%l\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||
| sed -e 's/\t.*//' | uniq -c \
|
||||
| grep -v '^ *1 ' \
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-count-org-$ds-$i
|
||||
|
||||
say $part-count-misc-$ds-$i
|
||||
alto \
|
||||
'fp://node[(@neclass="MISC" and not(@rel="mwp")) or (@cat="mwu" and .//node[@neclass="MISC" ])]' \
|
||||
'tt:%l\t%I' $files \
|
||||
| sed -e 's/\.[0-9][0-9]*$//' | sort | uniq \
|
||||
| sed -e 's/\t.*//' | uniq -c \
|
||||
| grep -v '^ *1 ' \
|
||||
| sed -e 's/\([0-9]\) */\1\t/' | sort -f -k 2 | sort -n -r -k 1,1 -s \
|
||||
> $part-count-misc-$ds-$i
|
||||
|
||||
done
|
||||
done
|
||||
|
||||
118
oud/rang/collect-rang.sh
Normal file
118
oud/rang/collect-rang.sh
Normal file
@@ -0,0 +1,118 @@
|
||||
#!/bin/bash
#
# Build ranked lemma lists ("ranglijsten") for one ISO week.
#
# Usage: collect-rang.sh [-v] [YYYY.WW]
#
#   -v        verbose: print the name of every output file as it is produced
#   YYYY.WW   ISO year and week to process; defaults to the week of 7 days ago
#
# For every configured source group and for both window sizes (1 and 4, as
# passed to week2files), five lists are written to
# /net/corpora/nlnieuws/data/<year>/<group>-rang-<kind>-<YYYY.WW>-<window>
# with <kind> one of word, loc, per, org, misc.
#
# Relies on the external commands week2files, alto and rang, which are
# expected on the PATH set below; their exact behaviour is not visible here.

set -e

unset CDPATH
PATH=/net/corpora/nlnieuws/bin:/net/aps/bin:$PATH
export TZ=Europe/Amsterdam

verbose=0
if [ "$1" = "-v" ]
then
    shift
    verbose=1
fi

# Print the arguments, but only in verbose mode.
say () {
    if [ "$verbose" = "1" ]
    then
        echo "$*"
    fi
}

if [ "$1" = "" ]
then
    ds=$(date -d -7days +%G.%V)
else
    case "$1" in
        2[0-9][0-9][0-9].[0-5][0-9])
            ds=$1
            ;;
        *)
            # Diagnostics belong on stderr so they never end up in captured output.
            echo INVALID >&2
            exit 1
            ;;
    esac
fi

year=${ds%%.*}

mkdir -p "/net/corpora/nlnieuws/data/$year"
mkdir -p "/net/corpora/nlnieuws/data/json/$year"
cd "/net/corpora/nlnieuws/data/$year"

# Source groups: name -> extended regex matched against the file paths.
# Only "algemeen" is active; the others are kept for reference.
declare -A parts
#parts[alles]='.'
parts[algemeen]='NOS|NU|NieuwsNL|RO|Sargasso|Volkskrant'
#parts[amsterdam]='AT5|BuurtAdam|Parool'
#parts[groningen]='BuurtGrn|GG|Oog|RTVNoord|Sikkom'
#parts[literatuur]='LitNL|Tzum'
#parts[vlaanderen]='HLN|VRT'
#parts[AT5]='AT5'
#parts[BuurtAdam]='BuurtAdam'
#parts[BuurtGrn]='BuurtGrn'
#parts[GG]='GG'
#parts[HLN]='HLN'
#parts[LitNL]='LitNL'
#parts[NOS]='NOS'
#parts[NU]='NU'
#parts[NieuwsNL]='NieuwsNL'
#parts[Oog]='Oog'
#parts[Parool]='Parool'
#parts[RO]='RO'
#parts[RTVNoord]='RTVNoord'
#parts[Sargasso]='Sargasso'
#parts[Sikkom]='Sikkom'
#parts[Tzum]='Tzum'
#parts[Volkskrant]='Volkskrant'
#parts[VRT]='VRT'

# rank_list KIND XPATH
#
# Run alto with XPATH over $files and write the ranked list to
# $part-rang-KIND-$ds-$i. Reads the globals part, ds, i and files that are
# set by the loops below.
rank_list () {
    local kind=$1
    local xpath=$2
    local out="$part-rang-$kind-$ds-$i"

    say "$out"
    # $files is intentionally unquoted: it holds a whitespace-separated list
    # of paths that must be passed to alto as separate arguments.
    # The sed step strips a trailing ".<digits>" from every output line before
    # de-duplication (presumably reducing the %I identifier to the document
    # level -- confirm against the alto documentation).
    alto "$xpath" 'tt:%l\t%I' $files \
        | sed -e 's/\.[0-9][0-9]*$//' | sort | uniq | rang \
        > "$out"
}

for part in "${!parts[@]}"
do
    regex=${parts[$part]}

    for i in 1 4
    do
        # The week2files output is left unquoted so that it expands into
        # separate find arguments. grep exits non-zero when nothing matches;
        # "|| true" keeps set -e from aborting so the empty case is handled
        # below.
        files=$(find ../.. $(week2files "$ds" "$i") | grep -E "$regex") || true
        if [ -z "$files" ]
        then
            continue
        fi

        # Ordinary words: every token except punctuation, multi-word parts
        # and named entities, plus multi-word units without a named entity.
        rank_list word \
            'fp://node[(@pt and not(@pt="let" or @rel="mwp" or @neclass)) or (@cat="mwu" and not(.//node[@neclass]))]'

        # Named entities, one list per entity class.
        for cls in LOC PER ORG MISC
        do
            rank_list "${cls,,}" \
                "fp://node[(@neclass=\"$cls\" and not(@rel=\"mwp\")) or (@cat=\"mwu\" and .//node[@neclass=\"$cls\" ])]"
        done

    done
done
|
||||
1
python/TODO.txt
Normal file
1
python/TODO.txt
Normal file
@@ -0,0 +1 @@
|
||||
python: notebook en pakketten installeren
|
||||
4762
python/data.txt
Normal file
4762
python/data.txt
Normal file
File diff suppressed because it is too large
Load Diff
1563
python/namen.ipynb
Normal file
1563
python/namen.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
82
python/namen.py
Executable file
82
python/namen.py
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/net/corpora/nlnieuws/notebook/bin/python3
|
||||
|
||||
import numpy as np
|
||||
from scipy.stats import chi2_contingency
|
||||
from statsmodels.stats.multitest import multipletests
|
||||
import pandas as pd
|
||||
|
||||
# Print up to 40 rows of a DataFrame. Raising display.max_rows on its own has
# no visible effect: a frame longer than max_rows is printed in truncated
# form, and the truncated form shows only display.min_rows rows (10 by
# default). Both options therefore have to be set.
pd.set_option('display.max_rows', 40)
pd.set_option('display.min_rows', 40)
|
||||
|
||||
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """
    Compare the frequency of one word between a recent and a reference corpus.

    word             : the word being tested
    counts_recent    : raw count of the word in the recent corpus (e.g. week 5)
    counts_reference : raw count of the word in the reference corpus
                       (e.g. weeks 1-4)
    total_recent     : total tokens in the recent corpus
    total_reference  : total tokens in the reference corpus

    Returns a dict with the keys word, freq_recent, freq_reference, pct_diff,
    log_ratio, chi2, p_chi2, g2 and p_g2.

    Raises ValueError (from scipy) when the 2x2 table has an expected
    frequency of zero, e.g. when the word occurs in neither corpus.
    """
    a = counts_recent        # word in recent
    b = counts_reference     # word in reference
    c = total_recent - a     # non-word in recent
    d = total_reference - b  # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this.
    # correction=False is required here as well: for a 2x2 table scipy
    # otherwise applies Yates' continuity correction, which shifts every
    # observed value by 0.5 and no longer yields the formula above.
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference

    if freq_reference == 0:
        # The word is absent from the reference corpus, so the relative
        # increase is unbounded; report that instead of dividing by zero.
        pct_diff = float("inf") if freq_recent > 0 else float("nan")
    else:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
|
||||
|
||||
# Read the per-word counts from data.txt. Each line holds three tab-separated
# fields: the word, its count in the reference period and its count in the
# recent period.
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
    for line in fp:
        if not line.strip():
            # A blank (e.g. trailing) line carries no data; without this
            # check it would raise IndexError below.
            continue
        fields = line.split("\t")
        word = fields[0]
        # Zero counts are raised to 0.5 so that no cell of the contingency
        # table, and no relative frequency, is exactly zero.
        counts_reference[word] = max(int(fields[1]), 0.5)
        counts_recent[word] = max(int(fields[2]), 0.5)

total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# Run the tests on the whole vocabulary.
results = [
    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                 total_recent, total_reference)
    for word in counts_recent]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj

# Show the table three times: in input order, sorted by G², and sorted by
# percentage difference (both ascending).
results = pd.DataFrame(results)
print(results)
print(results.sort_values('g2'))
print(results.sort_values('pct_diff'))
|
||||
30
python/score.txt
Normal file
30
python/score.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
|
||||
relatieve frequenties te rapporteren:
|
||||
* %DIFF = (freq_B - freq_A) / freq_A * 100
|
||||
Percentage verschil in relatieve frequenties, makkelijk te interpreteren,
|
||||
maar niet symmetrisch.
|
||||
* Log Ratio: log2(freq_B / freq_A)
|
||||
Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
|
||||
een halvering
|
||||
Twee populaire methodes om de significantie van frequentieverschillen te testen
|
||||
(ook wel keyword extraction):
|
||||
* Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
|
||||
Vergelijkt observed vs expected frequency.
|
||||
* Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
|
||||
bij sparse data, werkt niet goed met lage frequenties.
|
||||
Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
|
||||
zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
|
||||
elkaar gezet met een demonstratie van deze methodes:
|
||||
→ Word freq comparison.ipynb
|
||||
|
||||
|
||||
Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
|
||||
implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
|
||||
is Burrows' Zeta populair, deze is bijv. beschikbaar in Stylo
|
||||
https://github.com/computationalstylistics/stylo onder de oppose()
|
||||
functie
|
||||
https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
|
||||
Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
|
||||
frequenties uit een achtergrondcorpus, de Fightin' Words methode van
|
||||
Monroe et al: https://github.com/jmhessel/FightingWords
|
||||
|
||||
8194
python/word_freq_comparison.html
Normal file
8194
python/word_freq_comparison.html
Normal file
File diff suppressed because one or more lines are too long
500
python/word_freq_comparison.ipynb
Normal file
500
python/word_freq_comparison.ipynb
Normal file
@@ -0,0 +1,500 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"executionInfo": {
|
||||
"elapsed": 1341,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100698726,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "YMifluhW2rZp"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from scipy.stats import chi2_contingency\n",
|
||||
"from statsmodels.stats.multitest import multipletests\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
|
||||
" \"\"\"\n",
|
||||
" word : the word being tested\n",
|
||||
" counts_recent : raw count in week 5\n",
|
||||
" counts_reference : raw count in weeks 1-4\n",
|
||||
" total_recent : total tokens in week 5\n",
|
||||
" total_reference : total tokens in weeks 1-4\n",
|
||||
" \"\"\"\n",
|
||||
" a = counts_recent # word in recent\n",
|
||||
" b = counts_reference # word in reference\n",
|
||||
" c = total_recent - a # non-word in recent\n",
|
||||
" d = total_reference - b # non-word in reference\n",
|
||||
"\n",
|
||||
" contingency = np.array([[a, b],\n",
|
||||
" [c, d]])\n",
|
||||
"\n",
|
||||
" # --- Chi-Squared ---\n",
|
||||
" chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
|
||||
"\n",
|
||||
" # --- Log-Likelihood (G²) ---\n",
|
||||
" # G² = 2 * sum(observed * log(observed / expected))\n",
|
||||
" # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
|
||||
" g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
|
||||
"\n",
|
||||
" # --- Effect sizes ---\n",
|
||||
" freq_recent = a / total_recent\n",
|
||||
" freq_reference = b / total_reference\n",
|
||||
"\n",
|
||||
" pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
|
||||
"\n",
|
||||
" # Avoid log(0) with a small epsilon\n",
|
||||
" eps = 1e-9\n",
|
||||
" log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" \"word\": word,\n",
|
||||
" \"freq_recent\": freq_recent,\n",
|
||||
" \"freq_reference\": freq_reference,\n",
|
||||
" \"pct_diff\": pct_diff,\n",
|
||||
" \"log_ratio\": log_ratio,\n",
|
||||
" \"chi2\": chi2_stat,\n",
|
||||
" \"p_chi2\": p_chi2,\n",
|
||||
" \"g2\": g2_stat,\n",
|
||||
" \"p_g2\": p_g2,\n",
|
||||
" }\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"executionInfo": {
|
||||
"elapsed": 38,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100880331,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "mHH718-222BM"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example data\n",
|
||||
"counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
|
||||
"counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
|
||||
"total_recent = sum(counts_recent.values())\n",
|
||||
"total_reference = sum(counts_reference.values())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"executionInfo": {
|
||||
"elapsed": 7,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100881153,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "urBml1212wxb"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Run tests on whole vocabulary, including correction for multiple tests\n",
|
||||
"# (false discovery rate).\n",
|
||||
"\n",
|
||||
"results = [\n",
|
||||
" corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n",
|
||||
" total_recent, total_reference)\n",
|
||||
" for word in counts_recent]\n",
|
||||
"\n",
|
||||
"# FDR correction across all words\n",
|
||||
"p_values = [r[\"p_g2\"] for r in results]\n",
|
||||
"_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
|
||||
"\n",
|
||||
"for r, p_adj in zip(results, p_adjusted):\n",
|
||||
" r[\"p_g2_adjusted\"] = p_adj"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 163
|
||||
},
|
||||
"executionInfo": {
|
||||
"elapsed": 12,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100882491,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "-y3MUOgI3PFn",
|
||||
"outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>word</th>\n",
|
||||
" <th>freq_recent</th>\n",
|
||||
" <th>freq_reference</th>\n",
|
||||
" <th>pct_diff</th>\n",
|
||||
" <th>log_ratio</th>\n",
|
||||
" <th>chi2</th>\n",
|
||||
" <th>p_chi2</th>\n",
|
||||
" <th>g2</th>\n",
|
||||
" <th>p_g2</th>\n",
|
||||
" <th>p_g2_adjusted</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>eend</td>\n",
|
||||
" <td>0.424929</td>\n",
|
||||
" <td>0.241379</td>\n",
|
||||
" <td>76.042088</td>\n",
|
||||
" <td>0.815920</td>\n",
|
||||
" <td>25.238117</td>\n",
|
||||
" <td>5.067080e-07</td>\n",
|
||||
" <td>24.764140</td>\n",
|
||||
" <td>6.479173e-07</td>\n",
|
||||
" <td>0.000002</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>tafel</td>\n",
|
||||
" <td>0.286119</td>\n",
|
||||
" <td>0.313480</td>\n",
|
||||
" <td>-8.728045</td>\n",
|
||||
" <td>-0.131756</td>\n",
|
||||
" <td>0.598371</td>\n",
|
||||
" <td>4.392004e-01</td>\n",
|
||||
" <td>0.474701</td>\n",
|
||||
" <td>4.908322e-01</td>\n",
|
||||
" <td>0.490832</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>fiets</td>\n",
|
||||
" <td>0.288952</td>\n",
|
||||
" <td>0.445141</td>\n",
|
||||
" <td>-35.087579</td>\n",
|
||||
" <td>-0.623434</td>\n",
|
||||
" <td>17.676782</td>\n",
|
||||
" <td>2.618028e-05</td>\n",
|
||||
" <td>17.051468</td>\n",
|
||||
" <td>3.638025e-05</td>\n",
|
||||
" <td>0.000055</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
||||
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
||||
"1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n",
|
||||
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
||||
"\n",
|
||||
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
||||
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
||||
"1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n",
|
||||
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"results = pd.DataFrame(results)\n",
|
||||
"results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 132
|
||||
},
|
||||
"executionInfo": {
|
||||
"elapsed": 65,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100883685,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "nTpOtOka3ViF",
|
||||
"outputId": "2430f959-eeb9-4670-da76-613406cbf473"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>word</th>\n",
|
||||
" <th>freq_recent</th>\n",
|
||||
" <th>freq_reference</th>\n",
|
||||
" <th>pct_diff</th>\n",
|
||||
" <th>log_ratio</th>\n",
|
||||
" <th>chi2</th>\n",
|
||||
" <th>p_chi2</th>\n",
|
||||
" <th>g2</th>\n",
|
||||
" <th>p_g2</th>\n",
|
||||
" <th>p_g2_adjusted</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>eend</td>\n",
|
||||
" <td>0.424929</td>\n",
|
||||
" <td>0.241379</td>\n",
|
||||
" <td>76.042088</td>\n",
|
||||
" <td>0.815920</td>\n",
|
||||
" <td>25.238117</td>\n",
|
||||
" <td>5.067080e-07</td>\n",
|
||||
" <td>24.764140</td>\n",
|
||||
" <td>6.479173e-07</td>\n",
|
||||
" <td>0.000002</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>fiets</td>\n",
|
||||
" <td>0.288952</td>\n",
|
||||
" <td>0.445141</td>\n",
|
||||
" <td>-35.087579</td>\n",
|
||||
" <td>-0.623434</td>\n",
|
||||
" <td>17.676782</td>\n",
|
||||
" <td>2.618028e-05</td>\n",
|
||||
" <td>17.051468</td>\n",
|
||||
" <td>3.638025e-05</td>\n",
|
||||
" <td>0.000055</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
||||
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
||||
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
||||
"\n",
|
||||
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
||||
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
||||
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Significant according to Chi2\n",
|
||||
"results[results['p_chi2'] < 0.05]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 132
|
||||
},
|
||||
"executionInfo": {
|
||||
"elapsed": 166,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100928540,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "Mz4zAphE4dBY",
|
||||
"outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>word</th>\n",
|
||||
" <th>freq_recent</th>\n",
|
||||
" <th>freq_reference</th>\n",
|
||||
" <th>pct_diff</th>\n",
|
||||
" <th>log_ratio</th>\n",
|
||||
" <th>chi2</th>\n",
|
||||
" <th>p_chi2</th>\n",
|
||||
" <th>g2</th>\n",
|
||||
" <th>p_g2</th>\n",
|
||||
" <th>p_g2_adjusted</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>eend</td>\n",
|
||||
" <td>0.424929</td>\n",
|
||||
" <td>0.241379</td>\n",
|
||||
" <td>76.042088</td>\n",
|
||||
" <td>0.815920</td>\n",
|
||||
" <td>25.238117</td>\n",
|
||||
" <td>5.067080e-07</td>\n",
|
||||
" <td>24.764140</td>\n",
|
||||
" <td>6.479173e-07</td>\n",
|
||||
" <td>0.000002</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>fiets</td>\n",
|
||||
" <td>0.288952</td>\n",
|
||||
" <td>0.445141</td>\n",
|
||||
" <td>-35.087579</td>\n",
|
||||
" <td>-0.623434</td>\n",
|
||||
" <td>17.676782</td>\n",
|
||||
" <td>2.618028e-05</td>\n",
|
||||
" <td>17.051468</td>\n",
|
||||
" <td>3.638025e-05</td>\n",
|
||||
" <td>0.000055</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
||||
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
||||
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
||||
"\n",
|
||||
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
||||
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
||||
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Significant according to G2 (LLR)\n",
|
||||
"results[results['p_g2_adjusted'] < 0.05]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "JNCCUpdC4jK5"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
1
python/word_freq_comparison.ipynb.ori
Normal file
1
python/word_freq_comparison.ipynb.ori
Normal file
File diff suppressed because one or more lines are too long
28
r/test-count.R
Normal file
28
r/test-count.R
Normal file
@@ -0,0 +1,28 @@
|
||||
# Compare per-word counts between the newest list (nw) and the older
# reference list (od): write the merged table to tmp.csv and show an
# interactive log-log scatter plot of the two counts.

# Read a tab-separated "<count>\t<word>" file into a data frame with the
# columns f (count) and word.
read_counts <- function(path) {
  read.table(
    path,
    sep = "\t",
    quote = "",
    # Words may contain "#"; the default comment.char would cut the line there.
    comment.char = "",
    # read.table only recognises the exact spelling "UTF-8".
    encoding = "UTF-8",
    # Keep words as character vectors on R < 4.0 as well.
    stringsAsFactors = FALSE,
    col.names = c("f", "word")
  )
}

nw <- read_counts("data/2026/algemeen-count-per-2026.23-1")
od <- read_counts("data/2026/algemeen-count-per-2026.22-4")

# All words that occur in either list, in sorted order.
words <- unique(c(od$word, nw$word))
words <- words[order(words)]
n <- length(words)
d <- data.frame(word = words, od = rep(0, n), nw = rep(0, n))

# Look up every word's count in one vectorised step; a word that is missing
# from a list keeps the count 0.
nw_idx <- match(d$word, nw$word)
d$nw[!is.na(nw_idx)] <- nw$f[nw_idx[!is.na(nw_idx)]]
od_idx <- match(d$word, od$word)
d$od[!is.na(od_idx)] <- od$f[od_idx[!is.na(od_idx)]]

write.table(
  d,
  file = "tmp.csv", quote = FALSE, sep = "\t", row.names = FALSE,
  fileEncoding = "utf-8"
)

####

oud <- d$od
nieuw <- d$nw

# Replace zero counts by 0.5 so that their logarithm is finite.
oud[oud == 0] <- 0.5
nieuw[nieuw == 0] <- 0.5

plot(log(oud), log(nieuw))
# Reference line from the smallest to the largest count pair.
lines(log(range(oud)), log(range(nieuw)))
# Click on points in the plot to label them with their word.
identify(log(oud), log(nieuw), labels = words)
|
||||
32
r/test-rang.R
Normal file
32
r/test-rang.R
Normal file
@@ -0,0 +1,32 @@
|
||||
# Compare each word's position in the newest ranked list (nw) with its
# position in the older reference list (od), and plot the (at most) 40 words
# whose normalised rank improved the most.

# Read a tab-separated "<f>\t<word>" file into a data frame with the columns
# f and word. f is presumably the rank assigned by the `rang` tool -- confirm
# against the script that produces these files.
read_ranks <- function(path) {
  read.table(
    path,
    sep = "\t",
    quote = "",
    # Words may contain "#"; the default comment.char would cut the line there.
    comment.char = "",
    # read.table only recognises the exact spelling "UTF-8".
    encoding = "UTF-8",
    # Keep words as character vectors on R < 4.0 as well.
    stringsAsFactors = FALSE,
    col.names = c("f", "word")
  )
}

nw <- read_ranks("data/2026/algemeen-rang-per-2026.23-1")
od <- read_ranks("data/2026/algemeen-rang-per-2026.22-4")

# All words that occur in either list, in sorted order.
words <- unique(c(od$word, nw$word))
words <- words[order(words)]
n <- length(words)
d <- data.frame(word = words, od = rep(NA, n), nw = rep(NA, n))

# Vectorised lookup; words missing from a list stay NA for now.
d$nw <- nw$f[match(d$word, nw$word)]
d$od <- od$f[match(d$word, od$word)]

# A word that is absent from a list is placed just beyond the last entry of
# that list.
d$nw[is.na(d$nw)] <- max(nw$f) + 2
d$od[is.na(d$od)] <- max(od$f) + 2

# Plot `values` against their index and print `labels` to the right of each
# point. The x axis is widened by a fifth of the value range to leave room
# for the labels.
myplot <- function(values, labels, titel = "", sub = "") {
  y <- seq_along(values)
  xx <- range(values)
  plot(
    values, y,
    xlim = c(xx[1], xx[2] + (xx[2] - xx[1]) / 5),
    xlab = "score", ylab = "index", main = titel, sub = sub
  )
  text(values, y, labels, pos = 4)
}

# Normalise to [0, 1]: the first entry maps to 0, an absent word (max + 2)
# maps to 1.
nwn <- (d$nw - 1) / (max(nw$f) + 1)
odn <- (d$od - 1) / (max(od$f) + 1)

# Positive score: the word stands closer to the top in the new list.
v <- odn - nwn
# head() instead of [1:40], so that fewer than 40 words do not produce NA
# entries that break range() inside myplot.
top <- head(order(-v), 40)
myplot(
  v[top], words[top],
  titel = "score op basis van genormaliseerde rang",
  sub = "week 23 t.o.v. week 19 t/m 22"
)
|
||||
|
||||
Reference in New Issue
Block a user