stijgers
This commit is contained in:
1
python/TODO.txt
Normal file
1
python/TODO.txt
Normal file
@@ -0,0 +1 @@
|
||||
python: notebook en pakketten installeren
|
||||
4762
python/data.txt
Normal file
4762
python/data.txt
Normal file
File diff suppressed because it is too large
Load Diff
1563
python/namen.ipynb
Normal file
1563
python/namen.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
82
python/namen.py
Executable file
82
python/namen.py
Executable file
@@ -0,0 +1,82 @@
|
||||
#!/net/corpora/nlnieuws/notebook/bin/python3
|
||||
|
||||
import numpy as np
|
||||
from scipy.stats import chi2_contingency
|
||||
from statsmodels.stats.multitest import multipletests
|
||||
import pandas as pd
|
||||
|
||||
# Setting only display.max_rows appears to do nothing: once a frame has more
# rows than max_rows, pandas prints a truncated view whose length is governed
# by display.min_rows (default 10). Raise both so 40 rows are actually shown.
pd.set_option('display.max_rows', 40)
pd.set_option('display.min_rows', 40)
|
||||
|
||||
def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):
    """Compare one word's frequency in a recent corpus with a reference corpus.

    A 2x2 contingency table (word vs. all other tokens, recent vs. reference)
    is tested with Pearson's chi-squared and with the log-likelihood ratio
    (G²). Both tests are run WITHOUT Yates' continuity correction so that the
    two statistics are comparable and G² matches its textbook definition.

    Parameters
    ----------
    word : the word being tested (copied into the result unchanged)
    counts_recent : count of the word in the recent corpus (e.g. week 5)
    counts_reference : count of the word in the reference corpus (e.g. weeks 1-4)
    total_recent : total tokens in the recent corpus
    total_reference : total tokens in the reference corpus

    Returns
    -------
    dict with keys "word", "freq_recent", "freq_reference", "pct_diff",
    "log_ratio", "chi2", "p_chi2", "g2" and "p_g2".
    "pct_diff" is ``inf`` when the word does not occur in the reference corpus.
    """
    a = counts_recent               # word in recent
    b = counts_reference            # word in reference
    c = total_recent - a            # non-word in recent
    d = total_reference - b         # non-word in reference

    contingency = np.array([[a, b],
                            [c, d]])

    # --- Chi-Squared ---
    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)

    # --- Log-Likelihood (G²) ---
    # G² = 2 * sum(observed * log(observed / expected))
    # scipy's chi2_contingency with lambda_="log-likelihood" computes this.
    # correction=False is required: for 2x2 tables scipy otherwise applies
    # Yates' continuity correction, which shifts the observed counts by 0.5
    # and no longer yields the formula above.
    g2_stat, p_g2, _, _ = chi2_contingency(
        contingency, correction=False, lambda_="log-likelihood")

    # --- Effect sizes ---
    freq_recent = a / total_recent
    freq_reference = b / total_reference

    if freq_reference:
        pct_diff = (freq_recent - freq_reference) / freq_reference * 100
    else:
        # Word unseen in the reference corpus: the relative increase is
        # unbounded, so report infinity instead of dividing by zero.
        pct_diff = float("inf")

    # Avoid log(0) with a small epsilon
    eps = 1e-9
    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))

    return {
        "word": word,
        "freq_recent": freq_recent,
        "freq_reference": freq_reference,
        "pct_diff": pct_diff,
        "log_ratio": log_ratio,
        "chi2": chi2_stat,
        "p_chi2": p_chi2,
        "g2": g2_stat,
        "p_g2": p_g2,
    }
|
||||
|
||||
# Load per-word counts from the tab-separated file data.txt. As read below,
# the first field is the word, the second its reference count and the third
# its recent count.
counts_recent = {}
counts_reference = {}
with open("data.txt", "rt", encoding="utf-8") as fp:
    for line in fp:
        if not line.strip():
            # Skip blank lines (e.g. a trailing empty line) instead of
            # crashing with an IndexError on the missing fields.
            continue
        fields = line.split("\t")
        # Zero counts are replaced by 0.5, presumably as smoothing so that
        # the ratios computed in corpus_stats never divide by zero.
        counts_reference[fields[0]] = max(int(fields[1]), 0.5)
        counts_recent[fields[0]] = max(int(fields[2]), 0.5)
total_recent = sum(counts_recent.values())
total_reference = sum(counts_reference.values())

# Run the tests for every word in the vocabulary.
results = [
    corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),
                 total_recent, total_reference)
    for word in counts_recent]

# FDR correction across all words
p_values = [r["p_g2"] for r in results]
_, p_adjusted, _, _ = multipletests(p_values, method="fdr_bh")

for r, p_adj in zip(results, p_adjusted):
    r["p_g2_adjusted"] = p_adj

# Show the table unsorted, then ordered by G² and by percentage difference.
results = pd.DataFrame(results)
print(results)
print(results.sort_values('g2'))
print(results.sort_values('pct_diff'))
|
||||
30
python/score.txt
Normal file
30
python/score.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
Er zijn twee simpele formules om de "effect size" van het verschil tussen twee
|
||||
relatieve frequenties te rapporteren:
|
||||
* %DIFF = (freq_B - freq_A) / freq_A * 100
|
||||
Percentage verschil in relatieve frequenties, makkelijk te interpreteren,
|
||||
maar niet symmetrisch.
|
||||
* Log Ratio: log2(freq_B / freq_A)
|
||||
Een symmetrische en interpreteerbare effect size; +1 is een verdubbeling, -1
|
||||
een halvering
|
||||
Twee populaire methodes om de significantie van frequentieverschillen te testen
|
||||
(ook wel keyword extraction):
|
||||
* Log-Likelihood Ratio (G^2): meest gebruikte methode in Corpus Linguistics.
|
||||
Vergelijkt observed vs expected frequency.
|
||||
* Chi-Squared test (X^2): simpeler dan G^2, maar geeft meer false positives
|
||||
bij sparse data, werkt niet goed met lage frequenties.
|
||||
Je kunt dan de gebruiker alleen de woorden met significante verschillen laten
|
||||
zien (dit zijn dan de keywords). Ik heb met behulp van Claude een notebook in
|
||||
elkaar gezet met een demonstratie van deze methodes:
|
||||
→ Word freq comparison.ipynb
|
||||
|
||||
|
||||
Er zijn ook geavanceerdere methodes die me te ingewikkeld lijken om te
|
||||
implementeren, maar ik noem ze voor de volledigheid. In de stylometrie
|
||||
is Burrows' Zeta populair, deze is bijv. beschikbaar in Stylo
|
||||
https://github.com/computationalstylistics/stylo onder de oppose()
|
||||
functie
|
||||
https://cran.r-project.org/web/packages/stylo/stylo.pdf#Rfn.oppose.1 .
|
||||
Er is ook een methode die gebruik maakt van Bayesiaanse statistiek en
|
||||
frequenties uit een achtergrondcorpus, de Fightin' Words methode van
|
||||
Monroe et al: https://github.com/jmhessel/FightingWords
|
||||
|
||||
8194
python/word_freq_comparison.html
Normal file
8194
python/word_freq_comparison.html
Normal file
File diff suppressed because one or more lines are too long
500
python/word_freq_comparison.ipynb
Normal file
500
python/word_freq_comparison.ipynb
Normal file
@@ -0,0 +1,500 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"executionInfo": {
|
||||
"elapsed": 1341,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100698726,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "YMifluhW2rZp"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"from scipy.stats import chi2_contingency\n",
|
||||
"from statsmodels.stats.multitest import multipletests\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
|
||||
" \"\"\"\n",
|
||||
" word : the word being tested\n",
|
||||
" counts_recent : raw count in week 5\n",
|
||||
" counts_reference : raw count in weeks 1-4\n",
|
||||
" total_recent : total tokens in week 5\n",
|
||||
" total_reference : total tokens in weeks 1-4\n",
|
||||
" \"\"\"\n",
|
||||
" a = counts_recent # word in recent\n",
|
||||
" b = counts_reference # word in reference\n",
|
||||
" c = total_recent - a # non-word in recent\n",
|
||||
" d = total_reference - b # non-word in reference\n",
|
||||
"\n",
|
||||
" contingency = np.array([[a, b],\n",
|
||||
" [c, d]])\n",
|
||||
"\n",
|
||||
" # --- Chi-Squared ---\n",
|
||||
" chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
|
||||
"\n",
|
||||
" # --- Log-Likelihood (G²) ---\n",
|
||||
" # G² = 2 * sum(observed * log(observed / expected))\n",
|
||||
" # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
|
||||
" g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
|
||||
"\n",
|
||||
" # --- Effect sizes ---\n",
|
||||
" freq_recent = a / total_recent\n",
|
||||
" freq_reference = b / total_reference\n",
|
||||
"\n",
|
||||
" pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
|
||||
"\n",
|
||||
" # Avoid log(0) with a small epsilon\n",
|
||||
" eps = 1e-9\n",
|
||||
" log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
|
||||
"\n",
|
||||
" return {\n",
|
||||
" \"word\": word,\n",
|
||||
" \"freq_recent\": freq_recent,\n",
|
||||
" \"freq_reference\": freq_reference,\n",
|
||||
" \"pct_diff\": pct_diff,\n",
|
||||
" \"log_ratio\": log_ratio,\n",
|
||||
" \"chi2\": chi2_stat,\n",
|
||||
" \"p_chi2\": p_chi2,\n",
|
||||
" \"g2\": g2_stat,\n",
|
||||
" \"p_g2\": p_g2,\n",
|
||||
" }\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"executionInfo": {
|
||||
"elapsed": 38,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100880331,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "mHH718-222BM"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Example data\n",
|
||||
"counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
|
||||
"counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
|
||||
"total_recent = sum(counts_recent.values())\n",
|
||||
"total_reference = sum(counts_reference.values())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"executionInfo": {
|
||||
"elapsed": 7,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100881153,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "urBml1212wxb"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Run tests on whole vocabulary, including correction for multiple tests\n",
|
||||
"# (false discovery rate).\n",
|
||||
"\n",
|
||||
"results = [\n",
|
||||
" corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n",
|
||||
" total_recent, total_reference)\n",
|
||||
" for word in counts_recent]\n",
|
||||
"\n",
|
||||
"# FDR correction across all words\n",
|
||||
"p_values = [r[\"p_g2\"] for r in results]\n",
|
||||
"_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
|
||||
"\n",
|
||||
"for r, p_adj in zip(results, p_adjusted):\n",
|
||||
" r[\"p_g2_adjusted\"] = p_adj"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 163
|
||||
},
|
||||
"executionInfo": {
|
||||
"elapsed": 12,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100882491,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "-y3MUOgI3PFn",
|
||||
"outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>word</th>\n",
|
||||
" <th>freq_recent</th>\n",
|
||||
" <th>freq_reference</th>\n",
|
||||
" <th>pct_diff</th>\n",
|
||||
" <th>log_ratio</th>\n",
|
||||
" <th>chi2</th>\n",
|
||||
" <th>p_chi2</th>\n",
|
||||
" <th>g2</th>\n",
|
||||
" <th>p_g2</th>\n",
|
||||
" <th>p_g2_adjusted</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>eend</td>\n",
|
||||
" <td>0.424929</td>\n",
|
||||
" <td>0.241379</td>\n",
|
||||
" <td>76.042088</td>\n",
|
||||
" <td>0.815920</td>\n",
|
||||
" <td>25.238117</td>\n",
|
||||
" <td>5.067080e-07</td>\n",
|
||||
" <td>24.764140</td>\n",
|
||||
" <td>6.479173e-07</td>\n",
|
||||
" <td>0.000002</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>tafel</td>\n",
|
||||
" <td>0.286119</td>\n",
|
||||
" <td>0.313480</td>\n",
|
||||
" <td>-8.728045</td>\n",
|
||||
" <td>-0.131756</td>\n",
|
||||
" <td>0.598371</td>\n",
|
||||
" <td>4.392004e-01</td>\n",
|
||||
" <td>0.474701</td>\n",
|
||||
" <td>4.908322e-01</td>\n",
|
||||
" <td>0.490832</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>fiets</td>\n",
|
||||
" <td>0.288952</td>\n",
|
||||
" <td>0.445141</td>\n",
|
||||
" <td>-35.087579</td>\n",
|
||||
" <td>-0.623434</td>\n",
|
||||
" <td>17.676782</td>\n",
|
||||
" <td>2.618028e-05</td>\n",
|
||||
" <td>17.051468</td>\n",
|
||||
" <td>3.638025e-05</td>\n",
|
||||
" <td>0.000055</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
||||
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
||||
"1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n",
|
||||
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
||||
"\n",
|
||||
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
||||
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
||||
"1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n",
|
||||
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"results = pd.DataFrame(results)\n",
|
||||
"results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 132
|
||||
},
|
||||
"executionInfo": {
|
||||
"elapsed": 65,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100883685,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "nTpOtOka3ViF",
|
||||
"outputId": "2430f959-eeb9-4670-da76-613406cbf473"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>word</th>\n",
|
||||
" <th>freq_recent</th>\n",
|
||||
" <th>freq_reference</th>\n",
|
||||
" <th>pct_diff</th>\n",
|
||||
" <th>log_ratio</th>\n",
|
||||
" <th>chi2</th>\n",
|
||||
" <th>p_chi2</th>\n",
|
||||
" <th>g2</th>\n",
|
||||
" <th>p_g2</th>\n",
|
||||
" <th>p_g2_adjusted</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>eend</td>\n",
|
||||
" <td>0.424929</td>\n",
|
||||
" <td>0.241379</td>\n",
|
||||
" <td>76.042088</td>\n",
|
||||
" <td>0.815920</td>\n",
|
||||
" <td>25.238117</td>\n",
|
||||
" <td>5.067080e-07</td>\n",
|
||||
" <td>24.764140</td>\n",
|
||||
" <td>6.479173e-07</td>\n",
|
||||
" <td>0.000002</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>fiets</td>\n",
|
||||
" <td>0.288952</td>\n",
|
||||
" <td>0.445141</td>\n",
|
||||
" <td>-35.087579</td>\n",
|
||||
" <td>-0.623434</td>\n",
|
||||
" <td>17.676782</td>\n",
|
||||
" <td>2.618028e-05</td>\n",
|
||||
" <td>17.051468</td>\n",
|
||||
" <td>3.638025e-05</td>\n",
|
||||
" <td>0.000055</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
||||
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
||||
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
||||
"\n",
|
||||
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
||||
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
||||
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Significant according to Chi2\n",
|
||||
"results[results['p_chi2'] < 0.05]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 132
|
||||
},
|
||||
"executionInfo": {
|
||||
"elapsed": 166,
|
||||
"status": "ok",
|
||||
"timestamp": 1781100928540,
|
||||
"user": {
|
||||
"displayName": "Andreas van Cranenburgh",
|
||||
"userId": "13143063654677287265"
|
||||
},
|
||||
"user_tz": -120
|
||||
},
|
||||
"id": "Mz4zAphE4dBY",
|
||||
"outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>word</th>\n",
|
||||
" <th>freq_recent</th>\n",
|
||||
" <th>freq_reference</th>\n",
|
||||
" <th>pct_diff</th>\n",
|
||||
" <th>log_ratio</th>\n",
|
||||
" <th>chi2</th>\n",
|
||||
" <th>p_chi2</th>\n",
|
||||
" <th>g2</th>\n",
|
||||
" <th>p_g2</th>\n",
|
||||
" <th>p_g2_adjusted</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>eend</td>\n",
|
||||
" <td>0.424929</td>\n",
|
||||
" <td>0.241379</td>\n",
|
||||
" <td>76.042088</td>\n",
|
||||
" <td>0.815920</td>\n",
|
||||
" <td>25.238117</td>\n",
|
||||
" <td>5.067080e-07</td>\n",
|
||||
" <td>24.764140</td>\n",
|
||||
" <td>6.479173e-07</td>\n",
|
||||
" <td>0.000002</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>fiets</td>\n",
|
||||
" <td>0.288952</td>\n",
|
||||
" <td>0.445141</td>\n",
|
||||
" <td>-35.087579</td>\n",
|
||||
" <td>-0.623434</td>\n",
|
||||
" <td>17.676782</td>\n",
|
||||
" <td>2.618028e-05</td>\n",
|
||||
" <td>17.051468</td>\n",
|
||||
" <td>3.638025e-05</td>\n",
|
||||
" <td>0.000055</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
||||
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
||||
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
||||
"\n",
|
||||
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
||||
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
||||
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Significant according to G2 (LLR)\n",
|
||||
"results[results['p_g2_adjusted'] < 0.05]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "JNCCUpdC4jK5"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
1
python/word_freq_comparison.ipynb.ori
Normal file
1
python/word_freq_comparison.ipynb.ori
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user