Files
nlnieuws/python/word_freq_comparison.ipynb
Peter Kleiweg 01e6d48665 stijgers
2026-06-18 12:52:40 +02:00

501 lines
15 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"executionInfo": {
"elapsed": 1341,
"status": "ok",
"timestamp": 1781100698726,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "YMifluhW2rZp"
},
"outputs": [],
"source": [
"import numpy as np\n",
"from scipy.stats import chi2_contingency\n",
"from statsmodels.stats.multitest import multipletests\n",
"import pandas as pd\n",
"\n",
"def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
"    \"\"\"Compare a word's relative frequency in a recent vs. a reference corpus.\n",
"\n",
"    Builds the 2x2 contingency table (word / all other tokens) x\n",
"    (recent / reference) and runs a Pearson chi-squared test and a\n",
"    log-likelihood ratio (G²) test on it, both without continuity correction.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    word : the word being tested\n",
"    counts_recent : raw count in the recent corpus (e.g. week 5)\n",
"    counts_reference : raw count in the reference corpus (e.g. weeks 1-4)\n",
"    total_recent : total tokens in the recent corpus\n",
"    total_reference : total tokens in the reference corpus\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict with the keys word, freq_recent, freq_reference, pct_diff,\n",
"    log_ratio, chi2, p_chi2, g2 and p_g2. pct_diff is inf when the word\n",
"    does not occur in the reference corpus.\n",
"    \"\"\"\n",
"    a = counts_recent          # word in recent\n",
"    b = counts_reference       # word in reference\n",
"    c = total_recent - a       # non-word in recent\n",
"    d = total_reference - b    # non-word in reference\n",
"\n",
"    contingency = np.array([[a, b],\n",
"                            [c, d]])\n",
"\n",
"    # --- Chi-Squared ---\n",
"    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
"\n",
"    # --- Log-Likelihood (G²) ---\n",
"    # G² = 2 * sum(observed * log(observed / expected))\n",
"    # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this,\n",
"    # but only with correction=False: a 2x2 table has one degree of freedom,\n",
"    # so the default Yates correction would shift the observed counts first.\n",
"    g2_stat, p_g2, _, _ = chi2_contingency(\n",
"        contingency, correction=False, lambda_=\"log-likelihood\")\n",
"\n",
"    # --- Effect sizes ---\n",
"    freq_recent = a / total_recent\n",
"    freq_reference = b / total_reference\n",
"\n",
"    if freq_reference != 0:\n",
"        pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
"    else:\n",
"        # Word unseen in the reference corpus: the relative change is\n",
"        # unbounded, so report inf instead of dividing by zero.\n",
"        pct_diff = float(\"inf\") if freq_recent > 0 else float(\"nan\")\n",
"\n",
"    # Avoid log(0) with a small epsilon\n",
"    eps = 1e-9\n",
"    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
"\n",
"    return {\n",
"        \"word\": word,\n",
"        \"freq_recent\": freq_recent,\n",
"        \"freq_reference\": freq_reference,\n",
"        \"pct_diff\": pct_diff,\n",
"        \"log_ratio\": log_ratio,\n",
"        \"chi2\": chi2_stat,\n",
"        \"p_chi2\": p_chi2,\n",
"        \"g2\": g2_stat,\n",
"        \"p_g2\": p_g2,\n",
"    }\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"executionInfo": {
"elapsed": 38,
"status": "ok",
"timestamp": 1781100880331,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "mHH718-222BM"
},
"outputs": [],
"source": [
"# Toy vocabulary: raw token counts per word in each corpus.\n",
"counts_recent = dict(eend=150, tafel=101, fiets=102)\n",
"counts_reference = dict(eend=77, tafel=100, fiets=142)\n",
"\n",
"# Corpus sizes are the sums over the listed words.\n",
"total_recent, total_reference = (\n",
"    sum(counts.values()) for counts in (counts_recent, counts_reference))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"executionInfo": {
"elapsed": 7,
"status": "ok",
"timestamp": 1781100881153,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "urBml1212wxb"
},
"outputs": [],
"source": [
"# Test every word of the recent vocabulary against the reference corpus;\n",
"# a word missing from the reference counts is treated as count 0.\n",
"results = []\n",
"for word, n_recent in counts_recent.items():\n",
"    n_reference = counts_reference.get(word, 0)\n",
"    results.append(\n",
"        corpus_stats(word, n_recent, n_reference, total_recent, total_reference))\n",
"\n",
"# Correct the G2 p-values for multiple testing (Benjamini-Hochberg\n",
"# false discovery rate) across all tested words.\n",
"p_values = [row[\"p_g2\"] for row in results]\n",
"_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
"\n",
"# Store each adjusted p-value next to the raw statistics of its word.\n",
"for row, adjusted in zip(results, p_adjusted):\n",
"    row[\"p_g2_adjusted\"] = adjusted"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 163
},
"executionInfo": {
"elapsed": 12,
"status": "ok",
"timestamp": 1781100882491,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "-y3MUOgI3PFn",
"outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>tafel</td>\n",
" <td>0.286119</td>\n",
" <td>0.313480</td>\n",
" <td>-8.728045</td>\n",
" <td>-0.131756</td>\n",
" <td>0.598371</td>\n",
" <td>4.392004e-01</td>\n",
" <td>0.474701</td>\n",
" <td>4.908322e-01</td>\n",
" <td>0.490832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rebind `results` from the list of per-word dicts to a DataFrame\n",
"# (one row per word) and display it.\n",
"results = pd.DataFrame(results)\n",
"results"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132
},
"executionInfo": {
"elapsed": 65,
"status": "ok",
"timestamp": 1781100883685,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "nTpOtOka3ViF",
"outputId": "2430f959-eeb9-4670-da76-613406cbf473"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Significant according to Chi2, after the same Benjamini-Hochberg FDR\n",
"# correction that is applied to the G2 p-values. Filtering on the raw\n",
"# p-values would over-report hits once many words are tested at once.\n",
"# The adjusted values are kept local so the `results` columns stay unchanged.\n",
"p_chi2_adjusted = multipletests(results['p_chi2'], method=\"fdr_bh\")[1]\n",
"results[p_chi2_adjusted < 0.05]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 132
},
"executionInfo": {
"elapsed": 166,
"status": "ok",
"timestamp": 1781100928540,
"user": {
"displayName": "Andreas van Cranenburgh",
"userId": "13143063654677287265"
},
"user_tz": -120
},
"id": "Mz4zAphE4dBY",
"outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>freq_recent</th>\n",
" <th>freq_reference</th>\n",
" <th>pct_diff</th>\n",
" <th>log_ratio</th>\n",
" <th>chi2</th>\n",
" <th>p_chi2</th>\n",
" <th>g2</th>\n",
" <th>p_g2</th>\n",
" <th>p_g2_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>eend</td>\n",
" <td>0.424929</td>\n",
" <td>0.241379</td>\n",
" <td>76.042088</td>\n",
" <td>0.815920</td>\n",
" <td>25.238117</td>\n",
" <td>5.067080e-07</td>\n",
" <td>24.764140</td>\n",
" <td>6.479173e-07</td>\n",
" <td>0.000002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>fiets</td>\n",
" <td>0.288952</td>\n",
" <td>0.445141</td>\n",
" <td>-35.087579</td>\n",
" <td>-0.623434</td>\n",
" <td>17.676782</td>\n",
" <td>2.618028e-05</td>\n",
" <td>17.051468</td>\n",
" <td>3.638025e-05</td>\n",
" <td>0.000055</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
"\n",
" p_chi2 g2 p_g2 p_g2_adjusted \n",
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Significant according to G2 (LLR): rows whose FDR-adjusted p-value is below 0.05\n",
"results.loc[results['p_g2_adjusted'].lt(0.05)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "JNCCUpdC4jK5"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}