501 lines
15 KiB
Plaintext
501 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {
  "executionInfo": {
   "elapsed": 1341,
   "status": "ok",
   "timestamp": 1781100698726,
   "user": {
    "displayName": "Andreas van Cranenburgh",
    "userId": "13143063654677287265"
   },
   "user_tz": -120
  },
  "id": "YMifluhW2rZp"
 },
 "outputs": [],
 "source": [
  "import numpy as np\n",
  "from scipy.stats import chi2_contingency\n",
  "from statsmodels.stats.multitest import multipletests\n",
  "import pandas as pd\n",
  "\n",
  "def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
  "    \"\"\"Compare one word's relative frequency between two corpora.\n",
  "\n",
  "    Builds the 2x2 contingency table [[a, b], [c, d]] (word vs. all other\n",
  "    tokens, recent vs. reference corpus) and runs a Pearson chi-squared test\n",
  "    and a log-likelihood ratio (G²) test on it. Both tests are run without\n",
  "    Yates' continuity correction, so the two statistics are comparable.\n",
  "\n",
  "    Parameters:\n",
  "    word             : the word being tested\n",
  "    counts_recent    : raw count in week 5\n",
  "    counts_reference : raw count in weeks 1-4\n",
  "    total_recent     : total tokens in week 5\n",
  "    total_reference  : total tokens in weeks 1-4\n",
  "\n",
  "    Returns a dict with the keys word, freq_recent, freq_reference,\n",
  "    pct_diff, log_ratio, chi2, p_chi2, g2 and p_g2.\n",
  "    pct_diff is the relative change in percent; it is inf when the word\n",
  "    does not occur in the reference corpus at all.\n",
  "    log_ratio is the base-2 log of the (epsilon-smoothed) frequency ratio.\n",
  "    \"\"\"\n",
  "    a = counts_recent               # word in recent\n",
  "    b = counts_reference            # word in reference\n",
  "    c = total_recent - a            # non-word in recent\n",
  "    d = total_reference - b         # non-word in reference\n",
  "\n",
  "    contingency = np.array([[a, b],\n",
  "                            [c, d]])\n",
  "\n",
  "    # --- Chi-Squared ---\n",
  "    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
  "\n",
  "    # --- Log-Likelihood (G²) ---\n",
  "    # G² = 2 * sum(observed * log(observed / expected))\n",
  "    # correction=False is required here as well: by default scipy applies\n",
  "    # Yates' correction to every 2x2 table, whatever lambda_ is, which shifts\n",
  "    # the observed counts by 0.5 and no longer matches the formula above.\n",
  "    g2_stat, p_g2, _, _ = chi2_contingency(\n",
  "        contingency, correction=False, lambda_=\"log-likelihood\")\n",
  "\n",
  "    # --- Effect sizes ---\n",
  "    freq_recent = a / total_recent\n",
  "    freq_reference = b / total_reference\n",
  "\n",
  "    if freq_reference == 0:\n",
  "        # Word unseen in the reference corpus: the relative increase is\n",
  "        # unbounded, so report inf instead of dividing by zero.\n",
  "        pct_diff = np.inf if freq_recent > 0 else np.nan\n",
  "    else:\n",
  "        pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
  "\n",
  "    # Avoid log(0) with a small epsilon\n",
  "    eps = 1e-9\n",
  "    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
  "\n",
  "    return {\n",
  "        \"word\": word,\n",
  "        \"freq_recent\": freq_recent,\n",
  "        \"freq_reference\": freq_reference,\n",
  "        \"pct_diff\": pct_diff,\n",
  "        \"log_ratio\": log_ratio,\n",
  "        \"chi2\": chi2_stat,\n",
  "        \"p_chi2\": p_chi2,\n",
  "        \"g2\": g2_stat,\n",
  "        \"p_g2\": p_g2,\n",
  "    }\n"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": 2,
 "metadata": {
  "executionInfo": {
   "elapsed": 38,
   "status": "ok",
   "timestamp": 1781100880331,
   "user": {
    "displayName": "Andreas van Cranenburgh",
    "userId": "13143063654677287265"
   },
   "user_tz": -120
  },
  "id": "mHH718-222BM"
 },
 "outputs": [],
 "source": [
  "# Toy example: raw per-word counts for the two corpora being compared.\n",
  "counts_recent = {\n",
  "    'eend': 150,\n",
  "    'tafel': 101,\n",
  "    'fiets': 102,\n",
  "}\n",
  "counts_reference = {\n",
  "    'eend': 77,\n",
  "    'tafel': 100,\n",
  "    'fiets': 142,\n",
  "}\n",
  "\n",
  "# Corpus sizes are the sums of the counts above.\n",
  "total_recent, total_reference = (\n",
  "    sum(counts.values()) for counts in (counts_recent, counts_reference))"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": 3,
 "metadata": {
  "executionInfo": {
   "elapsed": 7,
   "status": "ok",
   "timestamp": 1781100881153,
   "user": {
    "displayName": "Andreas van Cranenburgh",
    "userId": "13143063654677287265"
   },
   "user_tz": -120
  },
  "id": "urBml1212wxb"
 },
 "outputs": [],
 "source": [
  "# Run tests on whole vocabulary, including correction for multiple tests\n",
  "# (false discovery rate).\n",
  "\n",
  "# Whole vocabulary = union of both corpora: words of the recent corpus first\n",
  "# (in their original order), then words that only occur in the reference\n",
  "# corpus. Without the second part, words that disappeared would never be tested.\n",
  "vocabulary = list(counts_recent) + [\n",
  "    word for word in counts_reference if word not in counts_recent]\n",
  "\n",
  "results = [\n",
  "    corpus_stats(word, counts_recent.get(word, 0), counts_reference.get(word, 0),\n",
  "                 total_recent, total_reference)\n",
  "    for word in vocabulary]\n",
  "\n",
  "# FDR correction across all words\n",
  "p_values = [r[\"p_g2\"] for r in results]\n",
  "_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n",
  "\n",
  "# Attach the adjusted p-value to each word's result dict (same order as p_values).\n",
  "for r, p_adj in zip(results, p_adjusted):\n",
  "    r[\"p_g2_adjusted\"] = p_adj"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 163
|
|
},
|
|
"executionInfo": {
|
|
"elapsed": 12,
|
|
"status": "ok",
|
|
"timestamp": 1781100882491,
|
|
"user": {
|
|
"displayName": "Andreas van Cranenburgh",
|
|
"userId": "13143063654677287265"
|
|
},
|
|
"user_tz": -120
|
|
},
|
|
"id": "-y3MUOgI3PFn",
|
|
"outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>word</th>\n",
|
|
" <th>freq_recent</th>\n",
|
|
" <th>freq_reference</th>\n",
|
|
" <th>pct_diff</th>\n",
|
|
" <th>log_ratio</th>\n",
|
|
" <th>chi2</th>\n",
|
|
" <th>p_chi2</th>\n",
|
|
" <th>g2</th>\n",
|
|
" <th>p_g2</th>\n",
|
|
" <th>p_g2_adjusted</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>eend</td>\n",
|
|
" <td>0.424929</td>\n",
|
|
" <td>0.241379</td>\n",
|
|
" <td>76.042088</td>\n",
|
|
" <td>0.815920</td>\n",
|
|
" <td>25.238117</td>\n",
|
|
" <td>5.067080e-07</td>\n",
|
|
" <td>24.764140</td>\n",
|
|
" <td>6.479173e-07</td>\n",
|
|
" <td>0.000002</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>tafel</td>\n",
|
|
" <td>0.286119</td>\n",
|
|
" <td>0.313480</td>\n",
|
|
" <td>-8.728045</td>\n",
|
|
" <td>-0.131756</td>\n",
|
|
" <td>0.598371</td>\n",
|
|
" <td>4.392004e-01</td>\n",
|
|
" <td>0.474701</td>\n",
|
|
" <td>4.908322e-01</td>\n",
|
|
" <td>0.490832</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>fiets</td>\n",
|
|
" <td>0.288952</td>\n",
|
|
" <td>0.445141</td>\n",
|
|
" <td>-35.087579</td>\n",
|
|
" <td>-0.623434</td>\n",
|
|
" <td>17.676782</td>\n",
|
|
" <td>2.618028e-05</td>\n",
|
|
" <td>17.051468</td>\n",
|
|
" <td>3.638025e-05</td>\n",
|
|
" <td>0.000055</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
|
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
|
"1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n",
|
|
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
|
"\n",
|
|
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
|
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
|
"1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n",
|
|
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
"source": [
 "# Turn the list of per-word result dicts into a DataFrame (one row per word).\n",
 "# NOTE: this rebinds `results` from a list to a DataFrame; the filtering cells below index it as a DataFrame.\n",
 "results = pd.DataFrame(results)\n",
 "results"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 132
|
|
},
|
|
"executionInfo": {
|
|
"elapsed": 65,
|
|
"status": "ok",
|
|
"timestamp": 1781100883685,
|
|
"user": {
|
|
"displayName": "Andreas van Cranenburgh",
|
|
"userId": "13143063654677287265"
|
|
},
|
|
"user_tz": -120
|
|
},
|
|
"id": "nTpOtOka3ViF",
|
|
"outputId": "2430f959-eeb9-4670-da76-613406cbf473"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>word</th>\n",
|
|
" <th>freq_recent</th>\n",
|
|
" <th>freq_reference</th>\n",
|
|
" <th>pct_diff</th>\n",
|
|
" <th>log_ratio</th>\n",
|
|
" <th>chi2</th>\n",
|
|
" <th>p_chi2</th>\n",
|
|
" <th>g2</th>\n",
|
|
" <th>p_g2</th>\n",
|
|
" <th>p_g2_adjusted</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>eend</td>\n",
|
|
" <td>0.424929</td>\n",
|
|
" <td>0.241379</td>\n",
|
|
" <td>76.042088</td>\n",
|
|
" <td>0.815920</td>\n",
|
|
" <td>25.238117</td>\n",
|
|
" <td>5.067080e-07</td>\n",
|
|
" <td>24.764140</td>\n",
|
|
" <td>6.479173e-07</td>\n",
|
|
" <td>0.000002</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>fiets</td>\n",
|
|
" <td>0.288952</td>\n",
|
|
" <td>0.445141</td>\n",
|
|
" <td>-35.087579</td>\n",
|
|
" <td>-0.623434</td>\n",
|
|
" <td>17.676782</td>\n",
|
|
" <td>2.618028e-05</td>\n",
|
|
" <td>17.051468</td>\n",
|
|
" <td>3.638025e-05</td>\n",
|
|
" <td>0.000055</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
|
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
|
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
|
"\n",
|
|
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
|
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
|
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
"source": [
 "# Significant according to Chi2.\n",
 "# This filters on the raw p_chi2 values: no multiple-testing correction is applied to them.\n",
 "results[results['p_chi2'] < 0.05]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 132
|
|
},
|
|
"executionInfo": {
|
|
"elapsed": 166,
|
|
"status": "ok",
|
|
"timestamp": 1781100928540,
|
|
"user": {
|
|
"displayName": "Andreas van Cranenburgh",
|
|
"userId": "13143063654677287265"
|
|
},
|
|
"user_tz": -120
|
|
},
|
|
"id": "Mz4zAphE4dBY",
|
|
"outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>word</th>\n",
|
|
" <th>freq_recent</th>\n",
|
|
" <th>freq_reference</th>\n",
|
|
" <th>pct_diff</th>\n",
|
|
" <th>log_ratio</th>\n",
|
|
" <th>chi2</th>\n",
|
|
" <th>p_chi2</th>\n",
|
|
" <th>g2</th>\n",
|
|
" <th>p_g2</th>\n",
|
|
" <th>p_g2_adjusted</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>eend</td>\n",
|
|
" <td>0.424929</td>\n",
|
|
" <td>0.241379</td>\n",
|
|
" <td>76.042088</td>\n",
|
|
" <td>0.815920</td>\n",
|
|
" <td>25.238117</td>\n",
|
|
" <td>5.067080e-07</td>\n",
|
|
" <td>24.764140</td>\n",
|
|
" <td>6.479173e-07</td>\n",
|
|
" <td>0.000002</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>fiets</td>\n",
|
|
" <td>0.288952</td>\n",
|
|
" <td>0.445141</td>\n",
|
|
" <td>-35.087579</td>\n",
|
|
" <td>-0.623434</td>\n",
|
|
" <td>17.676782</td>\n",
|
|
" <td>2.618028e-05</td>\n",
|
|
" <td>17.051468</td>\n",
|
|
" <td>3.638025e-05</td>\n",
|
|
" <td>0.000055</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n",
|
|
"0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n",
|
|
"2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n",
|
|
"\n",
|
|
" p_chi2 g2 p_g2 p_g2_adjusted \n",
|
|
"0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n",
|
|
"2 2.618028e-05 17.051468 3.638025e-05 0.000055 "
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
"source": [
 "# Significant according to G2 (LLR).\n",
 "# This filters on p_g2_adjusted, i.e. the G2 p-values after the FDR correction (multipletests, method fdr_bh).\n",
 "results[results['p_g2_adjusted'] < 0.05]"
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "JNCCUpdC4jK5"
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|