{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOWNAG6IZoh+ik4rqgeMAZj"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"YMifluhW2rZp","executionInfo":{"status":"ok","timestamp":1781100698726,"user_tz":-120,"elapsed":1341,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}}},"outputs":[],"source":["import numpy as np\n",
"from scipy.stats import chi2_contingency\n",
"from statsmodels.stats.multitest import multipletests\n",
"import pandas as pd\n",
"\n",
"def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
"    \"\"\"\n",
"    Compare one word's frequency in a recent period against a reference period.\n",
"\n",
"    word : the word being tested\n",
"    counts_recent : raw count in week 5\n",
"    counts_reference : raw count in weeks 1-4\n",
"    total_recent : total tokens in week 5\n",
"    total_reference : total tokens in weeks 1-4\n",
"\n",
"    Returns a dict with the word, its relative frequency in both periods,\n",
"    two effect sizes (pct_diff, log_ratio) and the chi-squared and G²\n",
"    statistics with their uncorrected p-values. pct_diff is +inf when the\n",
"    word does not occur in the reference period.\n",
"    \"\"\"\n",
"    a = counts_recent # word in recent\n",
"    b = counts_reference # word in reference\n",
"    c = total_recent - a # non-word in recent\n",
"    d = total_reference - b # non-word in reference\n",
"\n",
"    contingency = np.array([[a, b],\n",
"                            [c, d]])\n",
"\n",
"    # --- Chi-Squared ---\n",
"    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
"\n",
"    # --- Log-Likelihood (G²) ---\n",
"    # G² = 2 * sum(observed * log(observed / expected))\n",
"    # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this.\n",
"    # correction=False is needed here as well: for a 2x2 table (dof == 1)\n",
"    # scipy otherwise applies Yates' continuity correction to the observed\n",
"    # counts first, so the result would neither match the formula above nor\n",
"    # be comparable with the uncorrected chi-squared statistic.\n",
"    g2_stat, p_g2, _, _ = chi2_contingency(\n",
"        contingency, correction=False, lambda_=\"log-likelihood\")\n",
"\n",
"    # --- Effect sizes ---\n",
"    freq_recent = a / total_recent\n",
"    freq_reference = b / total_reference\n",
"\n",
"    # A word that never occurs in the reference period has no finite\n",
"    # percentage change; report +inf rather than dividing by zero.\n",
"    if freq_reference > 0:\n",
"        pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
"    else:\n",
"        pct_diff = np.inf\n",
"\n",
"    # Avoid log(0) with a small epsilon\n",
"    eps = 1e-9\n",
"    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
"\n",
"    return {\n",
"        \"word\": word,\n",
"        \"freq_recent\": freq_recent,\n",
"        \"freq_reference\": freq_reference,\n",
"        \"pct_diff\": pct_diff,\n",
"        \"log_ratio\": log_ratio,\n",
"        \"chi2\": chi2_stat,\n",
"        \"p_chi2\": p_chi2,\n",
"        \"g2\": g2_stat,\n",
"        \"p_g2\": p_g2,\n",
"    }\n"]},
{"cell_type":"code","source":["# Example data\n","counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n","counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n","total_recent = sum(counts_recent.values())\n","total_reference = sum(counts_reference.values())"],"metadata":{"id":"mHH718-222BM","executionInfo":{"status":"ok","timestamp":1781100880331,"user_tz":-120,"elapsed":38,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}}},"execution_count":10,"outputs":[]},{"cell_type":"code","source":["# Run tests on whole vocabulary, including correction for multiple tests\n","# (false discovery rate).\n","\n","results = [\n"," corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n"," total_recent, total_reference)\n"," for word in counts_recent]\n","\n","# FDR correction across all words\n","p_values = [r[\"p_g2\"] for r in results]\n","_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n","\n","for r, p_adj in zip(results, p_adjusted):\n"," r[\"p_g2_adjusted\"] = p_adj"],"metadata":{"id":"urBml1212wxb","executionInfo":{"status":"ok","timestamp":1781100881153,"user_tz":-120,"elapsed":7,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}}},"execution_count":11,"outputs":[]},{"cell_type":"code","source":["results = pd.DataFrame(results)\n","results"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":163},"id":"-y3MUOgI3PFn","executionInfo":{"status":"ok","timestamp":1781100882491,"user_tz":-120,"elapsed":12,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}},"outputId":"f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"},"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" word freq_recent freq_reference pct_diff log_ratio chi2 
\\\n","0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n","1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n","2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n","\n"," p_chi2 g2 p_g2 p_g2_adjusted \n","0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n","1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n","2 2.618028e-05 17.051468 3.638025e-05 0.000055 "],"text/html":["\n","
\n","
\n","\n","
\n"," \n"," \n"," | \n"," word | \n"," freq_recent | \n"," freq_reference | \n"," pct_diff | \n"," log_ratio | \n"," chi2 | \n"," p_chi2 | \n"," g2 | \n"," p_g2 | \n"," p_g2_adjusted | \n","
\n"," \n"," \n"," \n"," | 0 | \n"," eend | \n"," 0.424929 | \n"," 0.241379 | \n"," 76.042088 | \n"," 0.815920 | \n"," 25.238117 | \n"," 5.067080e-07 | \n"," 24.764140 | \n"," 6.479173e-07 | \n"," 0.000002 | \n","
\n"," \n"," | 1 | \n"," tafel | \n"," 0.286119 | \n"," 0.313480 | \n"," -8.728045 | \n"," -0.131756 | \n"," 0.598371 | \n"," 4.392004e-01 | \n"," 0.474701 | \n"," 4.908322e-01 | \n"," 0.490832 | \n","
\n"," \n"," | 2 | \n"," fiets | \n"," 0.288952 | \n"," 0.445141 | \n"," -35.087579 | \n"," -0.623434 | \n"," 17.676782 | \n"," 2.618028e-05 | \n"," 17.051468 | \n"," 3.638025e-05 | \n"," 0.000055 | \n","
\n"," \n","
\n","
\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"results","summary":"{\n \"name\": \"results\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"word\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"eend\",\n \"tafel\",\n \"fiets\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_recent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.07933697379148243,\n \"min\": 0.28611898016997167,\n \"max\": 0.42492917847025496,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.42492917847025496,\n 0.28611898016997167,\n 0.28895184135977336\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_reference\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1033215397950522,\n \"min\": 0.2413793103448276,\n \"max\": 0.445141065830721,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.2413793103448276,\n 0.31347962382445144,\n 0.445141065830721\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pct_diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 58.066920556065845,\n \"min\": -35.08757930016359,\n \"max\": 76.04208822339133,\n \"num_unique_values\": 3,\n \"samples\": [\n 76.04208822339133,\n -8.728045325779044,\n -35.08757930016359\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"log_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7316166431568862,\n \"min\": -0.6234335352876249,\n \"max\": 0.8159203877129316,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.8159203877129316,\n -0.13175646608914762,\n -0.6234335352876249\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.622486590913228,\n \"min\": 0.5983706960191795,\n \"max\": 25.238116861051893,\n \"num_unique_values\": 3,\n \"samples\": 
[\n 25.238116861051893,\n 0.5983706960191795,\n 17.67678188444784\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2535647399038483,\n \"min\": 5.067080224469542e-07,\n \"max\": 0.4392003554536562,\n \"num_unique_values\": 3,\n \"samples\": [\n 5.067080224469542e-07,\n 0.4392003554536562,\n 2.618028198517596e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.411361159991287,\n \"min\": 0.47470138151998675,\n \"max\": 24.76413991002474,\n \"num_unique_values\": 3,\n \"samples\": [\n 24.76413991002474,\n 0.47470138151998675,\n 17.051467521507384\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.28337142981219393,\n \"min\": 6.479173479592146e-07,\n \"max\": 0.4908322269559797,\n \"num_unique_values\": 3,\n \"samples\": [\n 6.479173479592146e-07,\n 0.4908322269559797,\n 3.638024934064761e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2_adjusted\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.28336580535875827,\n \"min\": 1.943752043877644e-06,\n \"max\": 0.4908322269559797,\n \"num_unique_values\": 3,\n \"samples\": [\n 1.943752043877644e-06,\n 0.4908322269559797,\n 5.457037401097142e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["# Significant according to Chi2\n","results[results['p_chi2'] < 0.05]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":132},"id":"nTpOtOka3ViF","executionInfo":{"status":"ok","timestamp":1781100883685,"user_tz":-120,"elapsed":65,"user":{"displayName":"Andreas van 
Cranenburgh","userId":"13143063654677287265"}},"outputId":"2430f959-eeb9-4670-da76-613406cbf473"},"execution_count":13,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n","0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n","2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n","\n"," p_chi2 g2 p_g2 p_g2_adjusted \n","0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n","2 2.618028e-05 17.051468 3.638025e-05 0.000055 "],"text/html":["\n"," \n","
\n","\n","
\n"," \n"," \n"," | \n"," word | \n"," freq_recent | \n"," freq_reference | \n"," pct_diff | \n"," log_ratio | \n"," chi2 | \n"," p_chi2 | \n"," g2 | \n"," p_g2 | \n"," p_g2_adjusted | \n","
\n"," \n"," \n"," \n"," | 0 | \n"," eend | \n"," 0.424929 | \n"," 0.241379 | \n"," 76.042088 | \n"," 0.815920 | \n"," 25.238117 | \n"," 5.067080e-07 | \n"," 24.764140 | \n"," 6.479173e-07 | \n"," 0.000002 | \n","
\n"," \n"," | 2 | \n"," fiets | \n"," 0.288952 | \n"," 0.445141 | \n"," -35.087579 | \n"," -0.623434 | \n"," 17.676782 | \n"," 2.618028e-05 | \n"," 17.051468 | \n"," 3.638025e-05 | \n"," 0.000055 | \n","
\n"," \n","
\n","
\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"results[results['p_chi2'] < 0\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"word\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"fiets\",\n \"eend\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_recent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.09615049715851072,\n \"min\": 0.28895184135977336,\n \"max\": 0.42492917847025496,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.28895184135977336,\n 0.42492917847025496\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_reference\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14408131905055044,\n \"min\": 0.2413793103448276,\n \"max\": 0.445141065830721,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.445141065830721,\n 0.2413793103448276\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pct_diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 78.58054149691213,\n \"min\": -35.08757930016359,\n \"max\": 76.04208822339133,\n \"num_unique_values\": 2,\n \"samples\": [\n -35.08757930016359,\n 76.04208822339133\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"log_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0177769194811532,\n \"min\": -0.6234335352876249,\n \"max\": 0.8159203877129316,\n \"num_unique_values\": 2,\n \"samples\": [\n -0.6234335352876249,\n 0.8159203877129316\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.346671236779749,\n \"min\": 17.67678188444784,\n \"max\": 25.238116861051893,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.67678188444784,\n 25.238116861051893\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n 
{\n \"column\": \"p_chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8153958246340062e-05,\n \"min\": 5.067080224469542e-07,\n \"max\": 2.618028198517596e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 2.618028198517596e-05,\n 5.067080224469542e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.453682946990868,\n \"min\": 17.051467521507384,\n \"max\": 24.76413991002474,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.051467521507384,\n 24.76413991002474\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5266574259638985e-05,\n \"min\": 6.479173479592146e-07,\n \"max\": 3.638024934064761e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 3.638024934064761e-05,\n 6.479173479592146e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2_adjusted\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.721264126387293e-05,\n \"min\": 1.943752043877644e-06,\n \"max\": 5.457037401097142e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 5.457037401097142e-05,\n 1.943752043877644e-06\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":13}]},{"cell_type":"code","source":["# Significant according to G2 (LLR)\n","results[results['p_g2_adjusted'] < 0.05]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":132},"id":"Mz4zAphE4dBY","executionInfo":{"status":"ok","timestamp":1781100928540,"user_tz":-120,"elapsed":166,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}},"outputId":"3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"},"execution_count":16,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n","0 eend 0.424929 0.241379 76.042088 0.815920 
25.238117 \n","2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n","\n"," p_chi2 g2 p_g2 p_g2_adjusted \n","0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n","2 2.618028e-05 17.051468 3.638025e-05 0.000055 "],"text/html":["\n"," \n","
\n","\n","
\n"," \n"," \n"," | \n"," word | \n"," freq_recent | \n"," freq_reference | \n"," pct_diff | \n"," log_ratio | \n"," chi2 | \n"," p_chi2 | \n"," g2 | \n"," p_g2 | \n"," p_g2_adjusted | \n","
\n"," \n"," \n"," \n"," | 0 | \n"," eend | \n"," 0.424929 | \n"," 0.241379 | \n"," 76.042088 | \n"," 0.815920 | \n"," 25.238117 | \n"," 5.067080e-07 | \n"," 24.764140 | \n"," 6.479173e-07 | \n"," 0.000002 | \n","
\n"," \n"," | 2 | \n"," fiets | \n"," 0.288952 | \n"," 0.445141 | \n"," -35.087579 | \n"," -0.623434 | \n"," 17.676782 | \n"," 2.618028e-05 | \n"," 17.051468 | \n"," 3.638025e-05 | \n"," 0.000055 | \n","
\n"," \n","
\n","
\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"results[results['p_g2_adjusted'] < 0\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"word\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"fiets\",\n \"eend\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_recent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.09615049715851072,\n \"min\": 0.28895184135977336,\n \"max\": 0.42492917847025496,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.28895184135977336,\n 0.42492917847025496\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_reference\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14408131905055044,\n \"min\": 0.2413793103448276,\n \"max\": 0.445141065830721,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.445141065830721,\n 0.2413793103448276\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pct_diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 78.58054149691213,\n \"min\": -35.08757930016359,\n \"max\": 76.04208822339133,\n \"num_unique_values\": 2,\n \"samples\": [\n -35.08757930016359,\n 76.04208822339133\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"log_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0177769194811532,\n \"min\": -0.6234335352876249,\n \"max\": 0.8159203877129316,\n \"num_unique_values\": 2,\n \"samples\": [\n -0.6234335352876249,\n 0.8159203877129316\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.346671236779749,\n \"min\": 17.67678188444784,\n \"max\": 25.238116861051893,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.67678188444784,\n 25.238116861051893\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n 
}\n },\n {\n \"column\": \"p_chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8153958246340062e-05,\n \"min\": 5.067080224469542e-07,\n \"max\": 2.618028198517596e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 2.618028198517596e-05,\n 5.067080224469542e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.453682946990868,\n \"min\": 17.051467521507384,\n \"max\": 24.76413991002474,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.051467521507384,\n 24.76413991002474\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5266574259638985e-05,\n \"min\": 6.479173479592146e-07,\n \"max\": 3.638024934064761e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 3.638024934064761e-05,\n 6.479173479592146e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2_adjusted\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.721264126387293e-05,\n \"min\": 1.943752043877644e-06,\n \"max\": 5.457037401097142e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 5.457037401097142e-05,\n 1.943752043877644e-06\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":16}]},{"cell_type":"code","source":[],"metadata":{"id":"JNCCUpdC4jK5"},"execution_count":null,"outputs":[]}]}