Files
nlnieuws/python/word_freq_comparison.ipynb.ori
Peter Kleiweg 01e6d48665 stijgers
2026-06-18 12:52:40 +02:00

1 line
35 KiB
Plaintext

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOWNAG6IZoh+ik4rqgeMAZj"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"YMifluhW2rZp","executionInfo":{"status":"ok","timestamp":1781100698726,"user_tz":-120,"elapsed":1341,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}}},"outputs":[],"source":["import numpy as np\n","from scipy.stats import chi2_contingency\n","from statsmodels.stats.multitest import multipletests\n","import pandas as pd\n","\n","def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n"," \"\"\"\n"," word : the word being tested\n"," counts_recent : raw count in week 5\n"," counts_reference : raw count in weeks 1-4\n"," total_recent : total tokens in week 5\n"," total_reference : total tokens in weeks 1-4\n"," \"\"\"\n"," a = counts_recent # word in recent\n"," b = counts_reference # word in reference\n"," c = total_recent - a # non-word in recent\n"," d = total_reference - b # non-word in reference\n","\n"," contingency = np.array([[a, b],\n"," [c, d]])\n","\n"," # --- Chi-Squared ---\n"," chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n","\n"," # --- Log-Likelihood (G²) ---\n"," # G² = 2 * sum(observed * log(observed / expected))\n"," # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n"," g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n","\n"," # --- Effect sizes ---\n"," freq_recent = a / total_recent\n"," freq_reference = b / total_reference\n","\n"," pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n","\n"," # Avoid log(0) with a small epsilon\n"," eps = 1e-9\n"," log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n","\n"," return {\n"," \"word\": word,\n"," \"freq_recent\": freq_recent,\n"," \"freq_reference\": freq_reference,\n"," \"pct_diff\": pct_diff,\n"," \"log_ratio\": log_ratio,\n"," \"chi2\": chi2_stat,\n"," \"p_chi2\": p_chi2,\n"," \"g2\": g2_stat,\n"," \"p_g2\": p_g2,\n"," }\n"]},{"cell_type":"code","source":["# Example data\n","counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n","counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n","total_recent = sum(counts_recent.values())\n","total_reference = sum(counts_reference.values())"],"metadata":{"id":"mHH718-222BM","executionInfo":{"status":"ok","timestamp":1781100880331,"user_tz":-120,"elapsed":38,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}}},"execution_count":10,"outputs":[]},{"cell_type":"code","source":["# Run tests on whole vocabulary, including correction for multiple tests\n","# (false discovery rate).\n","\n","results = [\n"," corpus_stats(word, counts_recent[word], counts_reference.get(word, 0),\n"," total_recent, total_reference)\n"," for word in counts_recent]\n","\n","# FDR correction across all words\n","p_values = [r[\"p_g2\"] for r in results]\n","_, p_adjusted, _, _ = multipletests(p_values, method=\"fdr_bh\")\n","\n","for r, p_adj in zip(results, p_adjusted):\n"," r[\"p_g2_adjusted\"] = p_adj"],"metadata":{"id":"urBml1212wxb","executionInfo":{"status":"ok","timestamp":1781100881153,"user_tz":-120,"elapsed":7,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}}},"execution_count":11,"outputs":[]},{"cell_type":"code","source":["results = pd.DataFrame(results)\n","results"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":163},"id":"-y3MUOgI3PFn","executionInfo":{"status":"ok","timestamp":1781100882491,"user_tz":-120,"elapsed":12,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}},"outputId":"f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"},"execution_count":12,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n","0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n","1 tafel 0.286119 0.313480 -8.728045 -0.131756 0.598371 \n","2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n","\n"," p_chi2 g2 p_g2 p_g2_adjusted \n","0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n","1 4.392004e-01 0.474701 4.908322e-01 0.490832 \n","2 2.618028e-05 17.051468 3.638025e-05 0.000055 "],"text/html":["\n"," <div id=\"df-c9b2768a-e9e3-4204-a57c-b02ed4ef374e\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>word</th>\n"," <th>freq_recent</th>\n"," <th>freq_reference</th>\n"," <th>pct_diff</th>\n"," <th>log_ratio</th>\n"," <th>chi2</th>\n"," <th>p_chi2</th>\n"," <th>g2</th>\n"," <th>p_g2</th>\n"," <th>p_g2_adjusted</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>eend</td>\n"," <td>0.424929</td>\n"," <td>0.241379</td>\n"," <td>76.042088</td>\n"," <td>0.815920</td>\n"," <td>25.238117</td>\n"," <td>5.067080e-07</td>\n"," <td>24.764140</td>\n"," <td>6.479173e-07</td>\n"," <td>0.000002</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>tafel</td>\n"," <td>0.286119</td>\n"," <td>0.313480</td>\n"," <td>-8.728045</td>\n"," <td>-0.131756</td>\n"," <td>0.598371</td>\n"," <td>4.392004e-01</td>\n"," <td>0.474701</td>\n"," <td>4.908322e-01</td>\n"," <td>0.490832</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>fiets</td>\n"," <td>0.288952</td>\n"," <td>0.445141</td>\n"," <td>-35.087579</td>\n"," <td>-0.623434</td>\n"," <td>17.676782</td>\n"," <td>2.618028e-05</td>\n"," <td>17.051468</td>\n"," <td>3.638025e-05</td>\n"," <td>0.000055</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c9b2768a-e9e3-4204-a57c-b02ed4ef374e')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-c9b2768a-e9e3-4204-a57c-b02ed4ef374e button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-c9b2768a-e9e3-4204-a57c-b02ed4ef374e');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n"," <div id=\"id_9baad436-bf82-4412-879c-09e11c1c8d16\">\n"," <style>\n"," .colab-df-generate {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-generate:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-generate {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-generate:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n"," <button class=\"colab-df-generate\" onclick=\"generateWithVariable('results')\"\n"," title=\"Generate code using this dataframe.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M7,19H8.4L18.45,9,17,7.55,7,17.6ZM5,21V16.75L18.45,3.32a2,2,0,0,1,2.83,0l1.4,1.43a1.91,1.91,0,0,1,.58,1.4,1.91,1.91,0,0,1-.58,1.4L9.25,21ZM18.45,9,17,7.55Zm-12,3A5.31,5.31,0,0,0,4.9,8.1,5.31,5.31,0,0,0,1,6.5,5.31,5.31,0,0,0,4.9,4.9,5.31,5.31,0,0,0,6.5,1,5.31,5.31,0,0,0,8.1,4.9,5.31,5.31,0,0,0,12,6.5,5.46,5.46,0,0,0,6.5,12Z\"/>\n"," </svg>\n"," </button>\n"," <script>\n"," (() => {\n"," const buttonEl =\n"," document.querySelector('#id_9baad436-bf82-4412-879c-09e11c1c8d16 button.colab-df-generate');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," buttonEl.onclick = () => {\n"," google.colab.notebook.generateWithVariable('results');\n"," }\n"," })();\n"," </script>\n"," </div>\n","\n"," </div>\n"," </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"results","summary":"{\n \"name\": \"results\",\n \"rows\": 3,\n \"fields\": [\n {\n \"column\": \"word\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"eend\",\n \"tafel\",\n \"fiets\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_recent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.07933697379148243,\n \"min\": 0.28611898016997167,\n \"max\": 0.42492917847025496,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.42492917847025496,\n 0.28611898016997167,\n 0.28895184135977336\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_reference\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.1033215397950522,\n \"min\": 0.2413793103448276,\n \"max\": 0.445141065830721,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.2413793103448276,\n 0.31347962382445144,\n 0.445141065830721\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pct_diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 58.066920556065845,\n \"min\": -35.08757930016359,\n \"max\": 76.04208822339133,\n \"num_unique_values\": 3,\n \"samples\": [\n 76.04208822339133,\n -8.728045325779044,\n -35.08757930016359\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"log_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.7316166431568862,\n \"min\": -0.6234335352876249,\n \"max\": 0.8159203877129316,\n \"num_unique_values\": 3,\n \"samples\": [\n 0.8159203877129316,\n -0.13175646608914762,\n -0.6234335352876249\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.622486590913228,\n \"min\": 0.5983706960191795,\n \"max\": 25.238116861051893,\n \"num_unique_values\": 3,\n \"samples\": [\n 25.238116861051893,\n 0.5983706960191795,\n 17.67678188444784\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.2535647399038483,\n \"min\": 5.067080224469542e-07,\n \"max\": 0.4392003554536562,\n \"num_unique_values\": 3,\n \"samples\": [\n 5.067080224469542e-07,\n 0.4392003554536562,\n 2.618028198517596e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 12.411361159991287,\n \"min\": 0.47470138151998675,\n \"max\": 24.76413991002474,\n \"num_unique_values\": 3,\n \"samples\": [\n 24.76413991002474,\n 0.47470138151998675,\n 17.051467521507384\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.28337142981219393,\n \"min\": 6.479173479592146e-07,\n \"max\": 0.4908322269559797,\n \"num_unique_values\": 3,\n \"samples\": [\n 6.479173479592146e-07,\n 0.4908322269559797,\n 3.638024934064761e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2_adjusted\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.28336580535875827,\n \"min\": 1.943752043877644e-06,\n \"max\": 0.4908322269559797,\n \"num_unique_values\": 3,\n \"samples\": [\n 1.943752043877644e-06,\n 0.4908322269559797,\n 5.457037401097142e-05\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["# Significant according to Chi2\n","results[results['p_chi2'] < 0.05]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":132},"id":"nTpOtOka3ViF","executionInfo":{"status":"ok","timestamp":1781100883685,"user_tz":-120,"elapsed":65,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}},"outputId":"2430f959-eeb9-4670-da76-613406cbf473"},"execution_count":13,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n","0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n","2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n","\n"," p_chi2 g2 p_g2 p_g2_adjusted \n","0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n","2 2.618028e-05 17.051468 3.638025e-05 0.000055 "],"text/html":["\n"," <div id=\"df-07f90926-62ad-46b3-8cc2-b45a8005b40c\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>word</th>\n"," <th>freq_recent</th>\n"," <th>freq_reference</th>\n"," <th>pct_diff</th>\n"," <th>log_ratio</th>\n"," <th>chi2</th>\n"," <th>p_chi2</th>\n"," <th>g2</th>\n"," <th>p_g2</th>\n"," <th>p_g2_adjusted</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>eend</td>\n"," <td>0.424929</td>\n"," <td>0.241379</td>\n"," <td>76.042088</td>\n"," <td>0.815920</td>\n"," <td>25.238117</td>\n"," <td>5.067080e-07</td>\n"," <td>24.764140</td>\n"," <td>6.479173e-07</td>\n"," <td>0.000002</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>fiets</td>\n"," <td>0.288952</td>\n"," <td>0.445141</td>\n"," <td>-35.087579</td>\n"," <td>-0.623434</td>\n"," <td>17.676782</td>\n"," <td>2.618028e-05</td>\n"," <td>17.051468</td>\n"," <td>3.638025e-05</td>\n"," <td>0.000055</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-07f90926-62ad-46b3-8cc2-b45a8005b40c')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-07f90926-62ad-46b3-8cc2-b45a8005b40c button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-07f90926-62ad-46b3-8cc2-b45a8005b40c');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n"," </div>\n"," </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"results[results['p_chi2'] < 0\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"word\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"fiets\",\n \"eend\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_recent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.09615049715851072,\n \"min\": 0.28895184135977336,\n \"max\": 0.42492917847025496,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.28895184135977336,\n 0.42492917847025496\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_reference\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14408131905055044,\n \"min\": 0.2413793103448276,\n \"max\": 0.445141065830721,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.445141065830721,\n 0.2413793103448276\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pct_diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 78.58054149691213,\n \"min\": -35.08757930016359,\n \"max\": 76.04208822339133,\n \"num_unique_values\": 2,\n \"samples\": [\n -35.08757930016359,\n 76.04208822339133\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"log_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0177769194811532,\n \"min\": -0.6234335352876249,\n \"max\": 0.8159203877129316,\n \"num_unique_values\": 2,\n \"samples\": [\n -0.6234335352876249,\n 0.8159203877129316\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.346671236779749,\n \"min\": 17.67678188444784,\n \"max\": 25.238116861051893,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.67678188444784,\n 25.238116861051893\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8153958246340062e-05,\n \"min\": 5.067080224469542e-07,\n \"max\": 2.618028198517596e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 2.618028198517596e-05,\n 5.067080224469542e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.453682946990868,\n \"min\": 17.051467521507384,\n \"max\": 24.76413991002474,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.051467521507384,\n 24.76413991002474\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5266574259638985e-05,\n \"min\": 6.479173479592146e-07,\n \"max\": 3.638024934064761e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 3.638024934064761e-05,\n 6.479173479592146e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2_adjusted\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.721264126387293e-05,\n \"min\": 1.943752043877644e-06,\n \"max\": 5.457037401097142e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 5.457037401097142e-05,\n 1.943752043877644e-06\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":13}]},{"cell_type":"code","source":["# Significant according to G2 (LLR)\n","results[results['p_g2_adjusted'] < 0.05]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":132},"id":"Mz4zAphE4dBY","executionInfo":{"status":"ok","timestamp":1781100928540,"user_tz":-120,"elapsed":166,"user":{"displayName":"Andreas van Cranenburgh","userId":"13143063654677287265"}},"outputId":"3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"},"execution_count":16,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" word freq_recent freq_reference pct_diff log_ratio chi2 \\\n","0 eend 0.424929 0.241379 76.042088 0.815920 25.238117 \n","2 fiets 0.288952 0.445141 -35.087579 -0.623434 17.676782 \n","\n"," p_chi2 g2 p_g2 p_g2_adjusted \n","0 5.067080e-07 24.764140 6.479173e-07 0.000002 \n","2 2.618028e-05 17.051468 3.638025e-05 0.000055 "],"text/html":["\n"," <div id=\"df-23ab50ae-434b-4e89-97cf-49a8231d6875\" class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>word</th>\n"," <th>freq_recent</th>\n"," <th>freq_reference</th>\n"," <th>pct_diff</th>\n"," <th>log_ratio</th>\n"," <th>chi2</th>\n"," <th>p_chi2</th>\n"," <th>g2</th>\n"," <th>p_g2</th>\n"," <th>p_g2_adjusted</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>eend</td>\n"," <td>0.424929</td>\n"," <td>0.241379</td>\n"," <td>76.042088</td>\n"," <td>0.815920</td>\n"," <td>25.238117</td>\n"," <td>5.067080e-07</td>\n"," <td>24.764140</td>\n"," <td>6.479173e-07</td>\n"," <td>0.000002</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>fiets</td>\n"," <td>0.288952</td>\n"," <td>0.445141</td>\n"," <td>-35.087579</td>\n"," <td>-0.623434</td>\n"," <td>17.676782</td>\n"," <td>2.618028e-05</td>\n"," <td>17.051468</td>\n"," <td>3.638025e-05</td>\n"," <td>0.000055</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-23ab50ae-434b-4e89-97cf-49a8231d6875')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," </svg>\n"," </button>\n","\n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," .colab-df-buttons div {\n"," margin-bottom: 4px;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-23ab50ae-434b-4e89-97cf-49a8231d6875 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-23ab50ae-434b-4e89-97cf-49a8231d6875');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n","\n","\n"," </div>\n"," </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","summary":"{\n \"name\": \"results[results['p_g2_adjusted'] < 0\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"word\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"fiets\",\n \"eend\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_recent\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.09615049715851072,\n \"min\": 0.28895184135977336,\n \"max\": 0.42492917847025496,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.28895184135977336,\n 0.42492917847025496\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"freq_reference\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.14408131905055044,\n \"min\": 0.2413793103448276,\n \"max\": 0.445141065830721,\n \"num_unique_values\": 2,\n \"samples\": [\n 0.445141065830721,\n 0.2413793103448276\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"pct_diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 78.58054149691213,\n \"min\": -35.08757930016359,\n \"max\": 76.04208822339133,\n \"num_unique_values\": 2,\n \"samples\": [\n -35.08757930016359,\n 76.04208822339133\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"log_ratio\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.0177769194811532,\n \"min\": -0.6234335352876249,\n \"max\": 0.8159203877129316,\n \"num_unique_values\": 2,\n \"samples\": [\n -0.6234335352876249,\n 0.8159203877129316\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.346671236779749,\n \"min\": 17.67678188444784,\n \"max\": 25.238116861051893,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.67678188444784,\n 25.238116861051893\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_chi2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8153958246340062e-05,\n \"min\": 5.067080224469542e-07,\n \"max\": 2.618028198517596e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 2.618028198517596e-05,\n 5.067080224469542e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 5.453682946990868,\n \"min\": 17.051467521507384,\n \"max\": 24.76413991002474,\n \"num_unique_values\": 2,\n \"samples\": [\n 17.051467521507384,\n 24.76413991002474\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.5266574259638985e-05,\n \"min\": 6.479173479592146e-07,\n \"max\": 3.638024934064761e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 3.638024934064761e-05,\n 6.479173479592146e-07\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"p_g2_adjusted\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.721264126387293e-05,\n \"min\": 1.943752043877644e-06,\n \"max\": 5.457037401097142e-05,\n \"num_unique_values\": 2,\n \"samples\": [\n 5.457037401097142e-05,\n 1.943752043877644e-06\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":16}]},{"cell_type":"code","source":[],"metadata":{"id":"JNCCUpdC4jK5"},"execution_count":null,"outputs":[]}]}