nlnieuws/python/word_freq_comparison.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "executionInfo": {
     "elapsed": 1341,
     "status": "ok",
     "timestamp": 1781100698726,
     "user": {
      "displayName": "Andreas van Cranenburgh",
      "userId": "13143063654677287265"
     },
     "user_tz": -120
    },
    "id": "YMifluhW2rZp"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy.stats import chi2_contingency\n",
    "from statsmodels.stats.multitest import multipletests\n",
    "import pandas as pd\n",
    "\n",
    "def corpus_stats(word, counts_recent, counts_reference, total_recent, total_reference):\n",
    "    \"\"\"\n",
    "    word             : the word being tested\n",
    "    counts_recent    : raw count in week 5\n",
    "    counts_reference : raw count in weeks 1-4\n",
    "    total_recent     : total tokens in week 5\n",
    "    total_reference  : total tokens in weeks 1-4\n",
    "    \"\"\"\n",
    "    a = counts_recent      # word in recent\n",
    "    b = counts_reference   # word in reference\n",
    "    c = total_recent - a   # non-word in recent\n",
    "    d = total_reference - b  # non-word in reference\n",
    "\n",
    "    contingency = np.array([[a, b],\n",
    "                             [c, d]])\n",
    "\n",
    "    # --- Chi-Squared ---\n",
    "    chi2_stat, p_chi2, _, _ = chi2_contingency(contingency, correction=False)\n",
    "\n",
    "    # --- Log-Likelihood (G²) ---\n",
    "    # G² = 2 * sum(observed * log(observed / expected))\n",
    "    # scipy's chi2_contingency with lambda_=\"log-likelihood\" computes this\n",
    "    g2_stat, p_g2, _, _ = chi2_contingency(contingency, lambda_=\"log-likelihood\")\n",
    "\n",
    "    # --- Effect sizes ---\n",
    "    freq_recent    = a / total_recent\n",
    "    freq_reference = b / total_reference\n",
    "\n",
    "    pct_diff = (freq_recent - freq_reference) / freq_reference * 100\n",
    "\n",
    "    # Avoid log(0) with a small epsilon\n",
    "    eps = 1e-9\n",
    "    log_ratio = np.log2((freq_recent + eps) / (freq_reference + eps))\n",
    "\n",
    "    return {\n",
    "        \"word\":           word,\n",
    "        \"freq_recent\":    freq_recent,\n",
    "        \"freq_reference\": freq_reference,\n",
    "        \"pct_diff\":       pct_diff,\n",
    "        \"log_ratio\":      log_ratio,\n",
    "        \"chi2\":           chi2_stat,\n",
    "        \"p_chi2\":         p_chi2,\n",
    "        \"g2\":             g2_stat,\n",
    "        \"p_g2\":           p_g2,\n",
    "    }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "executionInfo": {
     "elapsed": 38,
     "status": "ok",
     "timestamp": 1781100880331,
     "user": {
      "displayName": "Andreas van Cranenburgh",
      "userId": "13143063654677287265"
     },
     "user_tz": -120
    },
    "id": "mHH718-222BM"
   },
   "outputs": [],
   "source": [
    "# Example data\n",
    "counts_recent = {'eend': 150, 'tafel': 101, 'fiets': 102}\n",
    "counts_reference = {'eend': 77, 'tafel': 100, 'fiets': 142}\n",
    "total_recent = sum(counts_recent.values())\n",
    "total_reference = sum(counts_reference.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "executionInfo": {
     "elapsed": 7,
     "status": "ok",
     "timestamp": 1781100881153,
     "user": {
      "displayName": "Andreas van Cranenburgh",
      "userId": "13143063654677287265"
     },
     "user_tz": -120
    },
    "id": "urBml1212wxb"
   },
   "outputs": [],
   "source": [
    "# Test every word that occurs in the recent corpus, then adjust the G²\n",
    "# p-values for multiple testing (Benjamini-Hochberg false discovery rate).\n",
    "\n",
    "results = []\n",
    "for word, n_recent in counts_recent.items():\n",
    "    # Words missing from the reference corpus count as zero occurrences\n",
    "    n_reference = counts_reference.get(word, 0)\n",
    "    results.append(\n",
    "        corpus_stats(word, n_recent, n_reference, total_recent, total_reference))\n",
    "\n",
    "# FDR correction across all words; element 1 of the returned tuple holds\n",
    "# the adjusted p-values\n",
    "p_values = [row[\"p_g2\"] for row in results]\n",
    "p_adjusted = multipletests(p_values, method=\"fdr_bh\")[1]\n",
    "\n",
    "for row, p_adj in zip(results, p_adjusted):\n",
    "    row[\"p_g2_adjusted\"] = p_adj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 163
    },
    "executionInfo": {
     "elapsed": 12,
     "status": "ok",
     "timestamp": 1781100882491,
     "user": {
      "displayName": "Andreas van Cranenburgh",
      "userId": "13143063654677287265"
     },
     "user_tz": -120
    },
    "id": "-y3MUOgI3PFn",
    "outputId": "f9a90951-c8f2-45a7-8d8d-fb0e279dd5b3"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>freq_recent</th>\n",
       "      <th>freq_reference</th>\n",
       "      <th>pct_diff</th>\n",
       "      <th>log_ratio</th>\n",
       "      <th>chi2</th>\n",
       "      <th>p_chi2</th>\n",
       "      <th>g2</th>\n",
       "      <th>p_g2</th>\n",
       "      <th>p_g2_adjusted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>eend</td>\n",
       "      <td>0.424929</td>\n",
       "      <td>0.241379</td>\n",
       "      <td>76.042088</td>\n",
       "      <td>0.815920</td>\n",
       "      <td>25.238117</td>\n",
       "      <td>5.067080e-07</td>\n",
       "      <td>24.764140</td>\n",
       "      <td>6.479173e-07</td>\n",
       "      <td>0.000002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tafel</td>\n",
       "      <td>0.286119</td>\n",
       "      <td>0.313480</td>\n",
       "      <td>-8.728045</td>\n",
       "      <td>-0.131756</td>\n",
       "      <td>0.598371</td>\n",
       "      <td>4.392004e-01</td>\n",
       "      <td>0.474701</td>\n",
       "      <td>4.908322e-01</td>\n",
       "      <td>0.490832</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fiets</td>\n",
       "      <td>0.288952</td>\n",
       "      <td>0.445141</td>\n",
       "      <td>-35.087579</td>\n",
       "      <td>-0.623434</td>\n",
       "      <td>17.676782</td>\n",
       "      <td>2.618028e-05</td>\n",
       "      <td>17.051468</td>\n",
       "      <td>3.638025e-05</td>\n",
       "      <td>0.000055</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
       "1  tafel     0.286119        0.313480  -8.728045  -0.131756   0.598371   \n",
       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
       "\n",
       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
       "1  4.392004e-01   0.474701  4.908322e-01       0.490832  \n",
       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Gather the per-word result dicts into a single table and display it\n",
    "results = pd.DataFrame(data=results)\n",
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 132
    },
    "executionInfo": {
     "elapsed": 65,
     "status": "ok",
     "timestamp": 1781100883685,
     "user": {
      "displayName": "Andreas van Cranenburgh",
      "userId": "13143063654677287265"
     },
     "user_tz": -120
    },
    "id": "nTpOtOka3ViF",
    "outputId": "2430f959-eeb9-4670-da76-613406cbf473"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>freq_recent</th>\n",
       "      <th>freq_reference</th>\n",
       "      <th>pct_diff</th>\n",
       "      <th>log_ratio</th>\n",
       "      <th>chi2</th>\n",
       "      <th>p_chi2</th>\n",
       "      <th>g2</th>\n",
       "      <th>p_g2</th>\n",
       "      <th>p_g2_adjusted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>eend</td>\n",
       "      <td>0.424929</td>\n",
       "      <td>0.241379</td>\n",
       "      <td>76.042088</td>\n",
       "      <td>0.815920</td>\n",
       "      <td>25.238117</td>\n",
       "      <td>5.067080e-07</td>\n",
       "      <td>24.764140</td>\n",
       "      <td>6.479173e-07</td>\n",
       "      <td>0.000002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fiets</td>\n",
       "      <td>0.288952</td>\n",
       "      <td>0.445141</td>\n",
       "      <td>-35.087579</td>\n",
       "      <td>-0.623434</td>\n",
       "      <td>17.676782</td>\n",
       "      <td>2.618028e-05</td>\n",
       "      <td>17.051468</td>\n",
       "      <td>3.638025e-05</td>\n",
       "      <td>0.000055</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
       "\n",
       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows whose chi-squared p-value (not FDR-adjusted) is below 0.05\n",
    "chi2_significant = results['p_chi2'] < 0.05\n",
    "results.loc[chi2_significant]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 132
    },
    "executionInfo": {
     "elapsed": 166,
     "status": "ok",
     "timestamp": 1781100928540,
     "user": {
      "displayName": "Andreas van Cranenburgh",
      "userId": "13143063654677287265"
     },
     "user_tz": -120
    },
    "id": "Mz4zAphE4dBY",
    "outputId": "3b42fdd2-f451-47b6-8989-ebf4dafcbf1a"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>freq_recent</th>\n",
       "      <th>freq_reference</th>\n",
       "      <th>pct_diff</th>\n",
       "      <th>log_ratio</th>\n",
       "      <th>chi2</th>\n",
       "      <th>p_chi2</th>\n",
       "      <th>g2</th>\n",
       "      <th>p_g2</th>\n",
       "      <th>p_g2_adjusted</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>eend</td>\n",
       "      <td>0.424929</td>\n",
       "      <td>0.241379</td>\n",
       "      <td>76.042088</td>\n",
       "      <td>0.815920</td>\n",
       "      <td>25.238117</td>\n",
       "      <td>5.067080e-07</td>\n",
       "      <td>24.764140</td>\n",
       "      <td>6.479173e-07</td>\n",
       "      <td>0.000002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>fiets</td>\n",
       "      <td>0.288952</td>\n",
       "      <td>0.445141</td>\n",
       "      <td>-35.087579</td>\n",
       "      <td>-0.623434</td>\n",
       "      <td>17.676782</td>\n",
       "      <td>2.618028e-05</td>\n",
       "      <td>17.051468</td>\n",
       "      <td>3.638025e-05</td>\n",
       "      <td>0.000055</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    word  freq_recent  freq_reference   pct_diff  log_ratio       chi2  \\\n",
       "0   eend     0.424929        0.241379  76.042088   0.815920  25.238117   \n",
       "2  fiets     0.288952        0.445141 -35.087579  -0.623434  17.676782   \n",
       "\n",
       "         p_chi2         g2          p_g2  p_g2_adjusted  \n",
       "0  5.067080e-07  24.764140  6.479173e-07       0.000002  \n",
       "2  2.618028e-05  17.051468  3.638025e-05       0.000055  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows whose FDR-adjusted G² (log-likelihood ratio) p-value is below 0.05\n",
    "g2_significant = results['p_g2_adjusted'].lt(0.05)\n",
    "results.loc[g2_significant]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "JNCCUpdC4jK5"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyOWNAG6IZoh+ik4rqgeMAZj",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}