stimmenfryslan/notebooks/Gabmap Pronunciation Tables, Simple Example.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Geographical pronunciation tables, simple example\n",
    "\n",
    "Simple example to create gabmap files for two words with few pronunciations an two regions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "import pandas\n",
    "import MySQLdb\n",
    "import json\n",
    "import copy\n",
    "\n",
    "db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
    "\n",
    "from shapely.geometry import shape, Point\n",
    "\n",
    "from gabmap import create_gabmap_dataframes\n",
    "\n",
    "from stimmen.geojson import merge_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../data/Friesland_wijken.geojson') as f:\n",
    "    regions = json.load(f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and simplify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Answers to how participants state a word should be pronounced\n",
    "\n",
    "answers = pandas.read_sql('''\n",
    "SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
    "FROM       core_surveyresult as survey\n",
    "INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
    "INNER JOIN core_predictionquizresultquestionanswer as answer\n",
    "    ON result.id = answer.prediction_quiz_id\n",
    "''', db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "regions_simple = merge_features(copy.deepcopy(regions),\n",
    "    condition=lambda feature: feature['properties']['GM_NAAM'] == 'Heerenveen',\n",
    ")\n",
    "\n",
    "regions_simple = merge_features(\n",
    "    regions_simple,\n",
    "    condition=lambda feature: feature['properties']['GM_NAAM'] == 'Leeuwarden',\n",
    ")\n",
    "regions_simple['features'] = regions_simple['features'][-2:]\n",
    "\n",
    "regions_simple['features'][0]['properties']['name'] = 'Heerenveen'\n",
    "regions_simple['features'][1]['properties']['name'] = 'Leeuwarden'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "answers_simple = answers[\n",
    "    (answers['question_text'] == '\"blad\" (aan een boom)') |\n",
    "    (answers['question_text'] == '\"vis\"')\n",
    "].copy()\n",
    "\n",
    "answers_simple['question_text'] = answers_simple['question_text'].map(\n",
    "    lambda x: x.replace('\"', '').replace('*', ''))\n",
    "\n",
    "answers_simple['answer_text'] = answers_simple['answer_text'].map(\n",
    "    lambda x: x[x.find('('):x.find(')')][1:])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Two words, boom and vis, with each 4 and 2 pronunciations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>answer_text</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>question_text</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>blad (aan een boom)</th>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vis</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     answer_text\n",
       "question_text                   \n",
       "blad (aan een boom)            4\n",
       "vis                            2"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "answers_simple.groupby('question_text').agg({'answer_text': lambda x: len(set(x))})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "centroids_example, pronunciations_example, counts_example = create_gabmap_dataframes(\n",
    "    regions_simple, answers_simple,\n",
    "    latitude_column='user_lat', longitude_column='user_lng',\n",
    "    word_column='question_text', pronunciation_column='answer_text',\n",
    "    region_name_property='name'\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Resulting tables\n",
    "\n",
    "Stored as tab separated files for gabmap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>latitude</th>\n",
       "      <th>longitude</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>#name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Heerenveen</th>\n",
       "      <td>52.996076</td>\n",
       "      <td>5.977925</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Leeuwarden</th>\n",
       "      <td>53.169940</td>\n",
       "      <td>5.797613</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             latitude  longitude\n",
       "#name                           \n",
       "Heerenveen  52.996076   5.977925\n",
       "Leeuwarden  53.169940   5.797613"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "centroids_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>blad (aan een boom)</th>\n",
       "      <th>vis</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Heerenveen</th>\n",
       "      <td>blet / blɑt / blɔd / blɛ:t</td>\n",
       "      <td>fisk / fɪs</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Leeuwarden</th>\n",
       "      <td>blet / blɑt / blɔd / blɛ:t</td>\n",
       "      <td>fisk / fɪs</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   blad (aan een boom)         vis\n",
       "                                                  \n",
       "Heerenveen  blet / blɑt / blɔd / blɛ:t  fisk / fɪs\n",
       "Leeuwarden  blet / blɑt / blɔd / blɛ:t  fisk / fɪs"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pronunciations_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>blad (aan een boom): blet</th>\n",
       "      <th>blad (aan een boom): blɑt</th>\n",
       "      <th>blad (aan een boom): blɔd</th>\n",
       "      <th>blad (aan een boom): blɛ:t</th>\n",
       "      <th>vis: fisk</th>\n",
       "      <th>vis: fɪs</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Heerenveen</th>\n",
       "      <td>31.654676</td>\n",
       "      <td>2.158273</td>\n",
       "      <td>2.158273</td>\n",
       "      <td>64.028777</td>\n",
       "      <td>52.517986</td>\n",
       "      <td>47.482014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Leeuwarden</th>\n",
       "      <td>7.865169</td>\n",
       "      <td>7.022472</td>\n",
       "      <td>8.707865</td>\n",
       "      <td>76.404494</td>\n",
       "      <td>75.000000</td>\n",
       "      <td>25.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            blad (aan een boom): blet  blad (aan een boom): blɑt  \\\n",
       "                                                                   \n",
       "Heerenveen                  31.654676                   2.158273   \n",
       "Leeuwarden                   7.865169                   7.022472   \n",
       "\n",
       "            blad (aan een boom): blɔd  blad (aan een boom): blɛ:t  vis: fisk  \\\n",
       "                                                                               \n",
       "Heerenveen                   2.158273                   64.028777  52.517986   \n",
       "Leeuwarden                   8.707865                   76.404494  75.000000   \n",
       "\n",
       "             vis: fɪs  \n",
       "                       \n",
       "Heerenveen  47.482014  \n",
       "Leeuwarden  25.000000  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "counts_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "pronunciations_example.to_csv('../data/Pronunciations_example.gabmap.tsv', sep='\\t')\n",
    "counts_example.to_csv('../data/Pronunciation_percentages_example.gabmap.tsv', sep='\\t')\n",
    "centroids_example.to_csv('../data/Centroids_example.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])\n",
    "with open('../data/Gabmap_example.geojson', 'w') as f:\n",
    "    json.dump(regions_simple, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}