stimmenfryslan/notebooks/Pronunciations Table per Wijk.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Geographical pronunciation statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas\n",
    "import MySQLdb\n",
    "import numpy\n",
    "import json\n",
    "\n",
    "import getpass\n",
    "import os\n",
    "\n",
    "# Credentials are taken from the environment (or asked for interactively) so that no password\n",
    "# is stored in the notebook. The password that used to be hard-coded here must be considered\n",
    "# leaked and should be rotated.\n",
    "db = MySQLdb.connect(\n",
    "    user=os.environ.get('STIMMEN_DB_USER', 'root'),\n",
    "    passwd=os.environ.get('STIMMEN_DB_PASSWORD') or getpass.getpass('MySQL password: '),\n",
    "    db='stimmen',\n",
    "    charset='utf8',\n",
    ")\n",
    "\n",
    "%matplotlib notebook\n",
    "from matplotlib import pyplot\n",
    "import folium\n",
    "from IPython.display import display\n",
    "from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
    "from jsbutton import JsButton\n",
    "from shapely.geometry import LineString, MultiLineString\n",
    "from jupyter_progressbar import ProgressBar\n",
    "from collections import defaultdict, Counter\n",
    "from ipy_table import make_table\n",
    "from html import escape\n",
    "\n",
    "import numpy as np\n",
    "from random import shuffle\n",
    "import pickle\n",
    "from jupyter_progressbar import ProgressBar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): pickle.load can execute arbitrary code; only load this file when it comes\n",
    "# from a trusted source.\n",
    "with open('friesland_wijken_land_only.p3', 'rb') as f:\n",
    "    wijken, wijk_shapes = pickle.load(f)\n",
    "\n",
    "# Mark every entry of the collection as a GeoJSON 'Feature' before exporting it.\n",
    "for feature in wijken['features']:\n",
    "    feature['type'] = 'Feature'\n",
    "\n",
    "# Write the collection as JSON; the next cell converts this file to KML.\n",
    "with open('friesland_wijken_geojson.json', 'w') as f:\n",
    "    json.dump(wijken, f, indent=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from osgeo import gdal, ogr\n",
    "\n",
    "# Convert the GeoJSON export to KML.\n",
    "srcDS = gdal.OpenEx('friesland_wijken_geojson.json')\n",
    "if srcDS is None:\n",
    "    # Without gdal.UseExceptions() a failed open returns None instead of raising.\n",
    "    raise IOError('GDAL could not open friesland_wijken_geojson.json')\n",
    "ds = gdal.VectorTranslate('friesland_wijken_geojson.kml', srcDS, format='kml')\n",
    "\n",
    "# GDAL flushes and closes a dataset only when its last Python reference is dropped. The\n",
    "# notebook kernel keeps globals alive, so both datasets are released explicitly to make sure\n",
    "# the KML file is completely written.\n",
    "ds = None\n",
    "srcDS = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'k4luâ7mWBAgDSKhCVaysNdr TjeoE85JzëGúcM.,IRtp2-bLû69Un0wZF3Hv1iOfô'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect every distinct character that occurs in the wijk names and show them as one string\n",
    "# (a set is unordered, so the order of the characters in the result is arbitrary).\n",
    "wijk_name_characters = set()\n",
    "for wijk in wijken['features']:\n",
    "    wijk_name_characters.update(wijk['properties']['gemeente_en_wijk_naam'])\n",
    "''.join(wijk_name_characters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the wijk feature collection and the list of wijk shapes\n",
    "# (NOTE(review): the two are assumed to be in the same order - confirm).\n",
    "with open('friesland_wijken_land_only.p3', 'rb') as pickle_file:\n",
    "    wijken, wijk_shapes = pickle.load(pickle_file)\n",
    "\n",
    "# Name of every wijk, in the order of the features in the collection.\n",
    "wijk_names = []\n",
    "for feature in wijken['features']:\n",
    "    wijk_names.append(feature['properties']['gemeente_en_wijk_naam'])\n",
    "\n",
    "def get_wijk(point):\n",
    "    \"\"\"Return the index in `wijk_shapes` of the first shape containing `point`, or -1 if none does.\"\"\"\n",
    "    containing = (\n",
    "        index\n",
    "        for index, wijk_shape in enumerate(wijk_shapes)\n",
    "        if wijk_shape.contains(point)\n",
    "    )\n",
    "    return next(containing, -1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def listify(rd_multipolygon):\n",
    "    if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
    "        return list(rd_multipolygon)\n",
    "    return [\n",
    "        listify(element)\n",
    "        for element in rd_multipolygon\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Answers to how participants state a word should be pronounced.\n",
    "\n",
    "# Join survey results -> prediction quiz results -> the answer given to each quiz question.\n",
    "answers_query = '''\n",
    "SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
    "FROM       core_surveyresult as survey\n",
    "INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
    "INNER JOIN core_predictionquizresultquestionanswer as answer\n",
    "    ON result.id = answer.prediction_quiz_id\n",
    "'''\n",
    "answers = pandas.read_sql(answers_query, db)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Questions whose answers all carry the same coordinates (standard deviation 0 for both\n",
    "# latitude and longitude) are dropped - presumably they hold no usable location information.\n",
    "# Only the two coordinate columns are aggregated: the text columns have no standard deviation\n",
    "# and make recent pandas versions raise a TypeError.\n",
    "coordinate_std = answers.groupby('question_text')[['user_lat', 'user_lng']].std()\n",
    "zero_latlng_questions = set(\n",
    "    coordinate_std.index[(coordinate_std['user_lat'] == 0) & (coordinate_std['user_lng'] == 0)]\n",
    ")\n",
    "\n",
    "# .copy() makes answers_filtered an independent frame, so adding columns to it later does not\n",
    "# raise SettingWithCopyWarning.\n",
    "answers_filtered = answers[~answers['question_text'].isin(zero_latlng_questions)].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reverse(rd_multipolygon):\n",
    "    if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
    "        return rd_multipolygon[::-1]\n",
    "    return [\n",
    "        reverse(element)\n",
    "        for element in rd_multipolygon\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  # Remove the CWD from sys.path while we load stuff.\n"
     ]
    }
   ],
   "source": [
    "# Takes approximately 2 minutes\n",
    "# Every distinct coordinate pair is resolved only once, however many answers share it.\n",
    "points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
    "wijk_map = {(lng, lat): get_wijk(Point(lng, lat)) for lng, lat in points}\n",
    "\n",
    "# assign() builds a new frame instead of writing a column into what pandas considers a slice\n",
    "# of `answers`, which avoids the SettingWithCopyWarning.\n",
    "answers_filtered = answers_filtered.assign(wijk=[\n",
    "    wijk_map[(lng, lat)]\n",
    "    for lng, lat in zip(answers_filtered['user_lng'], answers_filtered['user_lat'])\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n",
      "/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"\n",
      "/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "# assign() returns a new frame, which avoids the SettingWithCopyWarning raised when columns\n",
    "# are set one by one on a filtered slice of `answers`.\n",
    "answers_filtered = answers_filtered.assign(\n",
    "    # Question text without double quotes and asterisks.\n",
    "    question_text_url=answers_filtered['question_text'].map(\n",
    "        lambda x: x.replace('\"', '').replace('*', '')),\n",
    "    # get_wijk() marks points outside every wijk with -1; indexing wijk_names with -1 would\n",
    "    # silently give those rows the name of the last wijk, so they get no name instead.\n",
    "    wijk_name=answers_filtered['wijk'].map(\n",
    "        lambda x: wijk_names[x] if x >= 0 else None),\n",
    "    # The part of the answer text between the first '(' and the first ')'.\n",
    "    answer_text_url=answers_filtered['answer_text'].map(\n",
    "        lambda x: x[x.find('('):x.find(')')][1:]),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per wijk, indexed by wijk name, holding the coordinates of the shape's centroid.\n",
    "centroid_rows = []\n",
    "for name, wijk_shape in zip(wijk_names, wijk_shapes):\n",
    "    centroid_x, centroid_y = wijk_shape.centroid.xy\n",
    "    centroid_rows.append({'#name': name, 'longitude': centroid_x[0], 'latitude': centroid_y[0]})\n",
    "\n",
    "wijken = pandas.DataFrame(centroid_rows)\n",
    "wijken = wijken.set_index('#name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def merge_dicts(*args):\n",
    "    for arg in args[1:]:\n",
    "        args[0].update(arg)\n",
    "    return args[0]\n",
    "\n",
    "\n",
    "def pronunciation_counts(answer_texts):\n",
    "    \"\"\"Return one {'pronunciation', 'count'} dict per distinct answer, sorted by pronunciation.\"\"\"\n",
    "    # Counter tallies all answers in one pass instead of calling list.count() once per\n",
    "    # distinct answer.\n",
    "    tally = Counter(answer_texts)\n",
    "    return [\n",
    "        {'pronunciation': pronunciation, 'count': tally[pronunciation]}\n",
    "        for pronunciation in sorted(tally)\n",
    "    ]\n",
    "\n",
    "\n",
    "def answered(cell):\n",
    "    \"\"\"Return the pronunciation list stored in a cell, or [] when the cell holds no list.\"\"\"\n",
    "    # A question that was never answered in a wijk ends up as NaN in the frame, which cannot\n",
    "    # be iterated.\n",
    "    return cell if isinstance(cell, list) else []\n",
    "\n",
    "\n",
    "# One row per wijk and one column per question; every cell lists the pronunciations given in\n",
    "# that wijk with how often each was given. Answers outside every wijk (index -1) are skipped.\n",
    "# The groups are iterated directly, so no aggregation has to return a list.\n",
    "pronunciations = pandas.DataFrame([\n",
    "    merge_dicts(\n",
    "        {\n",
    "            question: pronunciation_counts(answer_texts)\n",
    "            for question, answer_texts in rows.groupby('question_text_url')['answer_text_url']\n",
    "        },\n",
    "        {'wijk': wijk_names[wijk]}\n",
    "    )\n",
    "    for wijk, rows in answers_filtered.groupby('wijk')\n",
    "    if wijk >= 0\n",
    "]).set_index('wijk')\n",
    "\n",
    "columns = list(pronunciations.columns)\n",
    "\n",
    "\n",
    "def percentage_record(row):\n",
    "    \"\"\"Map 'question: pronunciation' to the percentage of that question's answers in one wijk.\"\"\"\n",
    "    record = {}\n",
    "    for column in columns:\n",
    "        entries = answered(row[column])\n",
    "        total = sum(entry['count'] for entry in entries)\n",
    "        for entry in entries:\n",
    "            record[column + \": \" + entry['pronunciation']] = 100 * entry['count'] / total\n",
    "    return record\n",
    "\n",
    "\n",
    "# Percentages per wijk; a pronunciation that does not occur in a wijk counts as 0 percent.\n",
    "counts = pandas.DataFrame([\n",
    "    merge_dicts(percentage_record(row), {'': wijk})\n",
    "    for wijk, row in pronunciations.iterrows()\n",
    "]).set_index('').fillna(0)\n",
    "\n",
    "# Readable table: per wijk and question, all pronunciations joined with ' / '.\n",
    "pronunciations = pandas.DataFrame([\n",
    "    merge_dicts({\n",
    "        column: ' / '.join(str(entry['pronunciation']) for entry in answered(row[column]))\n",
    "        for column in columns\n",
    "    }, {'': wijk})\n",
    "    for wijk, row in pronunciations.iterrows()\n",
    "]).set_index('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<function shapely.geometry.geo.shape(context)>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Leftover inspection: displays what the name `shape` is bound to (the saved output shows\n",
    "# shapely's `shape` function). NOTE(review): presumably a check that the `shape` loop variables\n",
    "# used in earlier cells did not shadow the import - consider removing this cell.\n",
    "shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the three result tables as tab-separated files.\n",
    "tsv_exports = [\n",
    "    (pronunciations, 'pronunciations_by_wijk.tsv', {}),\n",
    "    (counts, 'pronunciation_percentages_by_wijk.tsv', {}),\n",
    "    (wijken, 'wijk_centroid.tsv', {'columns': ['longitude', 'latitude']}),\n",
    "]\n",
    "for frame, filename, extra_options in tsv_exports:\n",
    "    frame.to_csv(filename, sep='\\t', **extra_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the exported tables back as lists of lines. The files contain non-ASCII wijk names,\n",
    "# so the encoding is given explicitly (to_csv writes UTF-8 by default) instead of relying on\n",
    "# the platform default.\n",
    "with open('pronunciations_by_wijk.tsv', encoding='utf-8') as f:\n",
    "    p = list(f)\n",
    "\n",
    "# The percentages are exported as 'pronunciation_percentages_by_wijk.tsv'; no cell writes a\n",
    "# 'pronunciation_count_by_wijk.tsv' file.\n",
    "with open('pronunciation_percentages_by_wijk.tsv', encoding='utf-8') as f:\n",
    "    c = list(f)\n",
    "\n",
    "with open('wijk_centroid.tsv', encoding='utf-8') as f:\n",
    "    w = list(f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}