{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pronunciation-based location prediction confusion\n", "\n", "Setup a pandas dataframe with in each row\n", "\n", " * participant provided (actual) location,\n", " * 3 estimations made by Nanna's heuristic based in what the participant stated to be the correct pronunciation of a word\n", " * distance between the actual and heuristic predicted location\n", " \n", "Averages of the distances are exported for visualisation in QGIS." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pickle\n", "import pandas\n", "import MySQLdb\n", "import numpy\n", "import itertools\n", "import requests\n", "import json\n", "from vincenty import vincenty\n", "\n", "db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen')\n", "\n", "%matplotlib inline\n", "from matplotlib import pyplot, rcParams\n", "from jupyter_progressbar import ProgressBar\n", "\n", "# rcParams['font.family'] = 'Lucinda Console'\n", "rcParams['font.size'] = '24'\n", "rcParams['figure.figsize'] = (20, 10)\n", "rcParams['figure.dpi'] = 100" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def simplify_area_name(x):\n", " return ' '.join(\n", " x.split('/') # split Dutch and Frysian name\n", " [0] # extract Dutch name\n", " .strip()\n", " .split(' ') # Split area name from province, mostly 'Fr'\n", " [:-1] # remove province\n", " ).strip().lower() # rejoin spaces in area name" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "metadata = pandas.read_sql('''SELECT answer.* FROM core_surveyresultquestionanswer as answer''', db)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "grouped = metadata.groupby(['survey_result_id', 'question_id']).agg({\n", " 'question_text': 'first',\n", " 'answer_text': lambda x: x if len(x) == 1 else ', '.join(x)\n", "})\n", "grouped.reset_index(inplace=True)\n", "\n", "grouped = grouped.pivot(index='survey_result_id', columns='question_text', values='answer_text')\n", "\n", "grouped = grouped.rename({\n", " 'Do you go to school?': 'school',\n", " 'Do you go to university?': 'university',\n", " 'What is your age bracket?': 'age_bracket',\n", " 'What is your age?': 'age',\n", " 'What is your gender?': 'gender',\n", " 'Which language are you the most proficient in?': 'language',\n", " 'Which languages do you actively use in your life?': 'active-languages'\n", "}, axis='columns')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": false }, "outputs": [], "source": [ "predictions = pandas.read_sql('''\n", "SELECT \n", " sr.id as id,\n", " sr.area_name as actual_area,\n", " area1_name as area_prediction_1,\n", " area2_name as area_prediction_2,\n", " area3_name as area_prediction_3\n", "FROM core_surveyresult as sr\n", "INNER JOIN core_predictionquizresult as pq\n", " ON sr.id = pq.survey_result_id\n", "''', db)\n", "\n", "predicted_areas = set(map(simplify_area_name,\n", " set(predictions['area_prediction_1']) |\n", " set(predictions['area_prediction_2']) |\n", " set(predictions['area_prediction_3'])\n", "))\n", "actual_areas = set(map(str.lower, predictions['actual_area']))\n", "\n", "areas = list(predicted_areas | actual_areas)\n", "location_to_number = {l: i for i, l in enumerate(areas)}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [], "source": [ "simplified_predictions = 
pandas.DataFrame({\n", " 'id': list(predictions['id']),\n", " 'actual': list(map(str.lower, predictions['actual_area'])),\n", " 'prediction_1': list(map(simplify_area_name, predictions['area_prediction_1'])),\n", " 'prediction_2': list(map(simplify_area_name, predictions['area_prediction_2'])),\n", " 'prediction_3': list(map(simplify_area_name, predictions['area_prediction_3'])),\n", "})\n", "# simplified_predictions.set_index('id')\n", "simplified_predictions.to_excel('actual-predictions.xls')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "locations = {location for c in simplified_predictions.columns for location in simplified_predictions[c] if c != 'id'}" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [], "source": [ "names = pandas.read_csv('plaatsen_nl.csv')\n", "\n", "nonominatim = {\n", " name: [row['st_y'], row['x']]\n", " for _, row in names.iterrows()\n", " for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']\n", "# for _ in [ print(row[column]) ]\n", " if type(row[column]) == str\n", " for name in [row[column], row[column].lower().replace('-', ' ')]\n", "}" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "21c016f24e23473e807ed3e9c2d942c6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='0s passed', placeholder='0…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nominatim = {\n", " l: json.loads(\n", " requests.get(\n", " 'https://nominatim.openstreetmap.org/search.php?q=Netherlands%20'\n", " '{}&polygon_geojson=1&viewbox=&format=json'.format(l)\n", " ).text\n", " )\n", " for l in ProgressBar(locations)\n", " if l not in nonominatim\n", "}" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "latlons = {\n", " l: (float(v[0]['lat']), float(v[0]['lon']))\n", " for l, v in nominatim.items()\n", " if len(v) > 0\n", "}\n", "latlons.update(nonominatim)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "for c in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:\n", " simplified_predictions['{}_latlon'.format(c)] = [\n", " latlons.get(l, numpy.nan)\n", " for l in simplified_predictions['{}'.format(c)]\n", " ]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "for c in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:\n", " simplified_predictions['{}_distance'.format(c)] = [\n", " vincenty(x, y) if x == x and y == y else numpy.nan\n", " for x, y in zip(simplified_predictions['actual_latlon'], simplified_predictions[c])\n", " ]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "simplified_predictions = simplified_predictions[[\n", " 'id', 'actual', 'actual_latlon', 'prediction_3_latlon_distance',\n", " 'prediction_1_latlon_distance', 'prediction_2_latlon_distance'\n", "]]\n", "\n", "simplified_predictions = simplified_predictions.rename({\n", " 'prediction_3_latlon_distance': 'distance3',\n", " 'prediction_1_latlon_distance': 'distance1',\n", " 'prediction_2_latlon_distance': 'distance2'\n", "}, axis='columns')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "simplified_predictions = simplified_predictions.join(grouped, 
on='id')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "simplified_predictions['latitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[0] if x == x else None)\n", "simplified_predictions['longitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[1] if x == x else None)\n", "simplified_predictions = simplified_predictions.drop('actual_latlon', axis='columns')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "simplified_predictions['age_groups'] = [\n", " {'0-10': '0-20', '11-20': '0-20',\n", " '21-30': '21-50', '31-40': '21-50', '41-50': '21-50',\n", " '51-60': '51-100', '61-70': '51-100', '71-80': '51-100', '81-90': '51-100', '91-100': '51-100'}.get(b, None)\n", " for b in simplified_predictions['age_bracket']\n", "]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({\n", "# 'distance1': ['mean', 'min', 'max', 'count', 'size'],\n", "# 'latitude': 'first',\n", "# 'longitude': 'first'\n", "# })\n", "# age_groups.index.get_level_values('age_groups')" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({\n", "# 'distance1': ['min', 'mean', 'max', 'count', 'size'],\n", "# 'latitude': 'first',\n", "# 'longitude': 'first'\n", "# })\n", "# gender_groups" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "summary = simplified_predictions[['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']]" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "summary.to_csv('points.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "geojson = {\n", " \"type\": \"FeatureCollection\",\n", " \"features\": [\n", " {\n", " \"type\": \"Feature\",\n", " \"properties\": {\n", " \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n", " \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n", " \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n", " \"actual\": actual\n", " },\n", " \"geometry\": {\n", " \"type\": \"Point\",\n", " \"coordinates\": list( actual_lat_lon )[::-1]\n", " }\n", " }\n", " for actual, row in simplified_predictions.groupby('actual')\n", " if actual != ''\n", "# for _ in [ print(row['actual_latlon']), print() ]\n", " for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n", " if actual_lat_lon == actual_lat_lon\n", " ]\n", "}" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "geojson = {\n", " \"type\": \"FeatureCollection\",\n", " \"features\": [\n", " {\n", " \"type\": \"Feature\",\n", " \"properties\": {\n", " \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n", " \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n", " \"distance 3\": row['prediction_3_latlon_distance'].mean() if 
row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n", " \"actual\": actual\n", " },\n", " \"geometry\": {\n", " \"type\": \"Point\",\n", " \"coordinates\": list( actual_lat_lon )[::-1]\n", " }\n", " }\n", " for actual, row in simplified_predictions.groupby('actual')\n", " if actual != ''\n", "# for _ in [ print(row['actual_latlon']), print() ]\n", " for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n", " if actual_lat_lon == actual_lat_lon\n", " ]\n", "}" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }
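, { "cell_type": "markdown", "metadata": {}, "source": [ "The `geojson` feature collection above is only built in memory. Below is a minimal sketch of an export step; the file name `points.geojson` is an assumption (it is not part of the original pipeline) and simply puts the layer next to `points.csv` for loading in QGIS." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical export step (sketch): write the feature collection to disk so QGIS can\n", "# load it as a vector layer. The file name 'points.geojson' is an assumption.\n", "with open('points.geojson', 'w') as f:\n", " json.dump(geojson, f)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }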