434 lines
13 KiB
Plaintext
434 lines
13 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Pronunciation-based location prediction confusion\n",
|
||
|
"\n",
|
||
|
"Set up a pandas DataFrame in which each row contains:\n",
|
||
|
"\n",
|
||
|
" * participant provided (actual) location,\n",
|
||
|
" * 3 estimates made by Nanna's heuristic, based on what the participant stated to be the correct pronunciation of a word\n",
|
||
|
" * distance between the actual and heuristic predicted location\n",
|
||
|
" \n",
|
||
|
"Averages of the distances are exported for visualisation in QGIS."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pickle\n",
|
||
|
"import pandas\n",
|
||
|
"import MySQLdb\n",
|
||
|
"import numpy\n",
|
||
|
"import itertools\n",
|
||
|
"import requests\n",
|
||
|
"import json\n",
|
||
|
"from vincenty import vincenty\n",
|
||
|
"\n",
|
||
|
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen')\n",
|
||
|
"\n",
|
||
|
"%matplotlib inline\n",
|
||
|
"from matplotlib import pyplot, rcParams\n",
|
||
|
"from jupyter_progressbar import ProgressBar\n",
|
||
|
"\n",
|
||
|
"# rcParams['font.family'] = 'Lucinda Console'\n",
|
||
|
"rcParams['font.size'] = '24'\n",
|
||
|
"rcParams['figure.figsize'] = (20, 10)\n",
|
||
|
"rcParams['figure.dpi'] = 100"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def simplify_area_name(x):\n",
|
||
|
" return ' '.join(\n",
|
||
|
" x.split('/') # split Dutch and Frysian name\n",
|
||
|
" [0] # extract Dutch name\n",
|
||
|
" .strip()\n",
|
||
|
" .split(' ') # Split area name from province, mostly 'Fr'\n",
|
||
|
" [:-1] # remove province\n",
|
||
|
" ).strip().lower() # rejoin spaces in area name"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"metadata = pandas.read_sql('''SELECT answer.* FROM core_surveyresultquestionanswer as answer''', db)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"grouped = metadata.groupby(['survey_result_id', 'question_id']).agg({\n",
|
||
|
" 'question_text': 'first',\n",
|
||
|
" 'answer_text': lambda x: x if len(x) == 1 else ', '.join(x)\n",
|
||
|
"})\n",
|
||
|
"grouped.reset_index(inplace=True)\n",
|
||
|
"\n",
|
||
|
"grouped = grouped.pivot(index='survey_result_id', columns='question_text', values='answer_text')\n",
|
||
|
"\n",
|
||
|
"grouped = grouped.rename({\n",
|
||
|
" 'Do you go to school?': 'school',\n",
|
||
|
" 'Do you go to university?': 'university',\n",
|
||
|
" 'What is your age bracket?': 'age_bracket',\n",
|
||
|
" 'What is your age?': 'age',\n",
|
||
|
" 'What is your gender?': 'gender',\n",
|
||
|
" 'Which language are you the most proficient in?': 'language',\n",
|
||
|
" 'Which languages do you actively use in your life?': 'active-languages'\n",
|
||
|
"}, axis='columns')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"metadata": {
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"predictions = pandas.read_sql('''\n",
|
||
|
"SELECT \n",
|
||
|
" sr.id as id,\n",
|
||
|
" sr.area_name as actual_area,\n",
|
||
|
" area1_name as area_prediction_1,\n",
|
||
|
" area2_name as area_prediction_2,\n",
|
||
|
" area3_name as area_prediction_3\n",
|
||
|
"FROM core_surveyresult as sr\n",
|
||
|
"INNER JOIN core_predictionquizresult as pq\n",
|
||
|
" ON sr.id = pq.survey_result_id\n",
|
||
|
"''', db)\n",
|
||
|
"\n",
|
||
|
"predicted_areas = set(map(simplify_area_name,\n",
|
||
|
" set(predictions['area_prediction_1']) |\n",
|
||
|
" set(predictions['area_prediction_2']) |\n",
|
||
|
" set(predictions['area_prediction_3'])\n",
|
||
|
"))\n",
|
||
|
"actual_areas = set(map(str.lower, predictions['actual_area']))\n",
|
||
|
"\n",
|
||
|
"areas = list(predicted_areas | actual_areas)\n",
|
||
|
"location_to_number = {l: i for i, l in enumerate(areas)}"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"metadata": {
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"simplified_predictions = pandas.DataFrame({\n",
|
||
|
" 'id': list(predictions['id']),\n",
|
||
|
" 'actual': list(map(str.lower, predictions['actual_area'])),\n",
|
||
|
" 'prediction_1': list(map(simplify_area_name, predictions['area_prediction_1'])),\n",
|
||
|
" 'prediction_2': list(map(simplify_area_name, predictions['area_prediction_2'])),\n",
|
||
|
" 'prediction_3': list(map(simplify_area_name, predictions['area_prediction_3'])),\n",
|
||
|
"})\n",
|
||
|
"# simplified_predictions.set_index('id')\n",
|
||
|
"simplified_predictions.to_excel('actual-predictions.xls')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"locations = {location for c in simplified_predictions.columns for location in simplified_predictions[c] if c != 'id'}"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"metadata": {
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"names = pandas.read_csv('plaatsen_nl.csv')\n",
|
||
|
"\n",
|
||
|
"nonominatim = {\n",
|
||
|
" name: [row['st_y'], row['x']]\n",
|
||
|
" for _, row in names.iterrows()\n",
|
||
|
" for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']\n",
|
||
|
"# for _ in [ print(row[column]) ]\n",
|
||
|
" if type(row[column]) == str\n",
|
||
|
" for name in [row[column], row[column].lower().replace('-', ' ')]\n",
|
||
|
"}"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"application/vnd.jupyter.widget-view+json": {
|
||
|
"model_id": "21c016f24e23473e807ed3e9c2d942c6",
|
||
|
"version_major": 2,
|
||
|
"version_minor": 0
|
||
|
},
|
||
|
"text/plain": [
|
||
|
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
||
|
]
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"nominatim = {\n",
|
||
|
" l: json.loads(\n",
|
||
|
" requests.get(\n",
|
||
|
" 'https://nominatim.openstreetmap.org/search.php?q=Netherlands%20'\n",
|
||
|
" '{}&polygon_geojson=1&viewbox=&format=json'.format(l)\n",
|
||
|
" ).text\n",
|
||
|
" )\n",
|
||
|
" for l in ProgressBar(locations)\n",
|
||
|
" if l not in nonominatim\n",
|
||
|
"}"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 28,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"latlons = {\n",
|
||
|
" l: (float(v[0]['lat']), float(v[0]['lon']))\n",
|
||
|
" for l, v in nominatim.items()\n",
|
||
|
" if len(v) > 0\n",
|
||
|
"}\n",
|
||
|
"latlons.update(nonominatim)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 29,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"for c in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:\n",
|
||
|
" simplified_predictions['{}_latlon'.format(c)] = [\n",
|
||
|
" latlons.get(l, numpy.nan)\n",
|
||
|
" for l in simplified_predictions['{}'.format(c)]\n",
|
||
|
" ]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 30,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"for c in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:\n",
|
||
|
" simplified_predictions['{}_distance'.format(c)] = [\n",
|
||
|
" vincenty(x, y) if x == x and y == y else numpy.nan\n",
|
||
|
" for x, y in zip(simplified_predictions['actual_latlon'], simplified_predictions[c])\n",
|
||
|
" ]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 31,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"simplified_predictions = simplified_predictions[[\n",
|
||
|
" 'id', 'actual', 'actual_latlon', 'prediction_3_latlon_distance',\n",
|
||
|
" 'prediction_1_latlon_distance', 'prediction_2_latlon_distance'\n",
|
||
|
"]]\n",
|
||
|
"\n",
|
||
|
"simplified_predictions = simplified_predictions.rename({\n",
|
||
|
" 'prediction_3_latlon_distance': 'distance3',\n",
|
||
|
" 'prediction_1_latlon_distance': 'distance1',\n",
|
||
|
" 'prediction_2_latlon_distance': 'distance2'\n",
|
||
|
"}, axis='columns')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 32,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"simplified_predictions = simplified_predictions.join(grouped, on='id')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 33,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"simplified_predictions['latitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[0] if x == x else None)\n",
|
||
|
"simplified_predictions['longitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[1] if x == x else None)\n",
|
||
|
"simplified_predictions = simplified_predictions.drop('actual_latlon', axis='columns')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 34,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"simplified_predictions['age_groups'] = [\n",
|
||
|
" {'0-10': '0-20', '11-20': '0-20',\n",
|
||
|
" '21-30': '21-50', '31-40': '21-50', '41-50': '21-50',\n",
|
||
|
" '51-60': '51-100', '61-70': '51-100', '71-80': '51-100', '81-90': '51-100', '91-100': '51-100'}.get(b, None)\n",
|
||
|
" for b in simplified_predictions['age_bracket']\n",
|
||
|
"]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 35,
|
||
|
"metadata": {
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({\n",
|
||
|
"# 'distance1': ['mean', 'min', 'max', 'count', 'size'],\n",
|
||
|
"# 'latitude': 'first',\n",
|
||
|
"# 'longitude': 'first'\n",
|
||
|
"# })\n",
|
||
|
"# age_groups.index.get_level_values('age_groups')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 36,
|
||
|
"metadata": {
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({\n",
|
||
|
"# 'distance1': ['min', 'mean', 'max', 'count', 'size'],\n",
|
||
|
"# 'latitude': 'first',\n",
|
||
|
"# 'longitude': 'first'\n",
|
||
|
"# })\n",
|
||
|
"# gender_groups"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 40,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"summary = simplified_predictions[['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 41,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"summary.to_csv('points.csv')"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"geojson = {\n",
|
||
|
" \"type\": \"FeatureCollection\",\n",
|
||
|
" \"features\": [\n",
|
||
|
" {\n",
|
||
|
" \"type\": \"Feature\",\n",
|
||
|
" \"properties\": {\n",
|
||
|
" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||
|
" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||
|
" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||
|
" \"actual\": actual\n",
|
||
|
" },\n",
|
||
|
" \"geometry\": {\n",
|
||
|
" \"type\": \"Point\",\n",
|
||
|
" \"coordinates\": list( actual_lat_lon )[::-1]\n",
|
||
|
" }\n",
|
||
|
" }\n",
|
||
|
" for actual, row in simplified_predictions.groupby('actual')\n",
|
||
|
" if actual != ''\n",
|
||
|
"# for _ in [ print(row['actual_latlon']), print() ]\n",
|
||
|
" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",
|
||
|
" if actual_lat_lon == actual_lat_lon\n",
|
||
|
" ]\n",
|
||
|
"}"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"geojson = {\n",
|
||
|
" \"type\": \"FeatureCollection\",\n",
|
||
|
" \"features\": [\n",
|
||
|
" {\n",
|
||
|
" \"type\": \"Feature\",\n",
|
||
|
" \"properties\": {\n",
|
||
|
" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||
|
" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||
|
" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||
|
" \"actual\": actual\n",
|
||
|
" },\n",
|
||
|
" \"geometry\": {\n",
|
||
|
" \"type\": \"Point\",\n",
|
||
|
" \"coordinates\": list( actual_lat_lon )[::-1]\n",
|
||
|
" }\n",
|
||
|
" }\n",
|
||
|
" for actual, row in simplified_predictions.groupby('actual')\n",
|
||
|
" if actual != ''\n",
|
||
|
"# for _ in [ print(row['actual_latlon']), print() ]\n",
|
||
|
" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",
|
||
|
" if actual_lat_lon == actual_lat_lon\n",
|
||
|
" ]\n",
|
||
|
"}"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.6.5"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|