13 KiB
13 KiB
Pronunciation-based location prediction confusion
Set up a pandas DataFrame in which each row contains:
- the location provided by the participant (the actual location),
- three location estimates made by Nanna's heuristic, based on what the participant stated to be the correct pronunciation of a word,
- the distance between the actual location and each location predicted by the heuristic.
Averages of the distances are exported for visualisation in QGIS.
In [1]:
import pickle import pandas import MySQLdb import numpy import itertools import requests import json from vincenty import vincenty db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen') %matplotlib inline from matplotlib import pyplot, rcParams from jupyter_progressbar import ProgressBar # rcParams['font.family'] = 'Lucinda Console' rcParams['font.size'] = '24' rcParams['figure.figsize'] = (20, 10) rcParams['figure.dpi'] = 100
In [2]:
def simplify_area_name(x):
    """Reduce a bilingual area label to its lower-cased Dutch place name.

    The text before the first '/' (the Dutch name; the Frisian name follows
    the slash) is trimmed and split on single spaces. The last token, which
    is the province (mostly 'Fr'), is dropped; the remaining tokens are
    re-joined with spaces, trimmed and lower-cased.

    A label made of a single token therefore yields an empty string.
    """
    dutch_label = x.split('/')[0].strip()
    tokens = dutch_label.split(' ')
    place_tokens = tokens[:-1]  # the trailing token is the province
    return ' '.join(place_tokens).strip().lower()
In [3]:
# Load every stored survey answer (one row per answer record) from the
# `stimmen` database into a DataFrame.
metadata = pandas.read_sql(
    sql='''SELECT answer.* FROM core_surveyresultquestionanswer as answer''',
    con=db,
)
In [4]:
# Collapse the answers to one value per (participant, question). A question
# can have several answer rows; those are combined into one comma-separated
# string, while a single answer is kept as-is.
answers_per_question = (
    metadata
    .groupby(['survey_result_id', 'question_id'])
    .agg({
        'question_text': 'first',
        # The aggregator has to return a scalar: returning the length-1
        # Series itself is rejected by current pandas versions, so take its
        # only element instead.
        'answer_text': lambda answers: (
            answers.iloc[0] if len(answers) == 1 else ', '.join(answers)
        ),
    })
    .reset_index()
)

# One row per participant, one column per question, with short column names
# for the questions used later in the notebook.
grouped = (
    answers_per_question
    .pivot(index='survey_result_id', columns='question_text', values='answer_text')
    .rename({
        'Do you go to school?': 'school',
        'Do you go to university?': 'university',
        'What is your age bracket?': 'age_bracket',
        'What is your age?': 'age',
        'What is your gender?': 'gender',
        'Which language are you the most proficient in?': 'language',
        'Which languages do you actively use in your life?': 'active-languages'
    }, axis='columns')
)
In [5]:
# Per survey result: the area the participant reported plus the three areas
# predicted by the quiz.
predictions = pandas.read_sql(
    sql='''
    SELECT
        sr.id as id,
        sr.area_name as actual_area,
        area1_name as area_prediction_1,
        area2_name as area_prediction_2,
        area3_name as area_prediction_3
    FROM core_surveyresult as sr
    INNER JOIN core_predictionquizresult as pq
        ON sr.id = pq.survey_result_id
    ''',
    con=db,
)

# Every distinct predicted label, normalised to a bare lower-case place name.
raw_predicted_labels = set().union(
    predictions['area_prediction_1'],
    predictions['area_prediction_2'],
    predictions['area_prediction_3'],
)
predicted_areas = {simplify_area_name(label) for label in raw_predicted_labels}

# Actual areas only need lower-casing.
actual_areas = {str.lower(label) for label in predictions['actual_area']}

# All known area names and an arbitrary integer id for each of them.
areas = list(predicted_areas.union(actual_areas))
location_to_number = {area: number for number, area in enumerate(areas)}
In [6]:
# Build a frame with normalised (lower-case, province-free) area names for the
# actual location and each of the three predictions.
frame_data = {
    'id': list(predictions['id']),
    'actual': [str.lower(area) for area in predictions['actual_area']],
}
for rank in (1, 2, 3):
    frame_data['prediction_{}'.format(rank)] = [
        simplify_area_name(area)
        for area in predictions['area_prediction_{}'.format(rank)]
    ]
simplified_predictions = pandas.DataFrame(frame_data)

# Keep a spreadsheet copy of the actual/predicted names.
simplified_predictions.to_excel('actual-predictions.xls')
In [8]:
# Collect every distinct place name occurring in any column except 'id'.
locations = set()
for column in simplified_predictions.columns:
    if column == 'id':
        continue
    locations.update(simplified_predictions[column])
In [9]:
# Local gazetteer: place names with coordinates, used instead of an online
# lookup whenever a name is present here.
names = pandas.read_csv('plaatsen_nl.csv')

# Map each spelling of a place (as written, and lower-cased with hyphens
# replaced by spaces) to its [st_y, x] coordinate pair.
# NOTE(review): the pair is later used as (lat, lon); confirm that 'st_y' and
# 'x' are the latitude and longitude columns of the CSV.
nonominatim = {}
for _, place in names.iterrows():
    for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']:
        value = place[column]
        if type(value) != str:  # skips missing (NaN) name cells
            continue
        for spelling in [value, value.lower().replace('-', ' ')]:
            nonominatim[spelling] = [place['st_y'], place['x']]
In [10]:
NOMINATIM_SEARCH_URL = 'https://nominatim.openstreetmap.org/search.php'


def _nominatim_search(place, timeout=30):
    """Return the decoded Nominatim search results for ``place`` in the Netherlands.

    The query is passed through ``params`` so that place names containing
    spaces, quotes, '&' or '#' are URL-encoded instead of corrupting the
    request. A timeout keeps a stalled connection from hanging the notebook,
    and HTTP errors raise immediately rather than surfacing later as a JSON
    decoding error.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx response.
    """
    response = requests.get(
        NOMINATIM_SEARCH_URL,
        params={
            'q': 'Netherlands {}'.format(place),
            'polygon_geojson': 1,
            'viewbox': '',
            'format': 'json',
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()


# Look up online only those places that the local gazetteer does not cover.
nominatim = {
    place: _nominatim_search(place)
    for place in ProgressBar(locations)
    if place not in nonominatim
}
VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…
In [28]:
# Take the first Nominatim hit of each place as its (lat, lon); places without
# any hit are left out.
latlons = {}
for place, matches in nominatim.items():
    if len(matches) == 0:
        continue
    first_hit = matches[0]
    latlons[place] = (float(first_hit['lat']), float(first_hit['lon']))

# Gazetteer coordinates are added last, so they win for names present in both.
latlons.update(nonominatim)
In [29]:
# For every name column add a '<column>_latlon' column holding the looked-up
# coordinate pair, or NaN when the place is unknown.
for column in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:
    coordinates = []
    for place in simplified_predictions[column]:
        coordinates.append(latlons.get(place, numpy.nan))
    simplified_predictions[column + '_latlon'] = coordinates
In [30]:
# Distance between the actual location and each predicted location, stored in
# '<column>_distance'. NaN when either coordinate is missing.
# NOTE(review): presumably kilometres (the vincenty package default) - confirm.
actual_positions = simplified_predictions['actual_latlon']
for column in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:
    distances = []
    for actual, predicted in zip(actual_positions, simplified_predictions[column]):
        # NaN is the only value that differs from itself, so self-equality
        # tells a real coordinate pair apart from a missing one.
        both_known = actual == actual and predicted == predicted
        distances.append(vincenty(actual, predicted) if both_known else numpy.nan)
    simplified_predictions[column + '_distance'] = distances
In [31]:
# Keep only the identifying columns and the three distances, giving the
# distance columns short names.
distance_renames = [
    ('prediction_3_latlon_distance', 'distance3'),
    ('prediction_1_latlon_distance', 'distance1'),
    ('prediction_2_latlon_distance', 'distance2'),
]
kept_columns = ['id', 'actual', 'actual_latlon'] + [old for old, _ in distance_renames]
simplified_predictions = (
    simplified_predictions[kept_columns]
    .rename(columns=dict(distance_renames))
)
In [32]:
# Attach the pivoted survey answers (`grouped`, indexed by survey_result_id)
# to every prediction row by matching the 'id' column against that index.
# `join` defaults to a left join, so rows without answers are kept.
simplified_predictions = simplified_predictions.join(grouped, on='id')
In [33]:
# Split the (lat, lon) pair of the actual location into two scalar columns and
# drop the pair column. A missing pair (NaN, detected via NaN != NaN) gives
# None in both columns.
for position, column in enumerate(('latitude', 'longitude')):
    simplified_predictions[column] = simplified_predictions['actual_latlon'].map(
        lambda latlon, i=position: latlon[i] if latlon == latlon else None
    )
simplified_predictions = simplified_predictions.drop(columns='actual_latlon')
In [34]:
# Merge the ten-year age brackets into three coarse groups; any bracket not
# listed here (including a missing one) maps to None.
age_group_by_bracket = {
    bracket: group
    for group, brackets in (
        ('0-20', ('0-10', '11-20')),
        ('21-50', ('21-30', '31-40', '41-50')),
        ('51-100', ('51-60', '61-70', '71-80', '81-90', '91-100')),
    )
    for bracket in brackets
}
simplified_predictions['age_groups'] = [
    age_group_by_bracket.get(bracket, None)
    for bracket in simplified_predictions['age_bracket']
]
In [35]:
# Disabled exploratory aggregation (distance statistics per age group and place):
# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({
#     'distance1': ['mean', 'min', 'max', 'count', 'size'],
#     'latitude': 'first',
#     'longitude': 'first'
# })
# age_groups.index.get_level_values('age_groups')
In [36]:
# Disabled exploratory aggregation (distance statistics per gender and place):
# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({
#     'distance1': ['min', 'mean', 'max', 'count', 'size'],
#     'latitude': 'first',
#     'longitude': 'first'
# })
# gender_groups
In [40]:
# Per-participant export table: position of the actual location, the three
# prediction distances and the place name.
summary_columns = ['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']
summary = simplified_predictions[summary_columns]
In [41]:
# Write the summary table to 'points.csv' in the working directory. The
# DataFrame index is written as the first column (pandas' default).
summary.to_csv('points.csv')
In [ ]:
# Build one GeoJSON point feature per actual place, carrying the mean of each
# prediction distance over all participants from that place.
#
# The columns read here are the ones that exist at this point of the notebook
# ('distance1'..'distance3', 'latitude', 'longitude'); the earlier
# 'prediction_N_latlon_distance' / 'actual_latlon' columns have been renamed
# and dropped by then, so referring to them raises a KeyError on a fresh run.
# NOTE(review): an identical cell follows this one in the notebook; one of the
# two can be removed.

# Written instead of a mean when any distance of the place is missing.
MISSING_DISTANCE = -0.0001

geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {
                "distance 1": group['distance1'].mean() if group['distance1'].notnull().all() else MISSING_DISTANCE,
                "distance 2": group['distance2'].mean() if group['distance2'].notnull().all() else MISSING_DISTANCE,
                "distance 3": group['distance3'].mean() if group['distance3'].notnull().all() else MISSING_DISTANCE,
                "actual": actual
            },
            "geometry": {
                "type": "Point",
                # GeoJSON positions are ordered [longitude, latitude].
                "coordinates": [float(longitude), float(latitude)]
            }
        }
        for actual, group in simplified_predictions.groupby('actual')
        if actual != ''
        # Alias the position of the group's first row.
        for latitude, longitude in [(group['latitude'].iloc[0], group['longitude'].iloc[0])]
        # Skip places whose position is unknown.
        if pandas.notnull(latitude) and pandas.notnull(longitude)
    ]
}
In [15]:
# Build one GeoJSON point feature per actual place, carrying the mean of each
# prediction distance over all participants from that place.
#
# The columns read here are the ones that exist at this point of the notebook
# ('distance1'..'distance3', 'latitude', 'longitude'); the earlier
# 'prediction_N_latlon_distance' / 'actual_latlon' columns have been renamed
# and dropped by then, so referring to them raises a KeyError on a fresh run.
# NOTE(review): an identical cell precedes this one in the notebook; one of
# the two can be removed.

# Written instead of a mean when any distance of the place is missing.
MISSING_DISTANCE = -0.0001

geojson = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {
                "distance 1": group['distance1'].mean() if group['distance1'].notnull().all() else MISSING_DISTANCE,
                "distance 2": group['distance2'].mean() if group['distance2'].notnull().all() else MISSING_DISTANCE,
                "distance 3": group['distance3'].mean() if group['distance3'].notnull().all() else MISSING_DISTANCE,
                "actual": actual
            },
            "geometry": {
                "type": "Point",
                # GeoJSON positions are ordered [longitude, latitude].
                "coordinates": [float(longitude), float(latitude)]
            }
        }
        for actual, group in simplified_predictions.groupby('actual')
        if actual != ''
        # Alias the position of the group's first row.
        for latitude, longitude in [(group['latitude'].iloc[0], group['longitude'].iloc[0])]
        # Skip places whose position is unknown.
        if pandas.notnull(latitude) and pandas.notnull(longitude)
    ]
}