stimmenfryslan/notebooks/Prediction Confusion Map.ipynb

13 KiB

Pronunciation-based location prediction confusion

Set up a pandas dataframe in which each row contains:

  • participant provided (actual) location,
  • 3 location estimates made by Nanna's heuristic, based on what the participant stated to be the correct pronunciation of a word
  • distance between the actual and heuristic predicted location

Averages of the distances are exported for visualisation in QGIS.

In [1]:
import itertools
import json
import os
import pickle

import MySQLdb
import numpy
import pandas
import requests
from vincenty import vincenty

# Open the MySQL connection used by every `pandas.read_sql` call below.
# Credentials come from the environment so that no secret is stored in the
# notebook: STIMMEN_DB_PASSWORD is required (a KeyError is raised when it is
# missing); STIMMEN_DB_USER and STIMMEN_DB_NAME fall back to the values this
# notebook used before.
# NOTE(review): a password was previously committed in this cell; treat it as
# compromised and rotate it.
db = MySQLdb.connect(
    user=os.environ.get('STIMMEN_DB_USER', 'root'),
    passwd=os.environ['STIMMEN_DB_PASSWORD'],
    db=os.environ.get('STIMMEN_DB_NAME', 'stimmen'),
)

%matplotlib inline
from matplotlib import pyplot, rcParams
from jupyter_progressbar import ProgressBar

# Global matplotlib defaults for every figure in this notebook:
# large font, wide 20x10 figures, 100 dpi.
# rcParams['font.family'] = 'Lucinda Console'
rcParams.update({
    'font.size': '24',
    'figure.figsize': (20, 10),
    'figure.dpi': 100,
})
In [2]:
def simplify_area_name(x):
    """Reduce a raw area label to a lower-case place name.

    Only the text before the first '/' is used (the Dutch name when a Dutch
    and a Frisian name are both given). Its last space-separated word, the
    province abbreviation (mostly 'Fr'), is removed and the remaining words
    are re-joined with spaces, stripped and lower-cased.

    Note that a label consisting of a single word yields an empty string,
    because that word is treated as the province.
    """
    dutch_name = x.partition('/')[0].strip()
    words = dutch_name.split(' ')
    # Everything except the trailing province abbreviation.
    place_words = words[:-1]
    return ' '.join(place_words).strip().lower()
In [3]:
# Load every answer row of the survey questionnaire through the open
# connection `db`; the rows are reshaped per participant in the next cell.
metadata = pandas.read_sql(
    sql='''SELECT answer.* FROM core_surveyresultquestionanswer as answer''',
    con=db,
)
In [4]:
def _combine_answers(answers):
    """Collapse the answers one participant gave to one question.

    A single answer is returned as-is (a scalar); several answers to the same
    question are joined into one comma-separated string.
    """
    if len(answers) == 1:
        # Return the scalar itself rather than a one-element Series, so the
        # aggregation does not rely on pandas unwrapping the Series for us.
        return answers.iloc[0]
    return ', '.join(answers)


# One row per participant (index: survey_result_id) and one column per
# question, with the long question texts replaced by short column names.
# Built as a single chain so that re-running the cell always starts from
# `metadata` and nothing is modified in place.
grouped = (
    metadata
    .groupby(['survey_result_id', 'question_id'])
    .agg({
        'question_text': 'first',
        'answer_text': _combine_answers,
    })
    .reset_index()
    .pivot(index='survey_result_id', columns='question_text', values='answer_text')
    .rename(columns={
        'Do you go to school?': 'school',
        'Do you go to university?': 'university',
        'What is your age bracket?': 'age_bracket',
        'What is your age?': 'age',
        'What is your gender?': 'gender',
        'Which language are you the most proficient in?': 'language',
        'Which languages do you actively use in your life?': 'active-languages'
    })
)
In [5]:
# Per survey result: the area the participant reported (actual_area) and the
# three areas returned by the prediction quiz.
predictions = pandas.read_sql('''
SELECT 
    sr.id as id,
    sr.area_name as actual_area,
    area1_name as area_prediction_1,
    area2_name as area_prediction_2,
    area3_name as area_prediction_3
FROM core_surveyresult as sr
INNER JOIN core_predictionquizresult as pq
    ON sr.id = pq.survey_result_id
''', db)

# Every distinct area that was ever predicted, normalised with
# simplify_area_name so it can be compared with the actual area names.
predicted_areas = {
    simplify_area_name(area)
    for prediction_column in ('area_prediction_1', 'area_prediction_2', 'area_prediction_3')
    for area in predictions[prediction_column]
}
actual_areas = set(map(str.lower, predictions['actual_area']))

# Sorted so that the area -> number mapping is the same on every run; the
# iteration order of a set of strings is not stable between kernel restarts.
areas = sorted(predicted_areas | actual_areas)
location_to_number = {l: i for i, l in enumerate(areas)}
In [6]:
# Normalised copy of `predictions`: lower-cased actual area and the three
# predicted areas reduced to plain place names, so both sides can be looked
# up in the same coordinate table. 'id' stays a regular column (not the index).
simplified_columns = {
    'id': list(predictions['id']),
    'actual': [str.lower(area) for area in predictions['actual_area']],
}
for rank in (1, 2, 3):
    simplified_columns['prediction_{}'.format(rank)] = [
        simplify_area_name(area)
        for area in predictions['area_prediction_{}'.format(rank)]
    ]
simplified_predictions = pandas.DataFrame(simplified_columns)

# Export the name-level comparison for manual inspection.
# NOTE(review): writing the legacy .xls format depends on the installed
# pandas/Excel engine - confirm it is still supported in the target environment.
simplified_predictions.to_excel('actual-predictions.xls')
In [8]:
# Collect every distinct place name (actual or predicted) that needs
# coordinates; the 'id' column holds no place names and is skipped.
locations = set()
for column_name in simplified_predictions.columns:
    if column_name == 'id':
        continue
    locations.update(simplified_predictions[column_name])
In [9]:
# Local table of places; names found here do not have to be looked up
# through the Nominatim web service.
names = pandas.read_csv('plaatsen_nl.csv')

# Maps every spelling of a place (as written, and lower-cased with hyphens
# replaced by spaces) to its [st_y, x] coordinate pair. When two rows share
# a spelling, the later row wins.
# NOTE(review): coordinates are read from the 'st_y' and 'x' columns -
# presumably latitude and longitude; confirm the column names against the CSV.
nonominatim = {}
for _row_label, place in names.iterrows():
    for name_column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']:
        spelling = place[name_column]
        if type(spelling) != str:
            # Non-string cells (e.g. missing values) carry no name.
            continue
        for alias in [spelling, spelling.lower().replace('-', ' ')]:
            nonominatim[alias] = [place['st_y'], place['x']]
In [10]:
NOMINATIM_SEARCH_URL = 'https://nominatim.openstreetmap.org/search.php'


def _nominatim_search(place):
    """Return the decoded Nominatim search results for ``place``.

    The query is 'Netherlands <place>'. Parameters are passed through
    ``params`` so that place names containing characters such as '&' or '#'
    are URL-encoded correctly. A timeout prevents a stalled request from
    blocking the notebook, and HTTP errors raise ``requests.HTTPError``
    instead of being stored as if they were search results.
    """
    response = requests.get(
        NOMINATIM_SEARCH_URL,
        params={
            'q': 'Netherlands {}'.format(place),
            'polygon_geojson': 1,
            'viewbox': '',
            'format': 'json',
        },
        timeout=30,
    )
    response.raise_for_status()
    return response.json()


# Raw search results for every place that is not in the local table.
# NOTE(review): this sends one request per place without any delay - check
# the Nominatim usage policy (rate limit, identifying User-Agent) before
# running this against the public server.
nominatim = {
    l: _nominatim_search(l)
    for l in ProgressBar(locations)
    if l not in nonominatim
}
VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…
In [28]:
# Place name -> coordinates. For places resolved through Nominatim the first
# search result is used; places without any result are left out.
latlons = {}
for place, search_results in nominatim.items():
    if len(search_results) > 0:
        first_result = search_results[0]
        latlons[place] = (float(first_result['lat']), float(first_result['lon']))

# Add the coordinates from the local place table.
latlons.update(nonominatim)
In [29]:
# For every place-name column add a '<column>_latlon' column holding the
# coordinates found in `latlons`, or NaN when the place is unknown.
for name_column in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:
    coordinates = []
    for place in simplified_predictions[name_column]:
        coordinates.append(latlons.get(place, numpy.nan))
    simplified_predictions['{}_latlon'.format(name_column)] = coordinates
In [30]:
# For each prediction add a '<column>_distance' column with the vincenty
# distance between the actual and the predicted location, or NaN when either
# location is unknown.
# NOTE(review): presumably kilometres (the vincenty package default) - confirm.
for latlon_column in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:
    distances = []
    position_pairs = zip(simplified_predictions['actual_latlon'], simplified_predictions[latlon_column])
    for actual_position, predicted_position in position_pairs:
        # NaN is the only value that is not equal to itself, so these
        # comparisons are True exactly when a coordinate pair is present.
        if actual_position == actual_position and predicted_position == predicted_position:
            distances.append(vincenty(actual_position, predicted_position))
        else:
            distances.append(numpy.nan)
    simplified_predictions['{}_distance'.format(latlon_column)] = distances
In [31]:
# Keep only the identifying columns and the three distances, and give the
# distance columns short names (distance1, distance2, distance3).
simplified_predictions = (
    simplified_predictions[[
        'id', 'actual', 'actual_latlon', 'prediction_3_latlon_distance',
        'prediction_1_latlon_distance', 'prediction_2_latlon_distance'
    ]]
    .rename(columns={
        'prediction_3_latlon_distance': 'distance3',
        'prediction_1_latlon_distance': 'distance1',
        'prediction_2_latlon_distance': 'distance2'
    })
)
In [32]:
# Attach the per-participant questionnaire answers: the 'id' column is matched
# against the index of `grouped` (survey_result_id). Rows without answers keep
# NaN in the added columns, because DataFrame.join is a left join by default.
simplified_predictions = simplified_predictions.join(grouped, on='id')
In [33]:
# Split the actual location into separate 'latitude' and 'longitude' columns
# (missing when the location is unknown) and drop the combined column.
actual_positions = simplified_predictions['actual_latlon']
simplified_predictions = (
    simplified_predictions
    .assign(
        # NaN != NaN, so the comparison is False exactly for unknown locations.
        latitude=actual_positions.map(lambda latlon: latlon[0] if latlon == latlon else None),
        longitude=actual_positions.map(lambda latlon: latlon[1] if latlon == latlon else None),
    )
    .drop(columns='actual_latlon')
)
In [34]:
# Ten-year age brackets from the questionnaire mapped onto three coarse age
# groups. Defined once here instead of being rebuilt for every row.
AGE_BRACKET_TO_GROUP = {
    '0-10': '0-20', '11-20': '0-20',
    '21-30': '21-50', '31-40': '21-50', '41-50': '21-50',
    '51-60': '51-100', '61-70': '51-100', '71-80': '51-100', '81-90': '51-100', '91-100': '51-100',
}

# Brackets that are missing or not listed above get None.
simplified_predictions['age_groups'] = [
    AGE_BRACKET_TO_GROUP.get(bracket)
    for bracket in simplified_predictions['age_bracket']
]
In [35]:
# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({
#     'distance1': ['mean', 'min', 'max', 'count', 'size'],
#     'latitude': 'first',
#     'longitude': 'first'
# })
# age_groups.index.get_level_values('age_groups')
In [36]:
# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({
#     'distance1': ['min', 'mean', 'max', 'count', 'size'],
#     'latitude': 'first',
#     'longitude': 'first'
# })
# gender_groups
In [40]:
# One point per participant: location, the three prediction distances and the
# name of the actual area.
summary_columns = ['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']
summary = simplified_predictions.loc[:, summary_columns]
In [41]:
# Write the per-participant points to disk (the row index is included as the
# first column).
summary.to_csv(path_or_buf='points.csv')
In [ ]:
# Build a GeoJSON FeatureCollection with one point per actual area, carrying
# the mean distance of each of the three predictions.
#
# This uses the columns that exist at this point in the notebook
# ('distance1'..'distance3', 'latitude', 'longitude'). The earlier column
# names ('prediction_N_latlon_distance', 'actual_latlon') have been renamed or
# dropped by the preceding cells and would raise a KeyError here.

# Exported instead of a mean when any distance within the area is missing.
MISSING_DISTANCE = -0.0001

features = []
for actual, area_rows in simplified_predictions.groupby('actual'):
    if actual == '':
        continue

    # The coordinates are derived from the area name, so the first row is
    # representative for the whole group.
    latitude = area_rows['latitude'].iloc[0]
    longitude = area_rows['longitude'].iloc[0]
    if pandas.isnull(latitude) or pandas.isnull(longitude):
        # Areas without known coordinates cannot be placed on the map.
        continue

    properties = {}
    for rank in (1, 2, 3):
        distances = area_rows['distance{}'.format(rank)]
        properties['distance {}'.format(rank)] = (
            distances.mean() if distances.notnull().all() else MISSING_DISTANCE
        )
    properties['actual'] = actual

    features.append({
        "type": "Feature",
        "properties": properties,
        "geometry": {
            "type": "Point",
            # GeoJSON positions are ordered longitude, latitude.
            "coordinates": [float(longitude), float(latitude)]
        }
    })

geojson = {
    "type": "FeatureCollection",
    "features": features
}
In [15]:
# Build a GeoJSON FeatureCollection with one point per actual area, carrying
# the mean distance of each of the three predictions.
# NOTE(review): this cell repeats the GeoJSON construction of the preceding
# cell; one of the two can be removed.
#
# This uses the columns that exist at this point in the notebook
# ('distance1'..'distance3', 'latitude', 'longitude'). The earlier column
# names ('prediction_N_latlon_distance', 'actual_latlon') have been renamed or
# dropped by the preceding cells and would raise a KeyError here.

# Exported instead of a mean when any distance within the area is missing.
MISSING_DISTANCE = -0.0001

features = []
for actual, area_rows in simplified_predictions.groupby('actual'):
    if actual == '':
        continue

    # The coordinates are derived from the area name, so the first row is
    # representative for the whole group.
    latitude = area_rows['latitude'].iloc[0]
    longitude = area_rows['longitude'].iloc[0]
    if pandas.isnull(latitude) or pandas.isnull(longitude):
        # Areas without known coordinates cannot be placed on the map.
        continue

    properties = {}
    for rank in (1, 2, 3):
        distances = area_rows['distance{}'.format(rank)]
        properties['distance {}'.format(rank)] = (
            distances.mean() if distances.notnull().all() else MISSING_DISTANCE
        )
    properties['actual'] = actual

    features.append({
        "type": "Feature",
        "properties": properties,
        "geometry": {
            "type": "Point",
            # GeoJSON positions are ordered longitude, latitude.
            "coordinates": [float(longitude), float(latitude)]
        }
    })

geojson = {
    "type": "FeatureCollection",
    "features": features
}