stimmenfryslan/notebooks/Prediction Confusion Map.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pronunciation-based location prediction confusion\n",
    "\n",
    "Set up a pandas DataFrame in which each row contains:\n",
    "\n",
    " * the location provided by the participant (the actual location),\n",
    " * the 3 estimates made by Nanna's heuristic, based on what the participant stated to be the correct pronunciation of a word,\n",
    " * the distance between the actual location and each heuristically predicted location.\n",
    " \n",
    "Averages of the distances are exported for visualisation in QGIS."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "import pickle\n",
    "import pandas\n",
    "import MySQLdb\n",
    "import numpy\n",
    "import itertools\n",
    "import requests\n",
    "import json\n",
    "from vincenty import vincenty\n",
    "\n",
    "# Connection settings are read from the environment so that no credentials are\n",
    "# stored in the notebook; the password is prompted for when it is not set.\n",
    "# NOTE(review): the password previously committed in this cell should be rotated.\n",
    "db = MySQLdb.connect(\n",
    "    user=os.environ.get('STIMMEN_DB_USER', 'root'),\n",
    "    passwd=os.environ.get('STIMMEN_DB_PASSWORD') or getpass.getpass('MySQL password: '),\n",
    "    db=os.environ.get('STIMMEN_DB_NAME', 'stimmen'),\n",
    ")\n",
    "\n",
    "%matplotlib inline\n",
    "from matplotlib import pyplot, rcParams\n",
    "from jupyter_progressbar import ProgressBar\n",
    "\n",
    "# rcParams['font.family'] = 'Lucinda Console'\n",
    "rcParams['font.size'] = '24'\n",
    "rcParams['figure.figsize'] = (20, 10)\n",
    "rcParams['figure.dpi'] = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simplify_area_name(x):\n",
    "    \"\"\"Return the lower-cased Dutch area name without its trailing province token.\n",
    "\n",
    "    The text before the first '/' is taken as the Dutch name (the part after it\n",
    "    is the Frisian name). That name is split on single spaces and its last\n",
    "    token (the province, mostly 'Fr') is dropped, so a name consisting of a\n",
    "    single token yields an empty string.\n",
    "    \"\"\"\n",
    "    dutch_name = x.split('/')[0].strip()\n",
    "    tokens_without_province = dutch_name.split(' ')[:-1]\n",
    "    return ' '.join(tokens_without_province).strip().lower()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every row of the survey question answer table.\n",
    "metadata = pandas.read_sql(\n",
    "    sql='''SELECT answer.* FROM core_surveyresultquestionanswer as answer''',\n",
    "    con=db,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def join_answers(answers):\n",
    "    \"\"\"Collapse all answers given to one question into a single string.\n",
    "\n",
    "    Always returns a scalar, as ``agg`` requires: a single answer comes back\n",
    "    unchanged, several answers are joined with ', '.\n",
    "    \"\"\"\n",
    "    return ', '.join(answers)\n",
    "\n",
    "\n",
    "# One row per survey result, one column per question.\n",
    "grouped = (\n",
    "    metadata\n",
    "    .groupby(['survey_result_id', 'question_id'])\n",
    "    .agg({'question_text': 'first', 'answer_text': join_answers})\n",
    "    .reset_index()\n",
    "    .pivot(index='survey_result_id', columns='question_text', values='answer_text')\n",
    "    .rename({\n",
    "        'Do you go to school?': 'school',\n",
    "        'Do you go to university?': 'university',\n",
    "        'What is your age bracket?': 'age_bracket',\n",
    "        'What is your age?': 'age',\n",
    "        'What is your gender?': 'gender',\n",
    "        'Which language are you the most proficient in?': 'language',\n",
    "        'Which languages do you actively use in your life?': 'active-languages'\n",
    "    }, axis='columns')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "predictions = pandas.read_sql('''\n",
    "SELECT \n",
    "    sr.id as id,\n",
    "    sr.area_name as actual_area,\n",
    "    area1_name as area_prediction_1,\n",
    "    area2_name as area_prediction_2,\n",
    "    area3_name as area_prediction_3\n",
    "FROM core_surveyresult as sr\n",
    "INNER JOIN core_predictionquizresult as pq\n",
    "    ON sr.id = pq.survey_result_id\n",
    "''', db)\n",
    "\n",
    "# Every distinct predicted area (any of the three ranks), in simplified form.\n",
    "raw_predicted_areas = set().union(\n",
    "    predictions['area_prediction_1'],\n",
    "    predictions['area_prediction_2'],\n",
    "    predictions['area_prediction_3'],\n",
    ")\n",
    "predicted_areas = {simplify_area_name(area) for area in raw_predicted_areas}\n",
    "actual_areas = {str.lower(area) for area in predictions['actual_area']}\n",
    "\n",
    "# Give each distinct area name its position in the list as a number.\n",
    "areas = list(predicted_areas.union(actual_areas))\n",
    "location_to_number = dict(zip(areas, range(len(areas))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Lower-case the actual area and simplify the three predicted areas,\n",
    "# then save the resulting table as a spreadsheet.\n",
    "prediction_sources = {\n",
    "    'prediction_1': 'area_prediction_1',\n",
    "    'prediction_2': 'area_prediction_2',\n",
    "    'prediction_3': 'area_prediction_3',\n",
    "}\n",
    "frame_columns = {\n",
    "    'id': list(predictions['id']),\n",
    "    'actual': [str.lower(area) for area in predictions['actual_area']],\n",
    "}\n",
    "frame_columns.update(\n",
    "    (target, [simplify_area_name(area) for area in predictions[source]])\n",
    "    for target, source in prediction_sources.items()\n",
    ")\n",
    "\n",
    "simplified_predictions = pandas.DataFrame(frame_columns)\n",
    "simplified_predictions.to_excel('actual-predictions.xls')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All distinct values found in any column other than 'id'.\n",
    "location_columns = [c for c in simplified_predictions.columns if c != 'id']\n",
    "locations = set(itertools.chain.from_iterable(\n",
    "    simplified_predictions[c] for c in location_columns\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "names = pandas.read_csv('plaatsen_nl.csv')\n",
    "\n",
    "\n",
    "def name_variants(row):\n",
    "    \"\"\"Yield every string-valued place name of *row*, verbatim and normalised.\n",
    "\n",
    "    The normalised variant is lower-cased with '-' replaced by a space.\n",
    "    Non-string cells (e.g. NaN for a missing name) are skipped.\n",
    "    \"\"\"\n",
    "    for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']:\n",
    "        value = row[column]\n",
    "        if type(value) == str:\n",
    "            yield value\n",
    "            yield value.lower().replace('-', ' ')\n",
    "\n",
    "\n",
    "# Coordinates from the local place list, keyed by every name variant.\n",
    "# NOTE(review): 'st_y' is paired with 'x' here - confirm these are the intended\n",
    "# latitude/longitude columns of plaatsen_nl.csv.\n",
    "nonominatim = {\n",
    "    name: [row['st_y'], row['x']]\n",
    "    for _, row in names.iterrows()\n",
    "    for name in name_variants(row)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "21c016f24e23473e807ed3e9c2d942c6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def query_nominatim(place):\n",
    "    \"\"\"Return the parsed JSON search result of Nominatim for *place*.\n",
    "\n",
    "    The query is passed through ``params`` so that requests URL-encodes place\n",
    "    names containing spaces, '&', '#' or non-ASCII characters instead of\n",
    "    pasting them into the URL verbatim.\n",
    "\n",
    "    Raises requests.HTTPError on a non-2xx response, rather than failing later\n",
    "    while decoding an error page as JSON.\n",
    "    \"\"\"\n",
    "    response = requests.get(\n",
    "        'https://nominatim.openstreetmap.org/search.php',\n",
    "        params={\n",
    "            'q': 'Netherlands {}'.format(place),\n",
    "            'polygon_geojson': 1,\n",
    "            'viewbox': '',\n",
    "            'format': 'json',\n",
    "        },\n",
    "        timeout=30,  # do not hang the notebook on a stalled connection\n",
    "    )\n",
    "    response.raise_for_status()\n",
    "    return response.json()\n",
    "\n",
    "\n",
    "# Only look up the locations that the local place list could not resolve.\n",
    "nominatim = {\n",
    "    l: query_nominatim(l)\n",
    "    for l in ProgressBar(locations)\n",
    "    if l not in nonominatim\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Coordinates of the first Nominatim hit for every location that had one.\n",
    "geocoded = dict(\n",
    "    (place, (float(hits[0]['lat']), float(hits[0]['lon'])))\n",
    "    for place, hits in nominatim.items()\n",
    "    if len(hits) > 0\n",
    ")\n",
    "# Entries of the local place list override Nominatim results for the same name.\n",
    "latlons = {**geocoded, **nonominatim}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a '<column>_latlon' column with the coordinates of each area name;\n",
    "# names without known coordinates become NaN.\n",
    "for c in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:\n",
    "    simplified_predictions[c + '_latlon'] = [\n",
    "        latlons[area] if area in latlons else numpy.nan\n",
    "        for area in simplified_predictions[c]\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _distance_or_nan(actual, predicted):\n",
    "    \"\"\"Return vincenty(actual, predicted), or NaN when either input is NaN.\n",
    "\n",
    "    A missing coordinate is stored as NaN, the only value that differs from itself.\n",
    "    \"\"\"\n",
    "    if actual != actual or predicted != predicted:\n",
    "        return numpy.nan\n",
    "    return vincenty(actual, predicted)\n",
    "\n",
    "\n",
    "# Distance between the actual location and each of the three predictions.\n",
    "for c in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:\n",
    "    simplified_predictions[c + '_distance'] = [\n",
    "        _distance_or_nan(actual, predicted)\n",
    "        for actual, predicted in zip(simplified_predictions['actual_latlon'], simplified_predictions[c])\n",
    "    ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the id, the actual location and the three distances, and give the\n",
    "# distance columns short names.\n",
    "kept_columns = ['id', 'actual', 'actual_latlon']\n",
    "distance_names = {\n",
    "    'prediction_3_latlon_distance': 'distance3',\n",
    "    'prediction_1_latlon_distance': 'distance1',\n",
    "    'prediction_2_latlon_distance': 'distance2'\n",
    "}\n",
    "\n",
    "simplified_predictions = (\n",
    "    simplified_predictions[kept_columns + list(distance_names)]\n",
    "    .rename(distance_names, axis='columns')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the columns of `grouped` to each row, matching the row's 'id' against the index of `grouped`.\n",
    "simplified_predictions = simplified_predictions.join(other=grouped, on='id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _component(position):\n",
    "    \"\"\"Build a mapper that returns element *position* of a coordinate pair, or None for NaN.\"\"\"\n",
    "    def pick(latlon):\n",
    "        # NaN is the only value that is not equal to itself.\n",
    "        return latlon[position] if latlon == latlon else None\n",
    "    return pick\n",
    "\n",
    "\n",
    "# Split the coordinate pair into two scalar columns and drop the pair itself.\n",
    "simplified_predictions['latitude'] = simplified_predictions['actual_latlon'].map(_component(0))\n",
    "simplified_predictions['longitude'] = simplified_predictions['actual_latlon'].map(_component(1))\n",
    "simplified_predictions = simplified_predictions.drop(columns='actual_latlon')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the ten-year age brackets into three coarse groups;\n",
    "# a bracket that is not listed maps to None.\n",
    "coarse_age_groups = {\n",
    "    '0-20': ['0-10', '11-20'],\n",
    "    '21-50': ['21-30', '31-40', '41-50'],\n",
    "    '51-100': ['51-60', '61-70', '71-80', '81-90', '91-100'],\n",
    "}\n",
    "bracket_to_group = {\n",
    "    bracket: group\n",
    "    for group, brackets in coarse_age_groups.items()\n",
    "    for bracket in brackets\n",
    "}\n",
    "\n",
    "simplified_predictions['age_groups'] = [\n",
    "    bracket_to_group.get(b) for b in simplified_predictions['age_bracket']\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({\n",
    "#     'distance1': ['mean', 'min', 'max', 'count', 'size'],\n",
    "#     'latitude': 'first',\n",
    "#     'longitude': 'first'\n",
    "# })\n",
    "# age_groups.index.get_level_values('age_groups')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({\n",
    "#     'distance1': ['min', 'mean', 'max', 'count', 'size'],\n",
    "#     'latitude': 'first',\n",
    "#     'longitude': 'first'\n",
    "# })\n",
    "# gender_groups"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the position, the three distances and the actual area name.\n",
    "summary_columns = ['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']\n",
    "summary = simplified_predictions[summary_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the summary table to points.csv.\n",
    "summary.to_csv(path_or_buf='points.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One GeoJSON point feature per actual area that has known coordinates.\n",
    "# Reads the columns as they exist at this point in the notebook ('distance1'..\n",
    "# 'distance3', 'latitude', 'longitude'); the former 'prediction_N_latlon_distance'\n",
    "# and 'actual_latlon' columns were renamed / dropped by earlier cells.\n",
    "geojson = {\n",
    "  'type': 'FeatureCollection',\n",
    "  'features': [\n",
    "    {\n",
    "      'type': 'Feature',\n",
    "      'properties': {\n",
    "        # Mean distance per area; -0.0001 marks an area where any distance is missing.\n",
    "        'distance 1': group['distance1'].mean() if group['distance1'].notnull().all() else -0.0001,\n",
    "        'distance 2': group['distance2'].mean() if group['distance2'].notnull().all() else -0.0001,\n",
    "        'distance 3': group['distance3'].mean() if group['distance3'].notnull().all() else -0.0001,\n",
    "        'actual': actual\n",
    "      },\n",
    "      'geometry': {\n",
    "        'type': 'Point',\n",
    "        # GeoJSON positions are ordered [longitude, latitude].\n",
    "        'coordinates': [float(group['longitude'].iloc[0]), float(group['latitude'].iloc[0])]\n",
    "      }\n",
    "    }\n",
    "    for actual, group in simplified_predictions.groupby('actual')\n",
    "    # Skip the empty area name and areas whose coordinates are unknown.\n",
    "    if actual != '' and pandas.notnull(group['latitude'].iloc[0])\n",
    "  ]\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One GeoJSON point feature per actual area that has known coordinates.\n",
    "# Reads the columns as they exist at this point in the notebook ('distance1'..\n",
    "# 'distance3', 'latitude', 'longitude'); the former 'prediction_N_latlon_distance'\n",
    "# and 'actual_latlon' columns were renamed / dropped by earlier cells.\n",
    "geojson = {\n",
    "  'type': 'FeatureCollection',\n",
    "  'features': [\n",
    "    {\n",
    "      'type': 'Feature',\n",
    "      'properties': {\n",
    "        # Mean distance per area; -0.0001 marks an area where any distance is missing.\n",
    "        'distance 1': group['distance1'].mean() if group['distance1'].notnull().all() else -0.0001,\n",
    "        'distance 2': group['distance2'].mean() if group['distance2'].notnull().all() else -0.0001,\n",
    "        'distance 3': group['distance3'].mean() if group['distance3'].notnull().all() else -0.0001,\n",
    "        'actual': actual\n",
    "      },\n",
    "      'geometry': {\n",
    "        'type': 'Point',\n",
    "        # GeoJSON positions are ordered [longitude, latitude].\n",
    "        'coordinates': [float(group['longitude'].iloc[0]), float(group['latitude'].iloc[0])]\n",
    "      }\n",
    "    }\n",
    "    for actual, group in simplified_predictions.groupby('actual')\n",
    "    # Skip the empty area name and areas whose coordinates are unknown.\n",
    "    if actual != '' and pandas.notnull(group['latitude'].iloc[0])\n",
    "  ]\n",
    "}"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
first commit 2018-09-28 10:35:17 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"# Pronunciation-based location prediction confusion\n",`
			`"\n",`
			`"Setup a pandas dataframe with in each row\n",`
			`"\n",`
			`" * participant provided (actual) location,\n",`
			`" * 3 estimations made by Nanna's heuristic based in what the participant stated to be the correct pronunciation of a word\n",`
			`" * distance between the actual and heuristic predicted location\n",`
			`" \n",`
			`"Averages of the distances are exported for visualisation in QGIS."`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import pickle\n",`
			`"import pandas\n",`
			`"import MySQLdb\n",`
			`"import numpy\n",`
			`"import itertools\n",`
			`"import requests\n",`
			`"import json\n",`
			`"from vincenty import vincenty\n",`
			`"\n",`
			`"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen')\n",`
			`"\n",`
			`"%matplotlib inline\n",`
			`"from matplotlib import pyplot, rcParams\n",`
			`"from jupyter_progressbar import ProgressBar\n",`
			`"\n",`
			`"# rcParams['font.family'] = 'Lucinda Console'\n",`
			`"rcParams['font.size'] = '24'\n",`
			`"rcParams['figure.figsize'] = (20, 10)\n",`
			`"rcParams['figure.dpi'] = 100"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"def simplify_area_name(x):\n",`
			`" return ' '.join(\n",`
			`" x.split('/') # split Dutch and Frysian name\n",`
			`" [0] # extract Dutch name\n",`
			`" .strip()\n",`
			`" .split(' ') # Split area name from province, mostly 'Fr'\n",`
			`" [:-1] # remove province\n",`
			`" ).strip().lower() # rejoin spaces in area name"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"metadata = pandas.read_sql('''SELECT answer.* FROM core_surveyresultquestionanswer as answer''', db)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"grouped = metadata.groupby(['survey_result_id', 'question_id']).agg({\n",`
			`" 'question_text': 'first',\n",`
			`" 'answer_text': lambda x: x if len(x) == 1 else ', '.join(x)\n",`
			`"})\n",`
			`"grouped.reset_index(inplace=True)\n",`
			`"\n",`
			`"grouped = grouped.pivot(index='survey_result_id', columns='question_text', values='answer_text')\n",`
			`"\n",`
			`"grouped = grouped.rename({\n",`
			`" 'Do you go to school?': 'school',\n",`
			`" 'Do you go to university?': 'university',\n",`
			`" 'What is your age bracket?': 'age_bracket',\n",`
			`" 'What is your age?': 'age',\n",`
			`" 'What is your gender?': 'gender',\n",`
			`" 'Which language are you the most proficient in?': 'language',\n",`
			`" 'Which languages do you actively use in your life?': 'active-languages'\n",`
			`"}, axis='columns')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"metadata": {`
			`"scrolled": false`
			`},`
			`"outputs": [],`
			`"source": [`
			`"predictions = pandas.read_sql('''\n",`
			`"SELECT \n",`
			`" sr.id as id,\n",`
			`" sr.area_name as actual_area,\n",`
			`" area1_name as area_prediction_1,\n",`
			`" area2_name as area_prediction_2,\n",`
			`" area3_name as area_prediction_3\n",`
			`"FROM core_surveyresult as sr\n",`
			`"INNER JOIN core_predictionquizresult as pq\n",`
			`" ON sr.id = pq.survey_result_id\n",`
			`"''', db)\n",`
			`"\n",`
			`"predicted_areas = set(map(simplify_area_name,\n",`
			`" set(predictions['area_prediction_1']) \|\n",`
			`" set(predictions['area_prediction_2']) \|\n",`
			`" set(predictions['area_prediction_3'])\n",`
			`"))\n",`
			`"actual_areas = set(map(str.lower, predictions['actual_area']))\n",`
			`"\n",`
			`"areas = list(predicted_areas \| actual_areas)\n",`
			`"location_to_number = {l: i for i, l in enumerate(areas)}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"metadata": {`
			`"scrolled": false`
			`},`
			`"outputs": [],`
			`"source": [`
			`"simplified_predictions = pandas.DataFrame({\n",`
			`" 'id': list(predictions['id']),\n",`
			`" 'actual': list(map(str.lower, predictions['actual_area'])),\n",`
			`" 'prediction_1': list(map(simplify_area_name, predictions['area_prediction_1'])),\n",`
			`" 'prediction_2': list(map(simplify_area_name, predictions['area_prediction_2'])),\n",`
			`" 'prediction_3': list(map(simplify_area_name, predictions['area_prediction_3'])),\n",`
			`"})\n",`
			`"# simplified_predictions.set_index('id')\n",`
			`"simplified_predictions.to_excel('actual-predictions.xls')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 8,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"locations = {location for c in simplified_predictions.columns for location in simplified_predictions[c] if c != 'id'}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 9,`
			`"metadata": {`
			`"scrolled": true`
			`},`
			`"outputs": [],`
			`"source": [`
			`"names = pandas.read_csv('plaatsen_nl.csv')\n",`
			`"\n",`
			`"nonominatim = {\n",`
			`" name: [row['st_y'], row['x']]\n",`
			`" for _, row in names.iterrows()\n",`
			`" for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']\n",`
			`"# for _ in [ print(row[column]) ]\n",`
			`" if type(row[column]) == str\n",`
			`" for name in [row[column], row[column].lower().replace('-', ' ')]\n",`
			`"}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 10,`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"application/vnd.jupyter.widget-view+json": {`
			`"model_id": "21c016f24e23473e807ed3e9c2d942c6",`
			`"version_major": 2,`
			`"version_minor": 0`
			`},`
			`"text/plain": [`
			`"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`}`
			`],`
			`"source": [`
			`"nominatim = {\n",`
			`" l: json.loads(\n",`
			`" requests.get(\n",`
			`" 'https://nominatim.openstreetmap.org/search.php?q=Netherlands%20'\n",`
			`" '{}&polygon_geojson=1&viewbox=&format=json'.format(l)\n",`
			`" ).text\n",`
			`" )\n",`
			`" for l in ProgressBar(locations)\n",`
			`" if l not in nonominatim\n",`
			`"}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 28,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"latlons = {\n",`
			`" l: (float(v[0]['lat']), float(v[0]['lon']))\n",`
			`" for l, v in nominatim.items()\n",`
			`" if len(v) > 0\n",`
			`"}\n",`
			`"latlons.update(nonominatim)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 29,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"for c in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:\n",`
			`" simplified_predictions['{}_latlon'.format(c)] = [\n",`
			`" latlons.get(l, numpy.nan)\n",`
			`" for l in simplified_predictions['{}'.format(c)]\n",`
			`" ]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 30,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"for c in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:\n",`
			`" simplified_predictions['{}_distance'.format(c)] = [\n",`
			`" vincenty(x, y) if x == x and y == y else numpy.nan\n",`
			`" for x, y in zip(simplified_predictions['actual_latlon'], simplified_predictions[c])\n",`
			`" ]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 31,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"simplified_predictions = simplified_predictions[[\n",`
			`" 'id', 'actual', 'actual_latlon', 'prediction_3_latlon_distance',\n",`
			`" 'prediction_1_latlon_distance', 'prediction_2_latlon_distance'\n",`
			`"]]\n",`
			`"\n",`
			`"simplified_predictions = simplified_predictions.rename({\n",`
			`" 'prediction_3_latlon_distance': 'distance3',\n",`
			`" 'prediction_1_latlon_distance': 'distance1',\n",`
			`" 'prediction_2_latlon_distance': 'distance2'\n",`
			`"}, axis='columns')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 32,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"simplified_predictions = simplified_predictions.join(grouped, on='id')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 33,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"simplified_predictions['latitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[0] if x == x else None)\n",`
			`"simplified_predictions['longitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[1] if x == x else None)\n",`
			`"simplified_predictions = simplified_predictions.drop('actual_latlon', axis='columns')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 34,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"simplified_predictions['age_groups'] = [\n",`
			`" {'0-10': '0-20', '11-20': '0-20',\n",`
			`" '21-30': '21-50', '31-40': '21-50', '41-50': '21-50',\n",`
			`" '51-60': '51-100', '61-70': '51-100', '71-80': '51-100', '81-90': '51-100', '91-100': '51-100'}.get(b, None)\n",`
			`" for b in simplified_predictions['age_bracket']\n",`
			`"]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 35,`
			`"metadata": {`
			`"scrolled": false`
			`},`
			`"outputs": [],`
			`"source": [`
			`"# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({\n",`
			`"# 'distance1': ['mean', 'min', 'max', 'count', 'size'],\n",`
			`"# 'latitude': 'first',\n",`
			`"# 'longitude': 'first'\n",`
			`"# })\n",`
			`"# age_groups.index.get_level_values('age_groups')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 36,`
			`"metadata": {`
			`"scrolled": false`
			`},`
			`"outputs": [],`
			`"source": [`
			`"# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({\n",`
			`"# 'distance1': ['min', 'mean', 'max', 'count', 'size'],\n",`
			`"# 'latitude': 'first',\n",`
			`"# 'longitude': 'first'\n",`
			`"# })\n",`
			`"# gender_groups"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 40,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"summary = simplified_predictions[['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']]"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 41,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"summary.to_csv('points.csv')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"geojson = {\n",`
			`" \"type\": \"FeatureCollection\",\n",`
			`" \"features\": [\n",`
			`" {\n",`
			`" \"type\": \"Feature\",\n",`
			`" \"properties\": {\n",`
			`" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",`
			`" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",`
			`" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",`
			`" \"actual\": actual\n",`
			`" },\n",`
			`" \"geometry\": {\n",`
			`" \"type\": \"Point\",\n",`
			`" \"coordinates\": list( actual_lat_lon )[::-1]\n",`
			`" }\n",`
			`" }\n",`
			`" for actual, row in simplified_predictions.groupby('actual')\n",`
			`" if actual != ''\n",`
			`"# for _ in [ print(row['actual_latlon']), print() ]\n",`
			`" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",`
			`" if actual_lat_lon == actual_lat_lon\n",`
			`" ]\n",`
			`"}"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 15,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"geojson = {\n",`
			`" \"type\": \"FeatureCollection\",\n",`
			`" \"features\": [\n",`
			`" {\n",`
			`" \"type\": \"Feature\",\n",`
			`" \"properties\": {\n",`
			`" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",`
			`" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",`
			`" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",`
			`" \"actual\": actual\n",`
			`" },\n",`
			`" \"geometry\": {\n",`
			`" \"type\": \"Point\",\n",`
			`" \"coordinates\": list( actual_lat_lon )[::-1]\n",`
			`" }\n",`
			`" }\n",`
			`" for actual, row in simplified_predictions.groupby('actual')\n",`
			`" if actual != ''\n",`
			`"# for _ in [ print(row['actual_latlon']), print() ]\n",`
			`" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",`
			`" if actual_lat_lon == actual_lat_lon\n",`
			`" ]\n",`
			`"}"`
			`]`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.6.5"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2`
			`}`