4.6 MiB
4.6 MiB
Geographical pronunciation statistics¶
In [13]:
import pandas import MySQLdb import numpy import json db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8') %matplotlib notebook from matplotlib import pyplot import folium from IPython.display import display from shapely.geometry import Polygon, MultiPolygon, shape, Point, box, mapping from jupyter_progressbar import ProgressBar from collections import defaultdict from ipy_table import make_table from html import escape import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import LogNorm from sklearn import mixture from skimage.measure import find_contours from collections import Counter from random import shuffle
In [3]:
# Borders of Frysian municipalities with open('Friesland_AL8.GeoJson') as f: gemeentes = json.load(f)
In [34]:
coords = [feature['geometry'] for feature in gemeentes['features']] coords_folium = [[[[c__[::-1] for c__ in c_] for c_ in c] for c in coords_['coordinates']] for coords_ in coords] shapes = [shape(coords_).simplify(tolerance=0.001) for coords_ in coords] gemeente_names = [feature['properties']['name'] for feature in gemeentes['features']] def get_gemeente(point): for i, shape in enumerate(shapes): if shape.contains(point): return i return -1
In [35]:
# Answers to how participants state a word should be pronounces. answers = pandas.read_sql(''' SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text FROM core_surveyresult as survey INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id INNER JOIN core_predictionquizresultquestionanswer as answer ON result.id = answer.prediction_quiz_id ''', db)
In [36]:
zero_latlng_questions = { q for q, row in answers.groupby('question_text').agg('std').iterrows() if row['user_lat'] == 0 and row['user_lng'] == 0 } answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]
In [37]:
# Takes approximately 2 minutes gemeente_map = { (lng, lat): get_gemeente(Point(lng, lat)) for lng, lat in set(zip(answers_filtered['user_lng'], answers_filtered['user_lat'])) } answers_filtered['gemeente'] = [ gemeente_map[(lng, lat)] for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng']) ]
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy # Remove the CWD from sys.path while we load stuff.
In [38]:
answers_filtered['question_text_url'] = answers_filtered['question_text'].map( lambda x: x.replace('"', '').replace('*', ''))
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [44]:
def reverse(rd_multipolygon): if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float): return rd_multipolygon[::-1] return [ reverse(element) for element in rd_multipolygon ]
In [46]:
def get_palette(n, no_black=True, no_white=True): with open('glasbey/{}_colors.txt'.format(n + no_black + no_white)) as f: return [ '#%02x%02x%02x' % tuple(int(c) for c in line.replace('\n', '').split(',')) for line in f if not no_black or line != '0,0,0\n' if not no_white or line != '255,255,255\n' ]
In [89]:
cdfs = [ np.array([ box(xmin, ymin, xmax_, ymax).intersection(shape).area / shape.area for xmax_ in numpy.linspace(xmin, xmax, 10001) ][1:]) for shape in ProgressBar(shapes) for xmin, ymin, xmax, ymax in [shape.bounds] ]
VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…
In [106]:
cutoff_percentage = 0.05 for question, rows in ProgressBar( answers_filtered.groupby('question_text_url'), size=len(answers_filtered['question_text_url'].unique()) ): m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles='stamentoner', zoom_start=10) answer_texts = { answer_text for gemeente, _ in enumerate(gemeente_names) for rows_ in [rows[rows['gemeente'] == gemeente]] for answer_text, rows__ in rows_.groupby('answer_text') if len(rows__) / len(rows_) >= cutoff_percentage } palette = dict(zip(answer_texts, get_palette(len(answer_texts)))) n_other = len(rows) - sum(sum(rows['answer_text'] == answer_text) for answer_text in answer_texts) groups = { answer_text: folium.FeatureGroup(name=name, overlay=True) for answer_text, color in palette.items() for name in [ '<span style=\\"color:{}; \\">{} ({})'.format( color, escape(answer_text), sum(rows['answer_text'] == answer_text) ) ] } groups['other'] = folium.FeatureGroup( name='<span style=\\"color: black; \\">{} ({})'.format( escape('other'),n_other), overlay=True ) for gemeente, gemeente_name in enumerate(gemeente_names): rows_ = rows[rows['gemeente'] == gemeente] order = [a for _, a in sorted(( (r['user_lat'], answer) for answer, r in rows_.groupby('answer_text').count().iterrows() ), reverse=True)] gemeente_shape = shapes[gemeente] xmin, ymin, xmax, ymax = gemeente_shape.bounds xmin_cum = xmin # print(sum(sum(rows_['answer_text'] == answer_text) for answer_text in order)) # print(len(rows_)) cum_percentage = 0 for i, answer_text in enumerate(order): total = sum(rows_['answer_text'] == answer_text) percentage = total / len(rows_) cum_percentage += percentage if i == 0: max_percentage = percentage name = '{} ({}, {}%)'.format(answer_text, total, int(round(100*percentage))) if percentage < cutoff_percentage: xmax_ = xmax color = '#ffffff' answer_text = 'other' total = sum(sum(rows_['answer_text'] == answer_text) for answer_text in order[i:]) percentage = total / len(rows_) done = True else: percentage_corrected = numpy.abs(cdfs[gemeente] - cum_percentage).argmin() / 10000 xmax_ = xmin + percentage_corrected * (xmax - xmin) color = palette[answer_text] done = False answer_shape = gemeente_shape.intersection(box(xmin_cum, ymin, xmax_, ymax)) xmin_cum = xmax_ # color = '#%02x%02x%02x' % tuple(int(255 * c) for c in cmap(percentage / max_percentage)[:3]) polygon = folium.Polygon( reverse(mapping(answer_shape)['coordinates']), fill_color=color, fill_opacity=0.8, color=None, popup='{} ({}, {: 3d}%)'.format(answer_text, total, round(100*percentage)) ) polygon.add_to(groups[answer_text]) if done: break polygon = folium.Polygon( reverse(mapping(gemeente_shape)['coordinates']), fill_color=None, color='#000000' ) polygon.add_to(m) for _, group in sorted(groups.items(), key=lambda x: sum(rows['answer_text'] == x[0]), reverse=True): group.add_to(m) folium.map.LayerControl('topright', collapsed=False).add_to(m) display(m) m.save('maps/heatmaps-combined/{}.html'.format(question)) # break
VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…
In [109]:
import glob with open('maps/heatmaps-combined/index.html', 'w') as f: f.write('<html><head></head><body>' + '<br/>\n'.join( '\t<a href="{}">{}<a>'.format(fn, fn[:-5].replace('_', ' ')) for fn in sorted( glob.glob('maps/heatmaps-combined/*.html') ) for fn in [fn[len('maps/heatmaps-combined/'):]] ) + "</body></html>")
In [ ]: