12 KiB
12 KiB
Geographical pronunciation statistics¶
In [1]:
import pandas import MySQLdb import numpy import json db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8') %matplotlib notebook from matplotlib import pyplot import folium from IPython.display import display from shapely.geometry import Polygon, MultiPolygon, shape, Point from jsbutton import JsButton from shapely.geometry import LineString, MultiLineString from jupyter_progressbar import ProgressBar from collections import defaultdict, Counter from ipy_table import make_table from html import escape import numpy as np from random import shuffle import pickle from jupyter_progressbar import ProgressBar
In [2]:
with open('friesland_wijken_land_only.p3', 'rb') as f: wijken, wijk_shapes = pickle.load(f) for x in wijken['features']: x['type'] = 'Feature' with open('friesland_wijken_geojson.json', 'w') as f: wijken['features'] = wijken['features'] json.dump(wijken, f, indent=1)
In [3]:
from osgeo import gdal, ogr srcDS = gdal.OpenEx('friesland_wijken_geojson.json') ds = gdal.VectorTranslate('friesland_wijken_geojson.kml', srcDS, format='kml')
In [4]:
''.join({ c for wijk in wijken['features'] for c in wijk['properties']['gemeente_en_wijk_naam'] })
Out[4]:
'k4luâ7mWBAgDSKhCVaysNdr TjeoE85JzëGúcM.,IRtp2-bLû69Un0wZF3Hv1iOfô'
In [5]:
with open('friesland_wijken_land_only.p3', 'rb') as f: wijken, wijk_shapes = pickle.load(f) wijk_names = [wijk['properties']['gemeente_en_wijk_naam'] for wijk in wijken['features']] def get_wijk(point): for i, shape in enumerate(wijk_shapes): if shape.contains(point): return i return -1
In [6]:
def listify(rd_multipolygon): if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float): return list(rd_multipolygon) return [ listify(element) for element in rd_multipolygon ]
In [7]:
# Answers to how participants state a word should be pronounces. answers = pandas.read_sql(''' SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text FROM core_surveyresult as survey INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id INNER JOIN core_predictionquizresultquestionanswer as answer ON result.id = answer.prediction_quiz_id ''', db)
In [8]:
zero_latlng_questions = { q for q, row in answers.groupby('question_text').agg('std').iterrows() if row['user_lat'] == 0 and row['user_lng'] == 0 } answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]
In [9]:
def reverse(rd_multipolygon): if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float): return rd_multipolygon[::-1] return [ reverse(element) for element in rd_multipolygon ]
In [10]:
# Takes approximately 2 minutes points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat'])) wijk_map = dict() for lng, lat in points: wijk_map[(lng, lat)] = get_wijk(Point(lng, lat)) answers_filtered['wijk'] = [ wijk_map[(lng, lat)] for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng']) ]
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy # Remove the CWD from sys.path while we load stuff.
In [11]:
answers_filtered['question_text_url'] = answers_filtered['question_text'].map( lambda x: x.replace('"', '').replace('*', '')) answers_filtered['wijk_name'] = answers_filtered['wijk'].map( lambda x: wijk_names[x]) answers_filtered['answer_text_url'] = answers_filtered['answer_text'].map( lambda x: x[x.find('('):x.find(')')][1:])
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy /home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """ /home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [12]:
wijken = pandas.DataFrame([ {'#name': name, 'longitude': shape.centroid.xy[0][0], 'latitude': shape.centroid.xy[1][0]} for name, shape in zip(wijk_names, wijk_shapes) ]) wijken.set_index('#name', inplace=True)
In [23]:
def merge_dicts(*args): for arg in args[1:]: args[0].update(arg) return args[0] pronunciations = pandas.DataFrame([ merge_dicts( { question: answers['answer_text_url'] for question, answers in rows.groupby( 'question_text_url' ).agg( { 'answer_text_url': lambda x: [ { 'pronunciation': answer_text, 'count': answer_texts.count(answer_text) } for answer_texts in [list(x)] for answer_text in sorted(set(x)) ] } ).iterrows() }, { 'wijk': wijk_names[wijk] }) for wijk, rows in answers_filtered.groupby('wijk') if wijk >= 0 ]) pronunciations.set_index('wijk', inplace=True) pronunciations columns = list(pronunciations.columns) counts = pandas.DataFrame([ merge_dicts({ column + ": " + x['pronunciation']: 100 * x['count'] / total for column in columns for total in [sum(x['count'] for x in row[column])] for x in row[column] }, {'': wijk}) for wijk, row in pronunciations.iterrows() ]) pronunciations = pandas.DataFrame([ merge_dicts({ column: ' / '.join(str(x['pronunciation']) for x in row[column]) for column in columns }, {'': wijk}) for wijk, row in pronunciations.iterrows() ]) pronunciations.set_index('', inplace=True) counts.set_index('', inplace=True) counts[counts != counts] = 0
In [29]:
shape
Out[29]:
<function shapely.geometry.geo.shape(context)>
In [26]:
pronunciations.to_csv('pronunciations_by_wijk.tsv', sep='\t') counts.to_csv('pronunciation_percentages_by_wijk.tsv', sep='\t') wijken.to_csv('wijk_centroid.tsv', sep='\t', columns=['longitude', 'latitude'])
In [27]:
with open('pronunciations_by_wijk.tsv') as f: p = list(f) with open('pronunciation_count_by_wijk.tsv') as f: c = list(f) with open('wijk_centroid.tsv') as f: w = list(f)