stimmenfryslan/notebooks/Pronunciations Table per Wi...

12 KiB

Geographical pronunciation statistics

In [1]:
import pandas
import MySQLdb
import numpy
import json

db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')

%matplotlib notebook
from matplotlib import pyplot
import folium
from IPython.display import display
from shapely.geometry import Polygon, MultiPolygon, shape, Point
from jsbutton import JsButton
from shapely.geometry import LineString, MultiLineString
from jupyter_progressbar import ProgressBar
from collections import defaultdict, Counter
from ipy_table import make_table
from html import escape

import numpy as np
from random import shuffle
import pickle
from jupyter_progressbar import ProgressBar
In [2]:
with open('friesland_wijken_land_only.p3', 'rb') as f:
    wijken, wijk_shapes = pickle.load(f)

for x in wijken['features']:
    x['type'] = 'Feature'

with open('friesland_wijken_geojson.json', 'w') as f:
    wijken['features'] = wijken['features']
    json.dump(wijken, f, indent=1)
In [3]:
from osgeo import gdal, ogr

srcDS = gdal.OpenEx('friesland_wijken_geojson.json')
ds = gdal.VectorTranslate('friesland_wijken_geojson.kml', srcDS, format='kml')
In [4]:
''.join({
    c
    for wijk in wijken['features']
    for c in wijk['properties']['gemeente_en_wijk_naam']
})
Out[4]:
'k4luâ7mWBAgDSKhCVaysNdr TjeoE85JzëGúcM.,IRtp2-bLû69Un0wZF3Hv1iOfô'
In [5]:
with open('friesland_wijken_land_only.p3', 'rb') as f:
    wijken, wijk_shapes = pickle.load(f)

wijk_names = [wijk['properties']['gemeente_en_wijk_naam'] for wijk in wijken['features']]

def get_wijk(point):
    for i, shape in enumerate(wijk_shapes):
        if shape.contains(point):
            return i
    return -1
In [6]:
def listify(rd_multipolygon):
    if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):
        return list(rd_multipolygon)
    return [
        listify(element)
        for element in rd_multipolygon
    ]
In [7]:
# Answers to how participants state a word should be pronounces.

answers = pandas.read_sql('''
SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text
FROM       core_surveyresult as survey
INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id
INNER JOIN core_predictionquizresultquestionanswer as answer
    ON result.id = answer.prediction_quiz_id
''', db)
In [8]:
zero_latlng_questions = {
    q
    for q, row in answers.groupby('question_text').agg('std').iterrows()
    if row['user_lat'] == 0 and row['user_lng'] == 0
}
answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]
In [9]:
def reverse(rd_multipolygon):
    if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):
        return rd_multipolygon[::-1]
    return [
        reverse(element)
        for element in rd_multipolygon
    ]
In [10]:
# Takes approximately 2 minutes
points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))

wijk_map = dict()
for lng, lat in points:
    wijk_map[(lng, lat)] = get_wijk(Point(lng, lat))

answers_filtered['wijk'] = [
    wijk_map[(lng, lat)]
    for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])
]
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
In [11]:
answers_filtered['question_text_url'] = answers_filtered['question_text'].map(
    lambda x: x.replace('"', '').replace('*', ''))

answers_filtered['wijk_name'] = answers_filtered['wijk'].map(
    lambda x: wijk_names[x])

answers_filtered['answer_text_url'] = answers_filtered['answer_text'].map(
    lambda x: x[x.find('('):x.find(')')][1:])
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
In [12]:
wijken = pandas.DataFrame([
    {'#name': name, 'longitude': shape.centroid.xy[0][0], 'latitude': shape.centroid.xy[1][0]}
    for name, shape in zip(wijk_names, wijk_shapes)
])

wijken.set_index('#name', inplace=True)
In [23]:
def merge_dicts(*args):
    for arg in args[1:]:
        args[0].update(arg)
    return args[0]


pronunciations = pandas.DataFrame([
    merge_dicts(
    {
        question: answers['answer_text_url']
        for question, answers in rows.groupby(
            'question_text_url'
        ).agg(
            {
                'answer_text_url': lambda x: [
                    {
                        'pronunciation': answer_text,
                        'count': answer_texts.count(answer_text)
                    }
                    for answer_texts in [list(x)]
                    for answer_text in sorted(set(x))
                    
                ]    
            }
        ).iterrows()
    }, {
       'wijk': wijk_names[wijk]
    })
    for wijk, rows in answers_filtered.groupby('wijk')
    if wijk >= 0
])

pronunciations.set_index('wijk', inplace=True)
pronunciations

columns = list(pronunciations.columns)

counts = pandas.DataFrame([
    merge_dicts({
        column + ": " + x['pronunciation']: 100 * x['count'] / total
        for column in columns
        for total in [sum(x['count'] for x in row[column])]
        for x in row[column]
    }, {'': wijk})
    for wijk, row in pronunciations.iterrows()
])

pronunciations = pandas.DataFrame([
    merge_dicts({
        column: ' / '.join(str(x['pronunciation']) for x in row[column])
        for column in columns
    }, {'': wijk})
    for wijk, row in pronunciations.iterrows()
])

pronunciations.set_index('', inplace=True)
counts.set_index('', inplace=True)
counts[counts != counts] = 0
In [29]:
shape
Out[29]:
<function shapely.geometry.geo.shape(context)>
In [26]:
pronunciations.to_csv('pronunciations_by_wijk.tsv', sep='\t')
counts.to_csv('pronunciation_percentages_by_wijk.tsv', sep='\t')
wijken.to_csv('wijk_centroid.tsv', sep='\t', columns=['longitude', 'latitude'])
In [27]:
with open('pronunciations_by_wijk.tsv') as f:
    p = list(f)
    
with open('pronunciation_count_by_wijk.tsv') as f:
    c = list(f)

with open('wijk_centroid.tsv') as f:
    w = list(f)