stimmenfryslan/notebooks/Group recordings in 4 Frysi...

7.5 KiB

Group recordings in 4 Frisian dialect regions

  • Klaaifrysk
  • Waldfrysk
  • Sudwesthoeksk
  • Noardhoeksk

First run Dialect Regions from image.ipynb.

dialect regions

In [2]:
import json
import os
from collections import Counter
from getpass import getpass
from math import floor
from math import sqrt

import MySQLdb
import numpy as np
import pandas
from jupyter_progressbar import ProgressBar
from shapely.geometry import shape, Point
from vincenty import vincenty

# Connect to the MySQL database holding the survey results.
# The password is deliberately not stored in the notebook: it is read from the
# STIMMEN_DB_PASSWORD environment variable, and when that is unset or empty the
# user is prompted for it. The user name and database name can be overridden
# via STIMMEN_DB_USER / STIMMEN_DB_NAME and default to the original values.
db = MySQLdb.connect(
    user=os.environ.get('STIMMEN_DB_USER', 'root'),
    passwd=os.environ.get('STIMMEN_DB_PASSWORD') or getpass('MySQL password: '),
    db=os.environ.get('STIMMEN_DB_NAME', 'stimmen'),
    charset='utf8',
)

Input

Load the GeoJSON file with the dialect regions and create a shapely shape for each region.

In [3]:
# Parse the GeoJSON file that describes the dialect regions.
with open('../data/fryslan_dialect_regions.geojson', 'r') as geojson_file:
    geojson = json.load(geojson_file)

# Dialect name of every feature, in the order the features appear in the file.
dialect_regions = [
    feature['properties']['dialect']
    for feature in geojson['features']
]
In [4]:
# Map each dialect name to the shapely geometry built from its GeoJSON feature.
shapes = dict(
    (feature['properties']['dialect'], shape(feature['geometry']))
    for feature in geojson['features']
)

def regions_for(coordinate):
    """Return the names of all dialect regions that contain a coordinate.

    Parameters
    ----------
    coordinate : iterable
        Values unpacked into ``Point(*coordinate)``. The caller in this
        notebook passes ``(longitude, latitude)``.

    Returns
    -------
    set
        The keys of ``shapes`` whose geometry ``contains`` the point. The set
        is empty when no region contains the point.
    """
    # Build the point once rather than once per region.
    point = Point(*coordinate)
    # `region_shape` (not `shape`) avoids shadowing the shapely `shape` factory.
    return {
        region_name
        for region_name, region_shape in shapes.items()
        if region_shape.contains(point)
    }

def distance_to_shape(shape, longitude, latitude):
    """Distance from a location to the closest point on a shape's exterior.

    Parameters
    ----------
    shape
        Geometry exposing an ``exterior`` attribute; within this function the
        name hides the imported shapely ``shape`` factory.
    longitude, latitude
        Location of interest; passed to ``Point`` as ``(longitude, latitude)``.

    Returns
    -------
    The result of ``vincenty`` between the location and the closest exterior
    point (presumably kilometres, vincenty's default unit -- TODO confirm).

    NOTE(review): the closest point is found with ``project``/``interpolate``
    on raw longitude/latitude values, so it is presumably chosen in planar
    coordinate space rather than by geodesic distance -- confirm.
    """
    outline = shape.exterior
    location = Point(longitude, latitude)
    # Position of the closest point along the outline, then that point itself.
    offset_along_outline = outline.project(location)
    nearest = outline.interpolate(offset_along_outline)
    # vincenty is called with (latitude, longitude) pairs, i.e. (y, x).
    return vincenty((latitude, longitude), (nearest.y, nearest.x))

Query and process

Query all picture game and free speech recordings and assign each recording to a dialect region.

In [5]:
def dialect_regions_and_distance(data):
    """Find the dialect regions for every row and the distance to their border.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have ``latitude`` and ``longitude`` columns. The index value of
        each row is reported as ``filename`` (the callers in this notebook
        index their frames by filename).

    Returns
    -------
    list of dict
        One dict per row, with keys ``'dialects'`` and ``'filename'``.
        ``'dialects'`` is a list with one
        ``{'dialect': ..., 'boundary_distance': ...}`` entry for each region
        returned by ``regions_for``; it is empty when no region matches.
    """
    coordinates = data[['latitude', 'longitude']]
    # Wrap the row iterator so progress is displayed while rows are processed.
    rows = ProgressBar(coordinates.iterrows(), size=len(data))

    results = []
    for filename, (latitude, longitude) in rows:
        matches = []
        # The shapes are addressed as (longitude, latitude), i.e. (x, y).
        for dialect in regions_for((longitude, latitude)):
            matches.append({
                'dialect': dialect,
                'boundary_distance': distance_to_shape(shapes[dialect], longitude, latitude),
            })
        results.append({
            'dialects': matches,
            'filename': filename,
        })
    return results
In [6]:
# Fetch every picture game recording together with its language, picture name
# and the survey's location fields, then index the frame by recording filename.
picture_games = pandas.read_sql(
    sql='''
SELECT language.name as language, item.name as picture,
       survey.user_lat as latitude, survey.user_lng as longitude,
       survey.area_name as area, survey.country_name as country,
       result.recording as filename,
       result.submitted_at as date
FROM       core_surveyresult as survey
INNER JOIN core_picturegameresult as result ON survey.id = result.survey_result_id
INNER JOIN core_language as language ON language.id = result.language_id
INNER JOIN core_picturegameitem as item
    ON result.picture_game_item_id = item.id
''',
    con=db,
).set_index('filename')
In [7]:
# One entry per picture game recording: its filename and the dialect regions
# (with boundary distance) that contain the recording's coordinates.
dialect_region_per_picture_game = dialect_regions_and_distance(data=picture_games)
VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…
In [8]:
# Keep only the recordings that fall in exactly one dialect region and flatten
# them to (filename, dialect, boundary_distance) rows.
df = pandas.DataFrame(
    [
        [record['filename'], match['dialect'], match['boundary_distance']]
        for record in dialect_region_per_picture_game
        if len(record['dialects']) == 1
        for match in record['dialects']
    ],
    columns=['filename', 'dialect', 'boundary_distance'],
)

# Export the table in both Excel and CSV form.
df.to_excel('../data/picture_game_recordings_by_dialect.xlsx')
df.to_csv('../data/picture_game_recordings_by_dialect.csv')
In [9]:
# Fetch every free speech recording together with its language and the
# survey's location fields, then index the frame by recording filename.
free_speech_games = pandas.read_sql(
    sql='''
SELECT language.name as language,
       survey.user_lat as latitude, survey.user_lng as longitude,
       survey.area_name as area, survey.country_name as country,
       result.recording as filename,
       result.submitted_at as date
FROM       core_surveyresult as survey
INNER JOIN core_freespeechresult as result ON survey.id = result.survey_result_id
INNER JOIN core_language as language ON language.id = result.language_id
''',
    con=db,
).set_index('filename')
In [10]:
# One entry per free speech recording: its filename and the dialect regions
# (with boundary distance) that contain the recording's coordinates.
dialect_region_per_free_speech = dialect_regions_and_distance(data=free_speech_games)
VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…
In [11]:
# Keep only the recordings that fall in exactly one dialect region and flatten
# them to (filename, dialect, boundary_distance) rows.
df = pandas.DataFrame(
    [
        [record['filename'], match['dialect'], match['boundary_distance']]
        for record in dialect_region_per_free_speech
        if len(record['dialects']) == 1
        for match in record['dialects']
    ],
    columns=['filename', 'dialect', 'boundary_distance'],
)

# Export the table in both Excel and CSV form.
df.to_excel('../data/free_speech_recordings_by_dialect.xlsx')
df.to_csv('../data/free_speech_recordings_by_dialect.csv')