266 lines
7.4 KiB
Plaintext
266 lines
7.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Group recordings in 4 Frisian dialect regions\n",
|
|
"\n",
|
|
" * Klaaifrysk\n",
|
|
" * Waldfrysk\n",
|
|
" * Sudwesthoeksk\n",
|
|
" * Noardhoeksk\n",
|
|
" \n",
|
|
"First run `Dialect Regions from image.ipynb`.\n",
|
|
"\n",
|
|
"![dialect regions](../data/dialects.png)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from math import floor\n",
|
|
"import json\n",
|
|
"import pandas\n",
|
|
"import MySQLdb\n",
|
|
"from collections import Counter\n",
|
|
"\n",
|
|
"from math import sqrt\n",
|
|
"import numpy as np\n",
|
|
"from shapely.geometry import shape, Point\n",
|
|
"from vincenty import vincenty\n",
|
|
"\n",
|
|
"from jupyter_progressbar import ProgressBar\n",
|
|
"\n",
|
|
"import os\n",
"\n",
"# Read the database credentials from the environment instead of hard-coding\n",
"# them: a password stored in the notebook source is exposed to every reader.\n",
"# STIMMEN_DB_PASSWORD is required; STIMMEN_DB_USER falls back to 'root'.\n",
"db = MySQLdb.connect(\n",
"    user=os.environ.get('STIMMEN_DB_USER', 'root'),\n",
"    passwd=os.environ['STIMMEN_DB_PASSWORD'],\n",
"    db='stimmen',\n",
"    charset='utf8',\n",
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Input\n",
|
|
"\n",
|
|
"Load the GeoJSON with the dialect regions and create shapely shapes."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Parse the GeoJSON feature collection that describes the dialect regions.\n",
"with open('dialect_regions.geojson', 'r') as f:\n",
"    geojson = json.load(f)\n",
"\n",
"# Dialect name of every feature, in the order the features appear in the file.\n",
"dialect_regions = [\n",
"    feature['properties']['dialect']\n",
"    for feature in geojson['features']\n",
"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Map each dialect name to the shapely geometry built from its GeoJSON feature.\n",
"shapes = dict(\n",
"    (feature['properties']['dialect'], shape(feature['geometry']))\n",
"    for feature in geojson['features']\n",
")\n",
|
|
"\n",
|
|
"def regions_for(coordinate):\n",
"    \"\"\"Return the names of all dialect regions that contain a coordinate.\n",
"\n",
"    `coordinate` is unpacked into shapely's Point(...); the caller in this\n",
"    notebook passes (longitude, latitude). The result is a set of keys of the\n",
"    module-level `shapes` dict and is empty when no region contains the point.\n",
"    \"\"\"\n",
"    # Build the point once instead of once per region, and avoid reusing the\n",
"    # name `shape`, which is the function imported from shapely.geometry.\n",
"    point = Point(*coordinate)\n",
"    return {\n",
"        region_name\n",
"        for region_name, region_shape in shapes.items()\n",
"        if region_shape.contains(point)\n",
"    }\n",
|
|
"\n",
|
|
"def distance_to_shape(shape, longitude, latitude):\n",
"    \"\"\"Return the distance from a coordinate to the outline of a region.\n",
"\n",
"    The nearest point on the outline is found by projecting the coordinate\n",
"    onto it in plain longitude/latitude space; the result is whatever\n",
"    vincenty() returns for the two (latitude, longitude) pairs\n",
"    (kilometres by default, per the vincenty package -- TODO confirm).\n",
"    \"\"\"\n",
"    # Polygons expose their outer ring as `.exterior`; multi-part geometries\n",
"    # (e.g. a region that includes islands) have no such attribute, so fall\n",
"    # back to `.boundary`, which supports the same project/interpolate calls.\n",
"    outline = getattr(shape, 'exterior', None)\n",
"    if outline is None:\n",
"        outline = shape.boundary\n",
"    location = Point(longitude, latitude)\n",
"    nearest = outline.interpolate(outline.project(location))\n",
"    return vincenty((latitude, longitude), (nearest.y, nearest.x))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Query and process\n",
|
|
"\n",
|
|
"Query all picture game and free speech recordings and assign each one its dialect region. Only recordings that fall inside exactly one region are exported."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def dialect_regions_and_distance(data):\n",
"    \"\"\"Look up the dialect region(s) of every row in `data`.\n",
"\n",
"    `data` needs 'latitude' and 'longitude' columns; the index value of each\n",
"    row is reported back as 'filename'. Returns a list with one dict per row,\n",
"    holding 'dialects' (a list of {'dialect', 'boundary_distance'} dicts, one\n",
"    per region that contains the point) and 'filename'.\n",
"    \"\"\"\n",
"    rows = ProgressBar(\n",
"        data[['latitude', 'longitude']].iterrows(),\n",
"        size=len(data)\n",
"    )\n",
"    results = []\n",
"    for filename, (latitude, longitude) in rows:\n",
"        # Regions are looked up with the point given as (longitude, latitude).\n",
"        matches = []\n",
"        for dialect in regions_for((longitude, latitude)):\n",
"            matches.append({\n",
"                'dialect': dialect,\n",
"                'boundary_distance': distance_to_shape(shapes[dialect], longitude, latitude),\n",
"            })\n",
"        results.append({\n",
"            'dialects': matches,\n",
"            'filename': filename,\n",
"        })\n",
"    return results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Every picture-game recording joined with its survey location, language and\n",
"# picture name; the recording filename becomes the index of the frame.\n",
"picture_games = pandas.read_sql('''\n",
"SELECT language.name as language, item.name as picture,\n",
" survey.user_lat as latitude, survey.user_lng as longitude,\n",
" survey.area_name as area, survey.country_name as country,\n",
" result.recording as filename,\n",
" result.submitted_at as date\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_picturegameresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_language as language ON language.id = result.language_id\n",
"INNER JOIN core_picturegameitem as item\n",
" ON result.picture_game_item_id = item.id\n",
"''', db).set_index('filename')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "67ed3190256b447c81daf3df1f189318",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Look up the dialect region(s) and boundary distance for every picture-game recording.\n",
"dialect_region_per_picture_game = dialect_regions_and_distance(picture_games)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# One row per recording that lies in exactly one dialect region; recordings\n",
"# matched to zero or several regions are left out of the export.\n",
"df = pandas.DataFrame(\n",
"    [\n",
"        (\n",
"            record['filename'],\n",
"            record['dialects'][0]['dialect'],\n",
"            record['dialects'][0]['boundary_distance'],\n",
"        )\n",
"        for record in dialect_region_per_picture_game\n",
"        if len(record['dialects']) == 1\n",
"    ],\n",
"    columns=['filename', 'dialect', 'boundary_distance'],\n",
")\n",
"\n",
"# Write the same table as a spreadsheet and as CSV.\n",
"df.to_excel('../data/picture_game_recordings_by_dialect.xlsx')\n",
"df.to_csv('../data/picture_game_recordings_by_dialect.csv')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Every free-speech recording joined with its survey location and language;\n",
"# the recording filename becomes the index of the frame.\n",
"free_speech_games = pandas.read_sql('''\n",
"SELECT language.name as language,\n",
" survey.user_lat as latitude, survey.user_lng as longitude,\n",
" survey.area_name as area, survey.country_name as country,\n",
" result.recording as filename,\n",
" result.submitted_at as date\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_freespeechresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_language as language ON language.id = result.language_id\n",
"''', db).set_index('filename')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "201b0aed64e8494db603de15b560d919",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Look up the dialect region(s) and boundary distance for every free-speech recording.\n",
"dialect_region_per_free_speech = dialect_regions_and_distance(free_speech_games)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# One row per recording that lies in exactly one dialect region; recordings\n",
"# matched to zero or several regions are left out of the export.\n",
"df = pandas.DataFrame(\n",
"    [\n",
"        (\n",
"            record['filename'],\n",
"            record['dialects'][0]['dialect'],\n",
"            record['dialects'][0]['boundary_distance'],\n",
"        )\n",
"        for record in dialect_region_per_free_speech\n",
"        if len(record['dialects']) == 1\n",
"    ],\n",
"    columns=['filename', 'dialect', 'boundary_distance'],\n",
")\n",
"\n",
"# Write the same table as a spreadsheet and as CSV.\n",
"df.to_excel('../data/free_speech_recordings_by_dialect.xlsx')\n",
"df.to_csv('../data/free_speech_recordings_by_dialect.csv')"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 1
|
|
}
|