cleanup notebooks Frysian dialect regions
This commit is contained in:
265
notebooks/Group recordings in 4 Frysian dialect regions.ipynb
Normal file
265
notebooks/Group recordings in 4 Frysian dialect regions.ipynb
Normal file
@@ -0,0 +1,265 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Group recordings in 4 Frysian dialect regions\n",
|
||||
"\n",
|
||||
" * Klaaifrysk\n",
|
||||
" * Waldfrysk\n",
|
||||
" * Sudwesthoeksk\n",
|
||||
" * Noardhoeksk\n",
|
||||
" \n",
|
||||
"First run `Dialect Regions from image.ipynb`.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from math import floor\n",
|
||||
"import json\n",
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"from math import sqrt\n",
|
||||
"import numpy as np\n",
|
||||
"from shapely.geometry import shape, Point\n",
|
||||
"from vincenty import vincenty\n",
|
||||
"\n",
|
||||
"from jupyter_progressbar import ProgressBar\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Input\n",
|
||||
"\n",
|
||||
"Load the geojson with the dialect region and create shapely shapes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('dialect_regions.geojson', 'r') as f:\n",
|
||||
" geojson = json.load(f)\n",
|
||||
"\n",
|
||||
"dialect_regions = [region['properties']['dialect'] for region in geojson['features']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"shapes = {\n",
|
||||
" feature['properties']['dialect']: shape(feature['geometry'])\n",
|
||||
" for feature in geojson['features']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"def regions_for(coordinate):\n",
|
||||
" regions = {\n",
|
||||
" region_name\n",
|
||||
" for region_name, shape in shapes.items()\n",
|
||||
" if shape.contains(Point(*coordinate))\n",
|
||||
" }\n",
|
||||
" return regions\n",
|
||||
"\n",
|
||||
"def distance_to_shape(shape, longitude, latitude):\n",
|
||||
" ext = shape.exterior\n",
|
||||
" p = ext.interpolate(ext.project(Point(longitude, latitude)))\n",
|
||||
" return vincenty((latitude, longitude), (p.y, p.x))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Query and process\n",
|
||||
"\n",
|
||||
"Query all picture game and free speech recordings and assign the dialect region."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def dialect_regions_and_distance(data):\n",
|
||||
" return[\n",
|
||||
" {\n",
|
||||
" 'dialects': [\n",
|
||||
" {\n",
|
||||
" 'dialect': dialect,\n",
|
||||
" 'boundary_distance': distance_to_shape(shapes[dialect], longitude, latitude),\n",
|
||||
" }\n",
|
||||
" for dialect in regions_for((longitude, latitude))\n",
|
||||
" ],\n",
|
||||
" 'filename': filename,\n",
|
||||
" }\n",
|
||||
" for filename, (latitude, longitude) in ProgressBar(\n",
|
||||
" data[['latitude', 'longitude']].iterrows(),\n",
|
||||
" size=len(data)\n",
|
||||
" )\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"picture_games = pandas.read_sql('''\n",
|
||||
"SELECT language.name as language, item.name as picture,\n",
|
||||
" survey.user_lat as latitude, survey.user_lng as longitude,\n",
|
||||
" survey.area_name as area, survey.country_name as country,\n",
|
||||
" result.recording as filename,\n",
|
||||
" result.submitted_at as date\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_picturegameresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_language as language ON language.id = result.language_id\n",
|
||||
"INNER JOIN core_picturegameitem as item\n",
|
||||
" ON result.picture_game_item_id = item.id\n",
|
||||
"''', db)\n",
|
||||
"picture_games.set_index('filename', inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "67ed3190256b447c81daf3df1f189318",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dialect_region_per_picture_game = dialect_regions_and_distance(picture_games)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pandas.DataFrame([\n",
|
||||
" [r['filename'], r['dialects'][0]['dialect'], r['dialects'][0]['boundary_distance']]\n",
|
||||
" for r in dialect_region_per_picture_game\n",
|
||||
" if len(r['dialects']) == 1\n",
|
||||
"], columns = ['filename', 'dialect', 'boundary_distance'])\n",
|
||||
"\n",
|
||||
"df.to_excel('../data/picture_game_recordings_by_dialect.xlsx')\n",
|
||||
"df.to_csv('../data/picture_game_recordings_by_dialect.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"free_speech_games = pandas.read_sql('''\n",
|
||||
"SELECT language.name as language,\n",
|
||||
" survey.user_lat as latitude, survey.user_lng as longitude,\n",
|
||||
" survey.area_name as area, survey.country_name as country,\n",
|
||||
" result.recording as filename,\n",
|
||||
" result.submitted_at as date\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_freespeechresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_language as language ON language.id = result.language_id\n",
|
||||
"''', db)\n",
|
||||
"free_speech_games.set_index('filename', inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "201b0aed64e8494db603de15b560d919",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dialect_region_per_free_speech = dialect_regions_and_distance(free_speech_games)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = pandas.DataFrame([\n",
|
||||
" [r['filename'], r['dialects'][0]['dialect'], r['dialects'][0]['boundary_distance']]\n",
|
||||
" for r in dialect_region_per_free_speech\n",
|
||||
" if len(r['dialects']) == 1\n",
|
||||
"], columns = ['filename', 'dialect', 'boundary_distance'])\n",
|
||||
"\n",
|
||||
"df.to_excel('../data/free_speech_recordings_by_dialect.xlsx')\n",
|
||||
"df.to_csv('../data/free_speech_recordings_by_dialect.csv')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
Reference in New Issue
Block a user