cleanup notebooks Frysian dialect regions

2018-09-28 12:28:54 +02:00
parent 6968a587ba
commit 204682cf26
8 changed files with 37654 additions and 2 deletions
--- a/notebooks/Group
+++ b/notebooks/Group
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Group recordings in 4 Frisian dialect regions\n",
+    "\n",
+    " * Klaaifrysk\n",
+    " * Waldfrysk\n",
+    " * Sudwesthoeksk\n",
+    " * Noardhoeksk\n",
+    " \n",
+    "First run `Dialect Regions from image.ipynb`.\n",
+    "\n",
+    "![dialect regions](../data/dialects.png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from math import floor\n",
+    "import json\n",
+    "import pandas\n",
+    "import MySQLdb\n",
+    "from collections import Counter\n",
+    "\n",
+    "from math import sqrt\n",
+    "import numpy as np\n",
+    "from shapely.geometry import shape, Point\n",
+    "from vincenty import vincenty\n",
+    "\n",
+    "from jupyter_progressbar import ProgressBar\n",
+    "\n",
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "# Never store database credentials in the notebook: they end up in version control.\n",
+    "# The user and password are read from the environment (STIMMEN_DB_USER,\n",
+    "# STIMMEN_DB_PASSWORD); without a password in the environment you are prompted for it.\n",
+    "# The password that used to be hardcoded on this line must be considered leaked\n",
+    "# and should be rotated.\n",
+    "db = MySQLdb.connect(\n",
+    "    user=os.environ.get('STIMMEN_DB_USER', 'root'),\n",
+    "    passwd=os.environ.get('STIMMEN_DB_PASSWORD') or getpass('MySQL password: '),\n",
+    "    db='stimmen',\n",
+    "    charset='utf8',\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Input\n",
+    "\n",
+    "Load the GeoJSON file with the dialect regions and create a shapely shape for each region."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Be explicit about the encoding so the platform default is not used for decoding.\n",
+    "with open('dialect_regions.geojson', 'r', encoding='utf-8') as geojson_file:\n",
+    "    geojson = json.load(geojson_file)\n",
+    "\n",
+    "# Dialect names of the regions, in the order of the GeoJSON features.\n",
+    "dialect_regions = [\n",
+    "    feature['properties']['dialect']\n",
+    "    for feature in geojson['features']\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One shapely geometry per GeoJSON feature, keyed by the feature's dialect name.\n",
+    "shapes = dict(\n",
+    "    (feature['properties']['dialect'], shape(feature['geometry']))\n",
+    "    for feature in geojson['features']\n",
+    ")\n",
+    "\n",
+    "def regions_for(coordinate):\n",
+    "    \"\"\"Return the set of dialect region names whose shape contains `coordinate`.\n",
+    "\n",
+    "    `coordinate` is unpacked into `Point(*coordinate)`; the caller in this notebook\n",
+    "    passes `(longitude, latitude)`. The set is empty when no region contains the point.\n",
+    "    \"\"\"\n",
+    "    # Build the point once instead of once per region.\n",
+    "    point = Point(*coordinate)\n",
+    "    return {\n",
+    "        region_name\n",
+    "        for region_name, region_shape in shapes.items()\n",
+    "        if region_shape.contains(point)\n",
+    "    }\n",
+    "\n",
+    "def distance_to_shape(shape, longitude, latitude):\n",
+    "    \"\"\"Return the distance from a point to the nearest point on the outline of `shape`.\n",
+    "\n",
+    "    The nearest outline point is located with shapely's `project` / `interpolate` on\n",
+    "    the raw longitude/latitude values; the distance to that point is then computed by\n",
+    "    `vincenty`, in whatever unit `vincenty` returns (presumably kilometres, its\n",
+    "    default -- TODO confirm for the installed version).\n",
+    "\n",
+    "    A Polygon is measured against its exterior ring. Geometries without an `exterior`\n",
+    "    attribute (e.g. a MultiPolygon) are measured against their `boundary` instead of\n",
+    "    raising an AttributeError.\n",
+    "    \"\"\"\n",
+    "    outline = getattr(shape, 'exterior', None)\n",
+    "    if outline is None:\n",
+    "        outline = shape.boundary\n",
+    "    nearest = outline.interpolate(outline.project(Point(longitude, latitude)))\n",
+    "    return vincenty((latitude, longitude), (nearest.y, nearest.x))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Query and process\n",
+    "\n",
+    "Query all picture game and free speech recordings and assign each recording its dialect region. Only recordings whose location lies in exactly one region are exported."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def dialect_regions_and_distance(data):\n",
+    "    \"\"\"List, for every row of `data`, the dialect regions that contain its location.\n",
+    "\n",
+    "    `data` must have `latitude` and `longitude` columns; the index value of each row\n",
+    "    is reported as `filename`. Every result is a dict with the keys `dialects` (one\n",
+    "    entry with `dialect` and `boundary_distance` per containing region) and `filename`.\n",
+    "    \"\"\"\n",
+    "    results = []\n",
+    "    rows = ProgressBar(\n",
+    "        data[['latitude', 'longitude']].iterrows(),\n",
+    "        size=len(data)\n",
+    "    )\n",
+    "    for filename, (latitude, longitude) in rows:\n",
+    "        # Coordinates are passed as (longitude, latitude), matching distance_to_shape.\n",
+    "        matches = []\n",
+    "        for dialect in regions_for((longitude, latitude)):\n",
+    "            matches.append({\n",
+    "                'dialect': dialect,\n",
+    "                'boundary_distance': distance_to_shape(shapes[dialect], longitude, latitude),\n",
+    "            })\n",
+    "        results.append({'dialects': matches, 'filename': filename})\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All picture game recordings with their survey location, indexed by recording filename.\n",
+    "picture_games = pandas.read_sql(\n",
+    "    sql='''\n",
+    "SELECT language.name as language, item.name as picture,\n",
+    "       survey.user_lat as latitude, survey.user_lng as longitude,\n",
+    "       survey.area_name as area, survey.country_name as country,\n",
+    "       result.recording as filename,\n",
+    "       result.submitted_at as date\n",
+    "FROM       core_surveyresult as survey\n",
+    "INNER JOIN core_picturegameresult as result ON survey.id = result.survey_result_id\n",
+    "INNER JOIN core_language as language ON language.id = result.language_id\n",
+    "INNER JOIN core_picturegameitem as item\n",
+    "    ON result.picture_game_item_id = item.id\n",
+    "''',\n",
+    "    con=db,\n",
+    ").set_index('filename')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "67ed3190256b447c81daf3df1f189318",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "dialect_region_per_picture_game = dialect_regions_and_distance(picture_games)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the recordings that fall in exactly one dialect region.\n",
+    "df = pandas.DataFrame(\n",
+    "    [\n",
+    "        [recording['filename'], match['dialect'], match['boundary_distance']]\n",
+    "        for recording in dialect_region_per_picture_game\n",
+    "        if len(recording['dialects']) == 1\n",
+    "        for match in recording['dialects']\n",
+    "    ],\n",
+    "    columns=['filename', 'dialect', 'boundary_distance'],\n",
+    ")\n",
+    "\n",
+    "# Export the same table as Excel and as CSV.\n",
+    "df.to_excel('../data/picture_game_recordings_by_dialect.xlsx')\n",
+    "df.to_csv('../data/picture_game_recordings_by_dialect.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# All free speech recordings with their survey location, indexed by recording filename.\n",
+    "free_speech_games = pandas.read_sql(\n",
+    "    sql='''\n",
+    "SELECT language.name as language,\n",
+    "       survey.user_lat as latitude, survey.user_lng as longitude,\n",
+    "       survey.area_name as area, survey.country_name as country,\n",
+    "       result.recording as filename,\n",
+    "       result.submitted_at as date\n",
+    "FROM       core_surveyresult as survey\n",
+    "INNER JOIN core_freespeechresult as result ON survey.id = result.survey_result_id\n",
+    "INNER JOIN core_language as language ON language.id = result.language_id\n",
+    "''',\n",
+    "    con=db,\n",
+    ").set_index('filename')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "201b0aed64e8494db603de15b560d919",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "dialect_region_per_free_speech = dialect_regions_and_distance(free_speech_games)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only the recordings that fall in exactly one dialect region.\n",
+    "df = pandas.DataFrame(\n",
+    "    [\n",
+    "        [recording['filename'], match['dialect'], match['boundary_distance']]\n",
+    "        for recording in dialect_region_per_free_speech\n",
+    "        if len(recording['dialects']) == 1\n",
+    "        for match in recording['dialects']\n",
+    "    ],\n",
+    "    columns=['filename', 'dialect', 'boundary_distance'],\n",
+    ")\n",
+    "\n",
+    "# Export the same table as Excel and as CSV.\n",
+    "df.to_excel('../data/free_speech_recordings_by_dialect.xlsx')\n",
+    "df.to_csv('../data/free_speech_recordings_by_dialect.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}