restructured for reproducibility

2019-03-19 12:53:12 +01:00
parent 7da2bfc400
commit eaa71c9eeb
102 changed files with 2251 additions and 1472517 deletions
--- a/Municipalities.ipynb
+++ b/Municipalities.ipynb
--- a/notebooks/Dialect
+++ b/notebooks/Dialect
--- a/notebooks/Gabmap
+++ b/notebooks/Gabmap
@@ -1,83 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Gabmap format\n",
-    "\n",
-    "Exploration of the format of the lines in example Gabmap files Martijn had sent."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/martijn_format/Dutch613-coordinates.txt') as f:\n",
-    "    coordinates = list(f)\n",
-    "    \n",
-    "with open('../data/martijn_format/Nederlands-ipa.utxt') as f:\n",
-    "    table = list(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "coordinates[0].split('\\t')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "coordinates[1].split('\\t')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "table[0].split('\\t')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "table[1].split('\\t')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/notebooks/Gabmap
+++ b/notebooks/Gabmap
@@ -1,458 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Geographical pronunciation tables, simple example\n",
-    "\n",
-    "Simple example that creates Gabmap files for two words with few pronunciations and two regions."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.append('..')\n",
-    "\n",
-    "import pandas\n",
-    "import MySQLdb\n",
-    "import json\n",
-    "import copy\n",
-    "\n",
-    "db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
-    "\n",
-    "from shapely.geometry import shape, Point\n",
-    "\n",
-    "from gabmap import create_gabmap_dataframes\n",
-    "\n",
-    "from stimmen.geojson import merge_features"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/Friesland_wijken.geojson') as f:\n",
-    "    regions = json.load(f)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load and simplify"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Answers to how participants state a word should be pronounced\n",
-    "\n",
-    "answers = pandas.read_sql('''\n",
-    "SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
-    "FROM       core_surveyresult as survey\n",
-    "INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
-    "INNER JOIN core_predictionquizresultquestionanswer as answer\n",
-    "    ON result.id = answer.prediction_quiz_id\n",
-    "''', db)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "regions_simple = merge_features(copy.deepcopy(regions),\n",
-    "    condition=lambda feature: feature['properties']['GM_NAAM'] == 'Heerenveen',\n",
-    ")\n",
-    "\n",
-    "regions_simple = merge_features(\n",
-    "    regions_simple,\n",
-    "    condition=lambda feature: feature['properties']['GM_NAAM'] == 'Leeuwarden',\n",
-    ")\n",
-    "regions_simple['features'] = regions_simple['features'][-2:]\n",
-    "\n",
-    "regions_simple['features'][0]['properties']['name'] = 'Heerenveen'\n",
-    "regions_simple['features'][1]['properties']['name'] = 'Leeuwarden'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "answers_simple = answers[\n",
-    "    (answers['question_text'] == '\"blad\" (aan een boom)') |\n",
-    "    (answers['question_text'] == '\"vis\"')\n",
-    "].copy()\n",
-    "\n",
-    "answers_simple['question_text'] = answers_simple['question_text'].map(\n",
-    "    lambda x: x.replace('\"', '').replace('*', ''))\n",
-    "\n",
-    "answers_simple['answer_text'] = answers_simple['answer_text'].map(\n",
-    "    lambda x: x[x.find('('):x.find(')')][1:])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Two words, blad and vis, with 4 and 2 pronunciations respectively."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>answer_text</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>question_text</th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>blad (aan een boom)</th>\n",
-       "      <td>4</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>vis</th>\n",
-       "      <td>2</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                     answer_text\n",
-       "question_text                   \n",
-       "blad (aan een boom)            4\n",
-       "vis                            2"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "answers_simple.groupby('question_text').agg({'answer_text': lambda x: len(set(x))})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "centroids_example, pronunciations_example, counts_example = create_gabmap_dataframes(\n",
-    "    regions_simple, answers_simple,\n",
-    "    latitude_column='user_lat', longitude_column='user_lng',\n",
-    "    word_column='question_text', pronunciation_column='answer_text',\n",
-    "    region_name_property='name'\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Resulting tables\n",
-    "\n",
-    "Stored as tab-separated files for Gabmap."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>latitude</th>\n",
-       "      <th>longitude</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>#name</th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>Heerenveen</th>\n",
-       "      <td>52.996076</td>\n",
-       "      <td>5.977925</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Leeuwarden</th>\n",
-       "      <td>53.169940</td>\n",
-       "      <td>5.797613</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "             latitude  longitude\n",
-       "#name                           \n",
-       "Heerenveen  52.996076   5.977925\n",
-       "Leeuwarden  53.169940   5.797613"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "centroids_example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>blad (aan een boom)</th>\n",
-       "      <th>vis</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>Heerenveen</th>\n",
-       "      <td>blet / blɑt / blɔd / blɛ:t</td>\n",
-       "      <td>fisk / fɪs</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Leeuwarden</th>\n",
-       "      <td>blet / blɑt / blɔd / blɛ:t</td>\n",
-       "      <td>fisk / fɪs</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                   blad (aan een boom)         vis\n",
-       "                                                  \n",
-       "Heerenveen  blet / blɑt / blɔd / blɛ:t  fisk / fɪs\n",
-       "Leeuwarden  blet / blɑt / blɔd / blɛ:t  fisk / fɪs"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "pronunciations_example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>blad (aan een boom): blet</th>\n",
-       "      <th>blad (aan een boom): blɑt</th>\n",
-       "      <th>blad (aan een boom): blɔd</th>\n",
-       "      <th>blad (aan een boom): blɛ:t</th>\n",
-       "      <th>vis: fisk</th>\n",
-       "      <th>vis: fɪs</th>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "      <th></th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>Heerenveen</th>\n",
-       "      <td>31.654676</td>\n",
-       "      <td>2.158273</td>\n",
-       "      <td>2.158273</td>\n",
-       "      <td>64.028777</td>\n",
-       "      <td>52.517986</td>\n",
-       "      <td>47.482014</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>Leeuwarden</th>\n",
-       "      <td>7.865169</td>\n",
-       "      <td>7.022472</td>\n",
-       "      <td>8.707865</td>\n",
-       "      <td>76.404494</td>\n",
-       "      <td>75.000000</td>\n",
-       "      <td>25.000000</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            blad (aan een boom): blet  blad (aan een boom): blɑt  \\\n",
-       "                                                                   \n",
-       "Heerenveen                  31.654676                   2.158273   \n",
-       "Leeuwarden                   7.865169                   7.022472   \n",
-       "\n",
-       "            blad (aan een boom): blɔd  blad (aan een boom): blɛ:t  vis: fisk  \\\n",
-       "                                                                               \n",
-       "Heerenveen                   2.158273                   64.028777  52.517986   \n",
-       "Leeuwarden                   8.707865                   76.404494  75.000000   \n",
-       "\n",
-       "             vis: fɪs  \n",
-       "                       \n",
-       "Heerenveen  47.482014  \n",
-       "Leeuwarden  25.000000  "
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "counts_example"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pronunciations_example.to_csv('../data/Pronunciations_example.gabmap.tsv', sep='\\t')\n",
-    "counts_example.to_csv('../data/Pronunciation_percentages_example.gabmap.tsv', sep='\\t')\n",
-    "centroids_example.to_csv('../data/Centroids_example.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])\n",
-    "with open('../data/Gabmap_example.geojson', 'w') as f:\n",
-    "    json.dump(regions_simple, f, indent=1)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/notebooks/Gabmap
+++ b/notebooks/Gabmap
@@ -1,157 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Geographical pronunciation tables\n",
-    "\n",
-    "Creates gabmap files with region centroids, percentages and pronunciations for wijken in Friesland."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sys\n",
-    "sys.path.append('..')\n",
-    "\n",
-    "import pandas\n",
-    "import MySQLdb\n",
-    "import json\n",
-    "import copy\n",
-    "\n",
-    "db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
-    "\n",
-    "from shapely.geometry import shape, Point\n",
-    "\n",
-    "from gabmap import create_gabmap_dataframes"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../data/Friesland_wijken.geojson') as f:\n",
-    "    regions = json.load(f)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Answers to how participants state a word should be pronounced\n",
-    "\n",
-    "answers = pandas.read_sql('''\n",
-    "SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
-    "FROM       core_surveyresult as survey\n",
-    "INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
-    "INNER JOIN core_predictionquizresultquestionanswer as answer\n",
-    "    ON result.id = answer.prediction_quiz_id\n",
-    "''', db)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "zero_latlng_questions = {\n",
-    "    q\n",
-    "    for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
-    "    if row['user_lat'] == 0 and row['user_lng'] == 0\n",
-    "}\n",
-    "answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)].copy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "array(['gegaan', 'avond', 'heel', 'dag', 'bij (insect)', 'sprak (toe)',\n",
-       "       'oog', 'armen (lichaamsdeel)', 'kaas', 'deurtje', 'koken',\n",
-       "       'borst (lichaamsdeel)', 'vis', 'zaterdag', 'trein', 'geel', 'tand',\n",
-       "       'gezet', 'blad (aan een boom)'], dtype=object)"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "answers_filtered['question_text'].unique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "answers_filtered['question_text'] = answers_filtered['question_text'].map(\n",
-    "    lambda x: x.replace('\"', '').replace('*', ''))\n",
-    "\n",
-    "answers_filtered['answer_text'] = answers_filtered['answer_text'].map(\n",
-    "    lambda x: x[x.find('('):x.find(')')][1:])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "centroids, pronunciations, counts = create_gabmap_dataframes(\n",
-    "    regions, answers_filtered,\n",
-    "    latitude_column='user_lat', longitude_column='user_lng',\n",
-    "    word_column='question_text', pronunciation_column='answer_text',\n",
-    "    region_name_property='gemeente_en_wijk_naam'\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pronunciations.to_csv('../data/Friesland_wijken_pronunciations.gabmap.tsv', sep='\\t')\n",
-    "counts.to_csv('../data/Friesland_wijken_pronunciation_percentages.gabmap.tsv', sep='\\t')\n",
-    "centroids.to_csv('../data/Friesland_wijken_centroids.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/notebooks/Group
+++ b/notebooks/Group
@@ -1,265 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Group recordings in 4 Frisian dialect regions\n",
-    "\n",
-    " * Klaaifrysk\n",
-    " * Waldfrysk\n",
-    " * Sudwesthoeksk\n",
-    " * Noardhoeksk\n",
-    " \n",
-    "First run `Dialect Regions from image.ipynb`.\n",
-    "\n",
-    "![dialect regions](../data/dialects.png)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from math import floor\n",
-    "import json\n",
-    "import pandas\n",
-    "import MySQLdb\n",
-    "from collections import Counter\n",
-    "\n",
-    "from math import sqrt\n",
-    "import numpy as np\n",
-    "from shapely.geometry import shape, Point\n",
-    "from vincenty import vincenty\n",
-    "\n",
-    "from jupyter_progressbar import ProgressBar\n",
-    "\n",
-    "db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Input\n",
-    "\n",
-    "Load the GeoJSON with the dialect regions and create Shapely shapes."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "with open('../data/fryslan_dialect_regions.geojson', 'r') as f:\n",
-    "    geojson = json.load(f)\n",
-    "\n",
-    "dialect_regions = [region['properties']['dialect'] for region in geojson['features']]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "shapes = {\n",
-    "    feature['properties']['dialect']: shape(feature['geometry'])\n",
-    "    for feature in geojson['features']\n",
-    "}\n",
-    "\n",
-    "def regions_for(coordinate):\n",
-    "    regions = {\n",
-    "        region_name\n",
-    "        for region_name, shape in shapes.items()\n",
-    "        if shape.contains(Point(*coordinate))\n",
-    "    }\n",
-    "    return regions\n",
-    "\n",
-    "def distance_to_shape(shape, longitude, latitude):\n",
-    "    ext = shape.exterior\n",
-    "    p = ext.interpolate(ext.project(Point(longitude, latitude)))\n",
-    "    return vincenty((latitude, longitude), (p.y, p.x))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Query and process\n",
-    "\n",
-    "Query all picture game and free speech recordings and assign the dialect region."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def dialect_regions_and_distance(data):\n",
-    "    return[\n",
-    "        {\n",
-    "            'dialects': [\n",
-    "                {\n",
-    "                    'dialect': dialect,\n",
-    "                    'boundary_distance': distance_to_shape(shapes[dialect], longitude, latitude),\n",
-    "                }\n",
-    "                for dialect in regions_for((longitude, latitude))\n",
-    "            ],\n",
-    "            'filename': filename,\n",
-    "        }\n",
-    "        for filename, (latitude, longitude) in ProgressBar(\n",
-    "            data[['latitude', 'longitude']].iterrows(),\n",
-    "            size=len(data)\n",
-    "        )\n",
-    "    ]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "picture_games = pandas.read_sql('''\n",
-    "SELECT language.name as language, item.name as picture,\n",
-    "       survey.user_lat as latitude, survey.user_lng as longitude,\n",
-    "       survey.area_name as area, survey.country_name as country,\n",
-    "       result.recording as filename,\n",
-    "       result.submitted_at as date\n",
-    "FROM       core_surveyresult as survey\n",
-    "INNER JOIN core_picturegameresult as result ON survey.id = result.survey_result_id\n",
-    "INNER JOIN core_language as language ON language.id = result.language_id\n",
-    "INNER JOIN core_picturegameitem as item\n",
-    "    ON result.picture_game_item_id = item.id\n",
-    "''', db)\n",
-    "picture_games.set_index('filename', inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5825449a737b4fcab38a4f4ac2adfd87",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "dialect_region_per_picture_game = dialect_regions_and_distance(picture_games)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pandas.DataFrame([\n",
-    "    [r['filename'], r['dialects'][0]['dialect'], r['dialects'][0]['boundary_distance']]\n",
-    "    for r in dialect_region_per_picture_game\n",
-    "    if len(r['dialects']) == 1\n",
-    "], columns = ['filename', 'dialect', 'boundary_distance'])\n",
-    "\n",
-    "df.to_excel('../data/picture_game_recordings_by_dialect.xlsx')\n",
-    "df.to_csv('../data/picture_game_recordings_by_dialect.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "free_speech_games = pandas.read_sql('''\n",
-    "SELECT language.name as language,\n",
-    "       survey.user_lat as latitude, survey.user_lng as longitude,\n",
-    "       survey.area_name as area, survey.country_name as country,\n",
-    "       result.recording as filename,\n",
-    "       result.submitted_at as date\n",
-    "FROM       core_surveyresult as survey\n",
-    "INNER JOIN core_freespeechresult as result ON survey.id = result.survey_result_id\n",
-    "INNER JOIN core_language as language ON language.id = result.language_id\n",
-    "''', db)\n",
-    "free_speech_games.set_index('filename', inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "8afad9f71e544658b554b828932d7769",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "dialect_region_per_free_speech = dialect_regions_and_distance(free_speech_games)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pandas.DataFrame([\n",
-    "    [r['filename'], r['dialects'][0]['dialect'], r['dialects'][0]['boundary_distance']]\n",
-    "    for r in dialect_region_per_free_speech\n",
-    "    if len(r['dialects']) == 1\n",
-    "], columns = ['filename', 'dialect', 'boundary_distance'])\n",
-    "\n",
-    "df.to_excel('../data/free_speech_recordings_by_dialect.xlsx')\n",
-    "df.to_csv('../data/free_speech_recordings_by_dialect.csv')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
--- a/pronunciation.ipynb
+++ b/pronunciation.ipynb
--- a/notebooks/Segment
+++ b/notebooks/Segment
@@ -7,13 +7,11 @@
    "# Segment provinces\n",
    "\n",
    "\n",
-    "Create wijk and gemeente level segmentations for all Dutch provinces and save as geojson and Gabmap KML.\n",
+    "Create wijk and gemeente level segmentations for two Dutch provinces, Groningen and Friesland, and save as geojson and Gabmap KML.\n",
    "\n",
-    "All is based on CBS data.\n",
+    "Everything is based on [CBS data](https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische%20data/wijk-en-buurtkaart-2017).\n",
    "\n",
-    "For Friesland, several wijken are merged.\n",
-    "\n",
-    "Note: only applied to Groningen and Friesland, because other provinces give geometry errors."
+    "For Friesland, several wijken are merged, in particular those of the municipalities Ameland, Harlingen, Schiermonnikoog, Terschelling and Vlieland, and those of Leeuwarden with a centroid above 53.167. These neighborhoods are small in area, so we decided to merge them."
   ]
  },
  {
@@ -29,7 +27,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -53,14 +51,37 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Groningen\n",
+      "0\n",
+      "1\n",
+      "2\n",
+      "3\n",
+      "4\n",
+      "5\n",
+      "6\n",
+      "Friesland\n",
+      "0\n",
+      "1\n",
+      "2\n",
+      "3\n",
+      "4\n",
+      "5\n",
+      "6\n"
+     ]
+    }
+   ],
   "source": [
    "for province in ['Groningen', 'Friesland']:\n",
-    "    wijken_geojson = gwb_in_province(province, 'wijk', 2018)\n",
-    "    gemeente_geojson = gwb_in_province(province, 'gem', 2018)\n",
-    "\n",
+    "    wijken_geojson = gwb_in_province(province, 'wijk', 2018, polygon_simplification=None)\n",
+    "    gemeente_geojson = gwb_in_province(province, 'gem', 2018, polygon_simplification=None)\n",
+    "    \n",
    "    if province == 'Friesland':\n",
    "        for gemeente in {'Ameland', 'Harlingen', 'Schiermonnikoog', 'Terschelling', 'Vlieland'}:\n",
    "            merged_geojson = merge_features(\n",
@@ -83,14 +104,14 @@
    "    for gemeente in [feature['properties']['GM_NAAM'] for feature in gemeente_geojson['features']]:\n",
    "        gemeente_geojson = merge_features(\n",
    "            gemeente_geojson, condition=lambda feature: feature['properties']['GM_NAAM'] == gemeente)\n",
-    "        \n",
+    "    \n",
    "    for feature in wijken_geojson['features']:\n",
    "        feature['properties']['gemeente_en_wijk_naam'] = (\n",
    "            feature['properties']['GM_NAAM'] +\n",
    "            ', ' +\n",
    "            feature['properties'].get('WK_NAAM', '')\n",
    "        ).replace('&', 'en').replace('/', ' ').replace('\"', ' ').replace(\"'\", ' ')\n",
-    "        \n",
+    "    \n",
    "    for feature in gemeente_geojson['features']:\n",
    "        feature['properties']['gemeente_naam'] = (\n",
    "            feature['properties']['GM_NAAM']\n",
@@ -106,6 +127,13 @@
    "    with open('../data/{}_gemeentes.kml'.format(province), 'w') as f:\n",
    "        f.write(as_gabmap_kml(gemeente_geojson, name_property='gemeente_naam'))"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
--- a/Segmentations.ipynb
+++ b/Segmentations.ipynb