resturcured for reproducability

This commit is contained in:
2019-03-19 12:53:12 +01:00
parent 7da2bfc400
commit eaa71c9eeb
102 changed files with 2251 additions and 1472517 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,83 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Gabmap format\n",
"\n",
"Exploration of the format of the lines in example Gabmap files Martijn had sent."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/martijn_format/Dutch613-coordinates.txt') as f:\n",
" coordinates = list(f)\n",
" \n",
"with open('../data/martijn_format/Nederlands-ipa.utxt') as f:\n",
" table = list(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"coordinates[0].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"coordinates[1].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table[0].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table[1].split('\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,458 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation tables, simple example\n",
"\n",
"Simple example to create gabmap files for two words with few pronunciations an two regions."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('..')\n",
"\n",
"import pandas\n",
"import MySQLdb\n",
"import json\n",
"import copy\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"from shapely.geometry import shape, Point\n",
"\n",
"from gabmap import create_gabmap_dataframes\n",
"\n",
"from stimmen.geojson import merge_features"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/Friesland_wijken.geojson') as f:\n",
" regions = json.load(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and simplify"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"regions_simple = merge_features(copy.deepcopy(regions),\n",
" condition=lambda feature: feature['properties']['GM_NAAM'] == 'Heerenveen',\n",
")\n",
"\n",
"regions_simple = merge_features(\n",
" regions_simple,\n",
" condition=lambda feature: feature['properties']['GM_NAAM'] == 'Leeuwarden',\n",
")\n",
"regions_simple['features'] = regions_simple['features'][-2:]\n",
"\n",
"regions_simple['features'][0]['properties']['name'] = 'Heerenveen'\n",
"regions_simple['features'][1]['properties']['name'] = 'Leeuwarden'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"answers_simple = answers[\n",
" (answers['question_text'] == '\"blad\" (aan een boom)') |\n",
" (answers['question_text'] == '\"vis\"')\n",
"].copy()\n",
"\n",
"answers_simple['question_text'] = answers_simple['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))\n",
"\n",
"answers_simple['answer_text'] = answers_simple['answer_text'].map(\n",
" lambda x: x[x.find('('):x.find(')')][1:])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two words, boom and vis, with each 4 and 2 pronunciations"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>answer_text</th>\n",
" </tr>\n",
" <tr>\n",
" <th>question_text</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>blad (aan een boom)</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>vis</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" answer_text\n",
"question_text \n",
"blad (aan een boom) 4\n",
"vis 2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answers_simple.groupby('question_text').agg({'answer_text': lambda x: len(set(x))})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"centroids_example, pronunciations_example, counts_example = create_gabmap_dataframes(\n",
" regions_simple, answers_simple,\n",
" latitude_column='user_lat', longitude_column='user_lng',\n",
" word_column='question_text', pronunciation_column='answer_text',\n",
" region_name_property='name'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Resulting tables\n",
"\n",
"Stored as tab separated files for gabmap"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#name</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>52.996076</td>\n",
" <td>5.977925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>53.169940</td>\n",
" <td>5.797613</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" latitude longitude\n",
"#name \n",
"Heerenveen 52.996076 5.977925\n",
"Leeuwarden 53.169940 5.797613"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"centroids_example"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blad (aan een boom)</th>\n",
" <th>vis</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
" <td>fisk / fɪs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
" <td>fisk / fɪs</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blad (aan een boom) vis\n",
" \n",
"Heerenveen blet / blɑt / blɔd / blɛ:t fisk / fɪs\n",
"Leeuwarden blet / blɑt / blɔd / blɛ:t fisk / fɪs"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pronunciations_example"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blad (aan een boom): blet</th>\n",
" <th>blad (aan een boom): blɑt</th>\n",
" <th>blad (aan een boom): blɔd</th>\n",
" <th>blad (aan een boom): blɛ:t</th>\n",
" <th>vis: fisk</th>\n",
" <th>vis: fɪs</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>31.654676</td>\n",
" <td>2.158273</td>\n",
" <td>2.158273</td>\n",
" <td>64.028777</td>\n",
" <td>52.517986</td>\n",
" <td>47.482014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>7.865169</td>\n",
" <td>7.022472</td>\n",
" <td>8.707865</td>\n",
" <td>76.404494</td>\n",
" <td>75.000000</td>\n",
" <td>25.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blad (aan een boom): blet blad (aan een boom): blɑt \\\n",
" \n",
"Heerenveen 31.654676 2.158273 \n",
"Leeuwarden 7.865169 7.022472 \n",
"\n",
" blad (aan een boom): blɔd blad (aan een boom): blɛ:t vis: fisk \\\n",
" \n",
"Heerenveen 2.158273 64.028777 52.517986 \n",
"Leeuwarden 8.707865 76.404494 75.000000 \n",
"\n",
" vis: fɪs \n",
" \n",
"Heerenveen 47.482014 \n",
"Leeuwarden 25.000000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts_example"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"pronunciations_example.to_csv('../data/Pronunciations_example.gabmap.tsv', sep='\\t')\n",
"counts_example.to_csv('../data/Pronunciation_percentages_example.gabmap.tsv', sep='\\t')\n",
"centroids_example.to_csv('../data/Centroids_example.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])\n",
"with open('../data/Gabmap_example.geojson', 'w') as f:\n",
" json.dump(regions_simple, f, indent=1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,157 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation tables\n",
"\n",
"Creates gabmap files with region centroids, percentages and pronunciations for wijken in Friesland."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('..')\n",
"\n",
"import pandas\n",
"import MySQLdb\n",
"import json\n",
"import copy\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"from shapely.geometry import shape, Point\n",
"\n",
"from gabmap import create_gabmap_dataframes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/Friesland_wijken.geojson') as f:\n",
" regions = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"zero_latlng_questions = {\n",
" q\n",
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
"}\n",
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)].copy()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['gegaan', 'avond', 'heel', 'dag', 'bij (insect)', 'sprak (toe)',\n",
" 'oog', 'armen (lichaamsdeel)', 'kaas', 'deurtje', 'koken',\n",
" 'borst (lichaamsdeel)', 'vis', 'zaterdag', 'trein', 'geel', 'tand',\n",
" 'gezet', 'blad (aan een boom)'], dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answers_filtered['question_text'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"answers_filtered['question_text'] = answers_filtered['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))\n",
"\n",
"answers_filtered['answer_text'] = answers_filtered['answer_text'].map(\n",
" lambda x: x[x.find('('):x.find(')')][1:])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"centroids, pronunciations, counts = create_gabmap_dataframes(\n",
" regions, answers_filtered,\n",
" latitude_column='user_lat', longitude_column='user_lng',\n",
" word_column='question_text', pronunciation_column='answer_text',\n",
" region_name_property='gemeente_en_wijk_naam'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"pronunciations.to_csv('../data/Friesland_wijken_pronunciations.gabmap.tsv', sep='\\t')\n",
"counts.to_csv('../data/Friesland_wijken_pronunciation_percentages.gabmap.tsv', sep='\\t')\n",
"centroids.to_csv('../data/Friesland_wijken_centroids.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,265 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Group recordings in 4 Frysian dialect regions\n",
"\n",
" * Klaaifrysk\n",
" * Waldfrysk\n",
" * Sudwesthoeksk\n",
" * Noardhoeksk\n",
" \n",
"First run `Dialect Regions from image.ipynb`.\n",
"\n",
"![dialect regions](../data/dialects.png)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from math import floor\n",
"import json\n",
"import pandas\n",
"import MySQLdb\n",
"from collections import Counter\n",
"\n",
"from math import sqrt\n",
"import numpy as np\n",
"from shapely.geometry import shape, Point\n",
"from vincenty import vincenty\n",
"\n",
"from jupyter_progressbar import ProgressBar\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Input\n",
"\n",
"Load the geojson with the dialect region and create shapely shapes."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with open('../data/fryslan_dialect_regions.geojson', 'r') as f:\n",
" geojson = json.load(f)\n",
"\n",
"dialect_regions = [region['properties']['dialect'] for region in geojson['features']]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"shapes = {\n",
" feature['properties']['dialect']: shape(feature['geometry'])\n",
" for feature in geojson['features']\n",
"}\n",
"\n",
"def regions_for(coordinate):\n",
" regions = {\n",
" region_name\n",
" for region_name, shape in shapes.items()\n",
" if shape.contains(Point(*coordinate))\n",
" }\n",
" return regions\n",
"\n",
"def distance_to_shape(shape, longitude, latitude):\n",
" ext = shape.exterior\n",
" p = ext.interpolate(ext.project(Point(longitude, latitude)))\n",
" return vincenty((latitude, longitude), (p.y, p.x))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Query and process\n",
"\n",
"Query all picture game and free speech recordings and assign the dialect region."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def dialect_regions_and_distance(data):\n",
" return[\n",
" {\n",
" 'dialects': [\n",
" {\n",
" 'dialect': dialect,\n",
" 'boundary_distance': distance_to_shape(shapes[dialect], longitude, latitude),\n",
" }\n",
" for dialect in regions_for((longitude, latitude))\n",
" ],\n",
" 'filename': filename,\n",
" }\n",
" for filename, (latitude, longitude) in ProgressBar(\n",
" data[['latitude', 'longitude']].iterrows(),\n",
" size=len(data)\n",
" )\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"picture_games = pandas.read_sql('''\n",
"SELECT language.name as language, item.name as picture,\n",
" survey.user_lat as latitude, survey.user_lng as longitude,\n",
" survey.area_name as area, survey.country_name as country,\n",
" result.recording as filename,\n",
" result.submitted_at as date\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_picturegameresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_language as language ON language.id = result.language_id\n",
"INNER JOIN core_picturegameitem as item\n",
" ON result.picture_game_item_id = item.id\n",
"''', db)\n",
"picture_games.set_index('filename', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5825449a737b4fcab38a4f4ac2adfd87",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dialect_region_per_picture_game = dialect_regions_and_distance(picture_games)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pandas.DataFrame([\n",
" [r['filename'], r['dialects'][0]['dialect'], r['dialects'][0]['boundary_distance']]\n",
" for r in dialect_region_per_picture_game\n",
" if len(r['dialects']) == 1\n",
"], columns = ['filename', 'dialect', 'boundary_distance'])\n",
"\n",
"df.to_excel('../data/picture_game_recordings_by_dialect.xlsx')\n",
"df.to_csv('../data/picture_game_recordings_by_dialect.csv')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"free_speech_games = pandas.read_sql('''\n",
"SELECT language.name as language,\n",
" survey.user_lat as latitude, survey.user_lng as longitude,\n",
" survey.area_name as area, survey.country_name as country,\n",
" result.recording as filename,\n",
" result.submitted_at as date\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_freespeechresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_language as language ON language.id = result.language_id\n",
"''', db)\n",
"free_speech_games.set_index('filename', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8afad9f71e544658b554b828932d7769",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dialect_region_per_free_speech = dialect_regions_and_distance(free_speech_games)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"df = pandas.DataFrame([\n",
" [r['filename'], r['dialects'][0]['dialect'], r['dialects'][0]['boundary_distance']]\n",
" for r in dialect_region_per_free_speech\n",
" if len(r['dialects']) == 1\n",
"], columns = ['filename', 'dialect', 'boundary_distance'])\n",
"\n",
"df.to_excel('../data/free_speech_recordings_by_dialect.xlsx')\n",
"df.to_csv('../data/free_speech_recordings_by_dialect.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

View File

@@ -7,13 +7,11 @@
"# Segment provinces\n",
"\n",
"\n",
"Create wijk and gemeente level segmentations for all Dutch provinces and save as geojson and Gabmap KML.\n",
"Create wijk and gemeente level segmentations for two Dutch provinces, Groningen and Friesland, and save as geojson and Gabmap KML.\n",
"\n",
"All is based on CBS data.\n",
"All is based on [CBS data](https://www.cbs.nl/nl-nl/dossier/nederland-regionaal/geografische%20data/wijk-en-buurtkaart-2017)\n",
"\n",
"For Friesland, several wijken are merged.\n",
"\n",
"Note: only applied to Groningen and Friesland, because other provinces give gemetry errors."
"For Friesland, several wijken are merged, in particular those of the municipalities Ameland, Harlingen, Schiermonnikoog, Terschelling and Vlieland, and those of Leeuwarden with centroid above 53.167. These neighborhoods are small in area and hence we decided to merge, to avoid a "
]
},
{
@@ -29,7 +27,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -53,14 +51,37 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Groningen\n",
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n",
"Friesland\n",
"0\n",
"1\n",
"2\n",
"3\n",
"4\n",
"5\n",
"6\n"
]
}
],
"source": [
"for province in ['Groningen', 'Friesland']:\n",
" wijken_geojson = gwb_in_province(province, 'wijk', 2018)\n",
" gemeente_geojson = gwb_in_province(province, 'gem', 2018)\n",
"\n",
" wijken_geojson = gwb_in_province(province, 'wijk', 2018, polygon_simplification=None)\n",
" gemeente_geojson = gwb_in_province(province, 'gem', 2018, polygon_simplification=None)\n",
" \n",
" if province == 'Friesland':\n",
" for gemeente in {'Ameland', 'Harlingen', 'Schiermonnikoog', 'Terschelling', 'Vlieland'}:\n",
" merged_geojson = merge_features(\n",
@@ -83,14 +104,14 @@
" for gemeente in [feature['properties']['GM_NAAM'] for feature in gemeente_geojson['features']]:\n",
" gemeente_geojson = merge_features(\n",
" gemeente_geojson, condition=lambda feature: feature['properties']['GM_NAAM'] == gemeente)\n",
" \n",
" \n",
" for feature in wijken_geojson['features']:\n",
" feature['properties']['gemeente_en_wijk_naam'] = (\n",
" feature['properties']['GM_NAAM'] +\n",
" ', ' +\n",
" feature['properties'].get('WK_NAAM', '')\n",
" ).replace('&', 'en').replace('/', ' ').replace('\"', ' ').replace(\"'\", ' ')\n",
" \n",
" \n",
" for feature in gemeente_geojson['features']:\n",
" feature['properties']['gemeente_naam'] = (\n",
" feature['properties']['GM_NAAM']\n",
@@ -106,6 +127,13 @@
" with open('../data/{}_gemeentes.kml'.format(province), 'w') as f:\n",
" f.write(as_gabmap_kml(gemeente_geojson, name_property='gemeente_naam'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

File diff suppressed because one or more lines are too long