cleaned up gabmap file creation

This commit is contained in:
2018-10-01 17:19:36 +02:00
parent e9ca31d8ec
commit d299069253
36 changed files with 42976 additions and 47776 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,83 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Gabmap format\n",
"\n",
"Exploration of the format of the lines in example Gabmap files Martijn had sent."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/martijn_format/Dutch613-coordinates.txt') as f:\n",
" coordinates = list(f)\n",
" \n",
"with open('../data/martijn_format/Nederlands-ipa.utxt') as f:\n",
" table = list(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"coordinates[0].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"coordinates[1].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table[0].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"table[1].split('\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,458 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation tables, simple example\n",
"\n",
"Simple example to create gabmap files for two words with few pronunciations an two regions."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('..')\n",
"\n",
"import pandas\n",
"import MySQLdb\n",
"import json\n",
"import copy\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"from shapely.geometry import shape, Point\n",
"\n",
"from gabmap import create_gabmap_dataframes\n",
"\n",
"from stimmen.geojson import merge_features"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/Friesland_wijken.geojson') as f:\n",
" regions = json.load(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and simplify"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"regions_simple = merge_features(copy.deepcopy(regions),\n",
" condition=lambda feature: feature['properties']['GM_NAAM'] == 'Heerenveen',\n",
")\n",
"\n",
"regions_simple = merge_features(\n",
" regions_simple,\n",
" condition=lambda feature: feature['properties']['GM_NAAM'] == 'Leeuwarden',\n",
")\n",
"regions_simple['features'] = regions_simple['features'][-2:]\n",
"\n",
"regions_simple['features'][0]['properties']['name'] = 'Heerenveen'\n",
"regions_simple['features'][1]['properties']['name'] = 'Leeuwarden'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"answers_simple = answers[\n",
" (answers['question_text'] == '\"blad\" (aan een boom)') |\n",
" (answers['question_text'] == '\"vis\"')\n",
"].copy()\n",
"\n",
"answers_simple['question_text'] = answers_simple['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))\n",
"\n",
"answers_simple['answer_text'] = answers_simple['answer_text'].map(\n",
" lambda x: x[x.find('('):x.find(')')][1:])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two words, boom and vis, with each 4 and 2 pronunciations"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>answer_text</th>\n",
" </tr>\n",
" <tr>\n",
" <th>question_text</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>blad (aan een boom)</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>vis</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" answer_text\n",
"question_text \n",
"blad (aan een boom) 4\n",
"vis 2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answers_simple.groupby('question_text').agg({'answer_text': lambda x: len(set(x))})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"centroids_example, pronunciations_example, counts_example = create_gabmap_dataframes(\n",
" regions_simple, answers_simple,\n",
" latitude_column='user_lat', longitude_column='user_lng',\n",
" word_column='question_text', pronunciation_column='answer_text',\n",
" region_name_property='name'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Resulting tables\n",
"\n",
"Stored as tab separated files for gabmap"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#name</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>52.996076</td>\n",
" <td>5.977925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>53.169940</td>\n",
" <td>5.797613</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" latitude longitude\n",
"#name \n",
"Heerenveen 52.996076 5.977925\n",
"Leeuwarden 53.169940 5.797613"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"centroids_example"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blad (aan een boom)</th>\n",
" <th>vis</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
" <td>fisk / fɪs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
" <td>fisk / fɪs</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blad (aan een boom) vis\n",
" \n",
"Heerenveen blet / blɑt / blɔd / blɛ:t fisk / fɪs\n",
"Leeuwarden blet / blɑt / blɔd / blɛ:t fisk / fɪs"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pronunciations_example"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blad (aan een boom): blet</th>\n",
" <th>blad (aan een boom): blɑt</th>\n",
" <th>blad (aan een boom): blɔd</th>\n",
" <th>blad (aan een boom): blɛ:t</th>\n",
" <th>vis: fisk</th>\n",
" <th>vis: fɪs</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>31.654676</td>\n",
" <td>2.158273</td>\n",
" <td>2.158273</td>\n",
" <td>64.028777</td>\n",
" <td>52.517986</td>\n",
" <td>47.482014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>7.865169</td>\n",
" <td>7.022472</td>\n",
" <td>8.707865</td>\n",
" <td>76.404494</td>\n",
" <td>75.000000</td>\n",
" <td>25.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blad (aan een boom): blet blad (aan een boom): blɑt \\\n",
" \n",
"Heerenveen 31.654676 2.158273 \n",
"Leeuwarden 7.865169 7.022472 \n",
"\n",
" blad (aan een boom): blɔd blad (aan een boom): blɛ:t vis: fisk \\\n",
" \n",
"Heerenveen 2.158273 64.028777 52.517986 \n",
"Leeuwarden 8.707865 76.404494 75.000000 \n",
"\n",
" vis: fɪs \n",
" \n",
"Heerenveen 47.482014 \n",
"Leeuwarden 25.000000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts_example"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"pronunciations_example.to_csv('../data/Pronunciations_example.gabmap.tsv', sep='\\t')\n",
"counts_example.to_csv('../data/Pronunciation_percentages_example.gabmap.tsv', sep='\\t')\n",
"centroids_example.to_csv('../data/Centroids_example.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])\n",
"with open('../data/Gabmap_example.geojson', 'w') as f:\n",
" json.dump(regions_simple, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -0,0 +1,157 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation tables\n",
"\n",
"Creates gabmap files with region centroids, percentages and pronunciations for wijken in Friesland."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('..')\n",
"\n",
"import pandas\n",
"import MySQLdb\n",
"import json\n",
"import copy\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"from shapely.geometry import shape, Point\n",
"\n",
"from gabmap import create_gabmap_dataframes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open('../data/Friesland_wijken.geojson') as f:\n",
" regions = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"zero_latlng_questions = {\n",
" q\n",
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
"}\n",
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)].copy()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['gegaan', 'avond', 'heel', 'dag', 'bij (insect)', 'sprak (toe)',\n",
" 'oog', 'armen (lichaamsdeel)', 'kaas', 'deurtje', 'koken',\n",
" 'borst (lichaamsdeel)', 'vis', 'zaterdag', 'trein', 'geel', 'tand',\n",
" 'gezet', 'blad (aan een boom)'], dtype=object)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"answers_filtered['question_text'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"answers_filtered['question_text'] = answers_filtered['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))\n",
"\n",
"answers_filtered['answer_text'] = answers_filtered['answer_text'].map(\n",
" lambda x: x[x.find('('):x.find(')')][1:])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"centroids, pronunciations, counts = create_gabmap_dataframes(\n",
" regions, answers_filtered,\n",
" latitude_column='user_lat', longitude_column='user_lng',\n",
" word_column='question_text', pronunciation_column='answer_text',\n",
" region_name_property='gemeente_en_wijk_naam'\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"pronunciations.to_csv('../data/Friesland_wijken_pronunciations.gabmap.tsv', sep='\\t')\n",
"counts.to_csv('../data/Friesland_wijken_pronunciation_percentages.gabmap.tsv', sep='\\t')\n",
"centroids.to_csv('../data/Friesland_wijken_centroids.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -49,13 +49,13 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with open('dialect_regions.geojson', 'r') as f:\n",
"with open('../data/fryslan_dialect_regions.geojson', 'r') as f:\n",
" geojson = json.load(f)\n",
"\n",
"dialect_regions = [region['properties']['dialect'] for region in geojson['features']]"
@@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -97,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -122,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -143,13 +143,13 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "67ed3190256b447c81daf3df1f189318",
"model_id": "5825449a737b4fcab38a4f4ac2adfd87",
"version_major": 2,
"version_minor": 0
},
@@ -167,7 +167,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -183,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -202,13 +202,13 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "201b0aed64e8494db603de15b560d919",
"model_id": "8afad9f71e544658b554b828932d7769",
"version_major": 2,
"version_minor": 0
},
@@ -226,7 +226,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [

View File

@@ -1,430 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"with open('martijn_format/Dutch613-coordinates.txt') as f:\n",
" coordinates = list(f)\n",
" \n",
"with open('martijn_format/Nederlands-ipa.utxt') as f:\n",
" table = list(f)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Aalsmeer NH', '4.76163', '52.2693\\n']"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"coordinates[1].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['',\n",
" 'kippen',\n",
" 'mijn',\n",
" 'vriend',\n",
" 'bloemen',\n",
" 'spinnen',\n",
" 'machines',\n",
" 'werk',\n",
" 'op',\n",
" 'schip',\n",
" 'kregen',\n",
" 'beschimmeld',\n",
" 'brood',\n",
" 'timmerman',\n",
" 'splinter',\n",
" 'vinger',\n",
" 'fabriek',\n",
" 'vier',\n",
" 'bier',\n",
" 'twee',\n",
" 'drie',\n",
" 'hij',\n",
" 'knuppel',\n",
" 'ik',\n",
" 'knie',\n",
" 'gezien',\n",
" 'ragebol',\n",
" 'pet',\n",
" 'paddestoel',\n",
" 'kerel',\n",
" 'brede',\n",
" 'stenen',\n",
" 'breder',\n",
" 'breedste',\n",
" 'standbeeld',\n",
" 'duivel',\n",
" 'gebleven',\n",
" 'meester',\n",
" 'zee',\n",
" 'graag',\n",
" 'keelpijn',\n",
" 'steel',\n",
" 'bezem',\n",
" 'neen',\n",
" 'geroepen',\n",
" 'peer',\n",
" 'rijp',\n",
" 'geld',\n",
" 'ver',\n",
" 'brengen',\n",
" 'vrouw',\n",
" 'zwemmen',\n",
" 'sterk',\n",
" 'bed',\n",
" 'optillen',\n",
" 'metselaar',\n",
" 'springen',\n",
" 'boterham',\n",
" 'vader',\n",
" 'zes',\n",
" 'jaar',\n",
" 'school',\n",
" 'laten',\n",
" 'gaan',\n",
" 'water',\n",
" 'potten',\n",
" 'zijn',\n",
" 'veel',\n",
" 'maart',\n",
" 'nog',\n",
" 'koud',\n",
" 'kaars',\n",
" 'geeft',\n",
" 'licht',\n",
" 'paard',\n",
" 'tegen',\n",
" 'zwaluwen',\n",
" 'kaas',\n",
" 'motor',\n",
" 'dag',\n",
" 'avond',\n",
" 'jongetje',\n",
" 'barst',\n",
" 'brief',\n",
" 'hart',\n",
" 'spannen',\n",
" 'nieuwe',\n",
" 'kar',\n",
" 'zoon',\n",
" 'koning',\n",
" 'ook',\n",
" 'geweest',\n",
" 'rozen',\n",
" 'lange',\n",
" 'woord',\n",
" 'kindje',\n",
" 'was',\n",
" 'dochtertje',\n",
" 'bos',\n",
" 'ladder',\n",
" 'mond',\n",
" 'droog',\n",
" 'dorst',\n",
" 'weg',\n",
" 'krom',\n",
" 'liedje',\n",
" 'goed',\n",
" 'kelder',\n",
" 'voor',\n",
" 'moest',\n",
" 'ossenbloed',\n",
" 'drinken',\n",
" 'broer',\n",
" 'moe',\n",
" 'karnemelk',\n",
" 'dun',\n",
" 'zuur',\n",
" 'put',\n",
" 'uur',\n",
" 'Italië',\n",
" 'bergen',\n",
" 'vuur',\n",
" 'spuwen',\n",
" 'duwen',\n",
" 'hebben',\n",
" 'stuk',\n",
" 'brug',\n",
" 'veulen',\n",
" 'komen',\n",
" 'deur',\n",
" 'naaien',\n",
" 'gras',\n",
" 'brouwer',\n",
" 'bakken',\n",
" 'je',\n",
" 'eieren',\n",
" 'krijgen',\n",
" 'markt',\n",
" 'waren',\n",
" 'vijf',\n",
" 'eikels',\n",
" 'hooi',\n",
" 'is',\n",
" 'groen',\n",
" 'boompje',\n",
" 'wijn',\n",
" 'huis',\n",
" 'melk',\n",
" 'spuit',\n",
" 'koe',\n",
" 'koster',\n",
" 'kruiwagen',\n",
" 'buigen',\n",
" 'Duitsers',\n",
" 'blauw',\n",
" 'geslagen',\n",
" 'saus',\n",
" 'flauw',\n",
" 'sneeuw',\n",
" 'stad',\n",
" 'doen',\n",
" 'dopen',\n",
" 'doopvont',\n",
" 'soldaten',\n",
" 'dorsen',\n",
" 'binden',\n",
" 'gebonden\\n']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table[0].split('\\t')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['West-Terschelling',\n",
" 'kipən',\n",
" 'miŋ',\n",
" 'kɑ̟mərɑ̟ːt',\n",
" 'blʊmə',\n",
" 'spɪnə',\n",
" 'məsinəs / məʃinəs',\n",
" 'ʋɔrə̆k',\n",
" 'ʊp̬',\n",
" 'sxɪp',\n",
" 'kreːɣə̃ / krɪɣə̃',\n",
" 'fəsxɪməlt / bəsxɪməlt',\n",
" 'bro̝ˑə̆t',\n",
" 'tɪmərmɑn',\n",
" 'splɪntər',\n",
" 'fɪŋər',\n",
" 'fəbrik',\n",
" 'fjɔŭwər',\n",
" 'biˑə̆r',\n",
" 'twɑ̟',\n",
" 'treːĭjə',\n",
" 'hɛĭ',\n",
" 'knʏpəl / ɛinhɔŭt',\n",
" 'ɪk',\n",
" 'knɪbəl',\n",
" 'siˑə̆n',\n",
" 'rɑ̟ːɣəbɔl',\n",
" 'pɛt',\n",
" 'pɑ̟dəstuˑə̆l',\n",
" 'mɑ̟n',\n",
" 'breːdə / breːjə',\n",
" 'stɪː',\n",
" 'breːdər',\n",
" 'breːstə',\n",
" 'stɔndbeːlt',\n",
" 'dyvəl',\n",
" 'blɔŭn',\n",
" 'meːstər',\n",
" 'seˑ',\n",
" 'xrɑːx',\n",
" 'ətĩˑsĩkeːl',\n",
" 'stɛːl',\n",
" 'biːzəm',\n",
" 'neː',\n",
" 'rɔft',\n",
" 'pɔˑə̆r',\n",
" 'rip',\n",
" 'jɪlt',\n",
" 'fiˑə̆r',\n",
" 'brɪŋə',\n",
" 'ʋiːf',\n",
" 'swʊmə',\n",
" 'stɛrk',\n",
" 'bɛˑə̆t',\n",
" 'ʊptɪlən',\n",
" '',\n",
" 'sprɪŋə',\n",
" '',\n",
" 'tɔ̞ˑə̆',\n",
" 'sɛks',\n",
" 'jiə̆r',\n",
" 'sxuˑəl',\n",
" 'lɪtn̩̆',\n",
" 'xeˑə̆̃',\n",
" 'ʋɛtər',\n",
" 'pɔtn̩̆',\n",
" 'bɪn',\n",
" 'fʊlə',\n",
" 'mɑˑə̆t',\n",
" 'nɔx',\n",
" 'kɔˑə̆t',\n",
" 'kɛs',\n",
" 'jʊxt',\n",
" 'jɛxt',\n",
" 'hoĭsʲ',\n",
" 'tsjɪ',\n",
" 'swɑːlywə',\n",
" 'tsiːs / tsjiːs',\n",
" 'moˑtər',\n",
" 'dɛĭ',\n",
" 'ioŋ',\n",
" 'jʊŋkjə',\n",
" 'bœ̝st',\n",
" 'briːf',\n",
" 'hɔĭtʲ',\n",
" 'spɔnə',\n",
" 'niˑjə',\n",
" 'kɑ̟rə',\n",
" 'sɪn',\n",
" 'koˑə̆nɪŋ',\n",
" 'eˑə̆k',\n",
" 'ʋɛn',\n",
" 'roˑə̆zən',\n",
" 'lɑ̟ŋə',\n",
" 'ʋɔĭtʲ',\n",
" 'bɔ̞nʲ',\n",
" 'ʋɑ̟z',\n",
" 'fɑ̟mkə / dɔxtərtsə',\n",
" 'bʊs',\n",
" 'leˑə̆rt',\n",
" 'mylə',\n",
" 'drux',\n",
" 'toˑə̆st',\n",
" 'ʋɛĭ',\n",
" '',\n",
" 'fɛsjə',\n",
" 'xuˑət',\n",
" 'kɛldər',\n",
" 'fŭɑ̟r',\n",
" 'mɔs',\n",
" 'ɔsəbluˑət̬',\n",
" 'drɪŋkə',\n",
" 'bruər',\n",
" 'muˑə̆t',\n",
" 'suˑp / kɑrnəmoˑə̆lək',\n",
" 'tɪn',\n",
" 'suːr',\n",
" 'pʏt',\n",
" 'uːr',\n",
" 'itɑ̟ːljə',\n",
" 'bɑ̟rɣən',\n",
" 'fjuːr',\n",
" 'spiˑə̆n',\n",
" 'drʏkən',\n",
" 'hɑ̟bə',\n",
" 'stɪk',\n",
" 'brʏx',\n",
" 'fɔlʲtsə',\n",
" 'kʊmə',\n",
" 'doˑə̆r',\n",
" 'nɑːĭjə',\n",
" 'xɛs',\n",
" 'brɔŭwər',\n",
" 'bɑ̟kə',\n",
" 'do̞',\n",
" 'ɑːə̆jən',\n",
" 'kriˑjə',\n",
" 'mɑ̟rək / mɑrəkt',\n",
" 'wɑːrə̃',\n",
" 'fiːf',\n",
" 'ɛikəls',\n",
" 'heˑə̆',\n",
" 'ɪz',\n",
" 'xriˑə̆n',\n",
" 'boːmkə',\n",
" 'ʋin',\n",
" 'hyːs',\n",
" 'mʊə̆lək',\n",
" 'spœĭt',\n",
" 'ku',\n",
" 'kɔstər',\n",
" 'krødʋɛin',\n",
" 'buːɣə',\n",
" 'dytsərs',\n",
" 'blɑːŭ',\n",
" 'slɛin',\n",
" 'sjy',\n",
" '',\n",
" 'sneː',\n",
" 'stɑ̟t',\n",
" 'dweˑə̆n',\n",
" 'doːpə',\n",
" 'doːpfʊnt',\n",
" 'sɔldɑːtən',\n",
" 'tɛskjə',\n",
" 'binə',\n",
" 'boŋ\\n']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table[1].split('\\t')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,433 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pronunciation-based location prediction confusion\n",
"\n",
"Setup a pandas dataframe with in each row\n",
"\n",
" * participant provided (actual) location,\n",
" * 3 estimations made by Nanna's heuristic based in what the participant stated to be the correct pronunciation of a word\n",
" * distance between the actual and heuristic predicted location\n",
" \n",
"Averages of the distances are exported for visualisation in QGIS."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import pandas\n",
"import MySQLdb\n",
"import numpy\n",
"import itertools\n",
"import requests\n",
"import json\n",
"from vincenty import vincenty\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen')\n",
"\n",
"%matplotlib inline\n",
"from matplotlib import pyplot, rcParams\n",
"from jupyter_progressbar import ProgressBar\n",
"\n",
"# rcParams['font.family'] = 'Lucinda Console'\n",
"rcParams['font.size'] = '24'\n",
"rcParams['figure.figsize'] = (20, 10)\n",
"rcParams['figure.dpi'] = 100"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def simplify_area_name(x):\n",
" return ' '.join(\n",
" x.split('/') # split Dutch and Frysian name\n",
" [0] # extract Dutch name\n",
" .strip()\n",
" .split(' ') # Split area name from province, mostly 'Fr'\n",
" [:-1] # remove province\n",
" ).strip().lower() # rejoin spaces in area name"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"metadata = pandas.read_sql('''SELECT answer.* FROM core_surveyresultquestionanswer as answer''', db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"grouped = metadata.groupby(['survey_result_id', 'question_id']).agg({\n",
" 'question_text': 'first',\n",
" 'answer_text': lambda x: x if len(x) == 1 else ', '.join(x)\n",
"})\n",
"grouped.reset_index(inplace=True)\n",
"\n",
"grouped = grouped.pivot(index='survey_result_id', columns='question_text', values='answer_text')\n",
"\n",
"grouped = grouped.rename({\n",
" 'Do you go to school?': 'school',\n",
" 'Do you go to university?': 'university',\n",
" 'What is your age bracket?': 'age_bracket',\n",
" 'What is your age?': 'age',\n",
" 'What is your gender?': 'gender',\n",
" 'Which language are you the most proficient in?': 'language',\n",
" 'Which languages do you actively use in your life?': 'active-languages'\n",
"}, axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"predictions = pandas.read_sql('''\n",
"SELECT \n",
" sr.id as id,\n",
" sr.area_name as actual_area,\n",
" area1_name as area_prediction_1,\n",
" area2_name as area_prediction_2,\n",
" area3_name as area_prediction_3\n",
"FROM core_surveyresult as sr\n",
"INNER JOIN core_predictionquizresult as pq\n",
" ON sr.id = pq.survey_result_id\n",
"''', db)\n",
"\n",
"predicted_areas = set(map(simplify_area_name,\n",
" set(predictions['area_prediction_1']) |\n",
" set(predictions['area_prediction_2']) |\n",
" set(predictions['area_prediction_3'])\n",
"))\n",
"actual_areas = set(map(str.lower, predictions['actual_area']))\n",
"\n",
"areas = list(predicted_areas | actual_areas)\n",
"location_to_number = {l: i for i, l in enumerate(areas)}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"simplified_predictions = pandas.DataFrame({\n",
" 'id': list(predictions['id']),\n",
" 'actual': list(map(str.lower, predictions['actual_area'])),\n",
" 'prediction_1': list(map(simplify_area_name, predictions['area_prediction_1'])),\n",
" 'prediction_2': list(map(simplify_area_name, predictions['area_prediction_2'])),\n",
" 'prediction_3': list(map(simplify_area_name, predictions['area_prediction_3'])),\n",
"})\n",
"# simplified_predictions.set_index('id')\n",
"simplified_predictions.to_excel('actual-predictions.xls')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"locations = {location for c in simplified_predictions.columns for location in simplified_predictions[c] if c != 'id'}"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"names = pandas.read_csv('plaatsen_nl.csv')\n",
"\n",
"nonominatim = {\n",
" name: [row['st_y'], row['x']]\n",
" for _, row in names.iterrows()\n",
" for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']\n",
"# for _ in [ print(row[column]) ]\n",
" if type(row[column]) == str\n",
" for name in [row[column], row[column].lower().replace('-', ' ')]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "21c016f24e23473e807ed3e9c2d942c6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"nominatim = {\n",
" l: json.loads(\n",
" requests.get(\n",
" 'https://nominatim.openstreetmap.org/search.php?q=Netherlands%20'\n",
" '{}&polygon_geojson=1&viewbox=&format=json'.format(l)\n",
" ).text\n",
" )\n",
" for l in ProgressBar(locations)\n",
" if l not in nonominatim\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"latlons = {\n",
" l: (float(v[0]['lat']), float(v[0]['lon']))\n",
" for l, v in nominatim.items()\n",
" if len(v) > 0\n",
"}\n",
"latlons.update(nonominatim)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"for c in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:\n",
" simplified_predictions['{}_latlon'.format(c)] = [\n",
" latlons.get(l, numpy.nan)\n",
" for l in simplified_predictions['{}'.format(c)]\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"for c in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:\n",
" simplified_predictions['{}_distance'.format(c)] = [\n",
" vincenty(x, y) if x == x and y == y else numpy.nan\n",
" for x, y in zip(simplified_predictions['actual_latlon'], simplified_predictions[c])\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"simplified_predictions = simplified_predictions[[\n",
" 'id', 'actual', 'actual_latlon', 'prediction_3_latlon_distance',\n",
" 'prediction_1_latlon_distance', 'prediction_2_latlon_distance'\n",
"]]\n",
"\n",
"simplified_predictions = simplified_predictions.rename({\n",
" 'prediction_3_latlon_distance': 'distance3',\n",
" 'prediction_1_latlon_distance': 'distance1',\n",
" 'prediction_2_latlon_distance': 'distance2'\n",
"}, axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"simplified_predictions = simplified_predictions.join(grouped, on='id')"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"simplified_predictions['latitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[0] if x == x else None)\n",
"simplified_predictions['longitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[1] if x == x else None)\n",
"simplified_predictions = simplified_predictions.drop('actual_latlon', axis='columns')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"simplified_predictions['age_groups'] = [\n",
" {'0-10': '0-20', '11-20': '0-20',\n",
" '21-30': '21-50', '31-40': '21-50', '41-50': '21-50',\n",
" '51-60': '51-100', '61-70': '51-100', '71-80': '51-100', '81-90': '51-100', '91-100': '51-100'}.get(b, None)\n",
" for b in simplified_predictions['age_bracket']\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({\n",
"# 'distance1': ['mean', 'min', 'max', 'count', 'size'],\n",
"# 'latitude': 'first',\n",
"# 'longitude': 'first'\n",
"# })\n",
"# age_groups.index.get_level_values('age_groups')"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({\n",
"# 'distance1': ['min', 'mean', 'max', 'count', 'size'],\n",
"# 'latitude': 'first',\n",
"# 'longitude': 'first'\n",
"# })\n",
"# gender_groups"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"summary = simplified_predictions[['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"summary.to_csv('points.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"geojson = {\n",
" \"type\": \"FeatureCollection\",\n",
" \"features\": [\n",
" {\n",
" \"type\": \"Feature\",\n",
" \"properties\": {\n",
" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
" \"actual\": actual\n",
" },\n",
" \"geometry\": {\n",
" \"type\": \"Point\",\n",
" \"coordinates\": list( actual_lat_lon )[::-1]\n",
" }\n",
" }\n",
" for actual, row in simplified_predictions.groupby('actual')\n",
" if actual != ''\n",
"# for _ in [ print(row['actual_latlon']), print() ]\n",
" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",
" if actual_lat_lon == actual_lat_lon\n",
" ]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"geojson = {\n",
" \"type\": \"FeatureCollection\",\n",
" \"features\": [\n",
" {\n",
" \"type\": \"Feature\",\n",
" \"properties\": {\n",
" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
" \"actual\": actual\n",
" },\n",
" \"geometry\": {\n",
" \"type\": \"Point\",\n",
" \"coordinates\": list( actual_lat_lon )[::-1]\n",
" }\n",
" }\n",
" for actual, row in simplified_predictions.groupby('actual')\n",
" if actual != ''\n",
"# for _ in [ print(row['actual_latlon']), print() ]\n",
" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",
" if actual_lat_lon == actual_lat_lon\n",
" ]\n",
"}"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -1,293 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation statistics"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"import MySQLdb\n",
"import numpy\n",
"import json\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"%matplotlib notebook\n",
"from matplotlib import pyplot\n",
"import folium\n",
"from IPython.display import display\n",
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
"from jsbutton import JsButton\n",
"from jupyter_progressbar import ProgressBar\n",
"from collections import defaultdict\n",
"from ipy_table import make_table\n",
"from html import escape\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import LogNorm\n",
"from sklearn import mixture\n",
"from skimage.measure import find_contours\n",
"from collections import Counter\n",
"from random import shuffle"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Borders of Frysian municipalities\n",
"\n",
"with open('Friesland_AL8.GeoJson') as f:\n",
" gemeentes = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"coords = [feature['geometry'] for feature in gemeentes['features']]\n",
"coords_folium = [[[[c__[::-1] for c__ in c_] for c_ in c] for c in coords_['coordinates']] for coords_ in coords]\n",
"shapes = [shape(coords_) for coords_ in coords]\n",
"gemeente_names = [feature['properties']['name'] for feature in gemeentes['features']]\n",
"\n",
"def get_gemeente(point):\n",
" for i, shape in enumerate(shapes):\n",
" if shape.contains(point):\n",
" return i\n",
" return -1"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced.\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"zero_latlng_questions = {\n",
" q\n",
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
"}\n",
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
}
],
"source": [
"# Takes approximately 2 minutes\n",
"\n",
"gemeente_map = {\n",
" (lng, lat): get_gemeente(Point(lng, lat))\n",
" for lng, lat in set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
"}\n",
"\n",
"answers_filtered['gemeente'] = [\n",
" gemeente_map[(lng, lat)]\n",
" for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" \n"
]
}
],
"source": [
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e48eb24f5c43434bad4241d4bea53074",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cmap = pyplot.get_cmap('YlOrRd')\n",
"\n",
"for question, rows in ProgressBar(\n",
" answers_filtered.groupby('question_text_url'),\n",
" size=len(answers_filtered['question_text_url'].unique())\n",
"):\n",
" m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles=None, zoom_start=9)\n",
" pecentage_labels = folium.FeatureGroup(name='pecentages', overlay=True)\n",
" order = [a for _, a in sorted((\n",
" (r['user_lat'], answer)\n",
" for answer, r in rows.groupby('answer_text').count().iterrows()\n",
" ), reverse=True)]\n",
" gemeente_normalizer = {\n",
" gemeente: r['user_lat']\n",
" for gemeente, r in rows.groupby('gemeente').count().iterrows()\n",
" }\n",
" for answer_text in order:\n",
" rows_ = rows[rows['answer_text'] == answer_text]\n",
" if (rows_['gemeente'] >= 0).sum() <= 0:\n",
" continue\n",
"\n",
" spread = {\n",
" gemeente: r['user_lat']\n",
" for gemeente, r in rows_.groupby('gemeente').count().iterrows()\n",
" if gemeente >= 0\n",
" }\n",
" n_answers = sum(spread.values())\n",
" \n",
" name = '{} ({})'.format(answer_text, n_answers)\n",
" group = folium.FeatureGroup(name=name, overlay=False)\n",
" folium.TileLayer(tiles='stamentoner').add_to(group)\n",
" \n",
" max_value = max(value / gemeente_normalizer[gemeente] for gemeente, value in spread.items())\n",
" for gemeente, gemeente_name in enumerate(gemeente_names):\n",
" if gemeente in spread:\n",
" value = spread[gemeente]\n",
" percentage = value / gemeente_normalizer[gemeente]\n",
" color_value = percentage / max_value\n",
" color = '#%02x%02x%02x' % tuple(int(255 * c) for c in cmap(color_value)[:3])\n",
" \n",
" polygon = folium.Polygon(coords_folium[gemeente], fill_color=color, fill_opacity=0.8,\n",
" color='#555555', popup='{} ({}, {}%)'.format(gemeente_name, value, round(100*percentage)))\n",
" centroid = shapes[gemeente].centroid\n",
" centroid = (centroid.y, centroid.x)\n",
"# folium.Circle(centroid, color='green', radius=200).add_to(group)\n",
" folium.map.Marker(\n",
" [shapes[gemeente].centroid.y, shapes[gemeente].centroid.x],\n",
" icon=folium.DivIcon(\n",
" icon_size=(50, 24),\n",
" icon_anchor=(25, 12),\n",
" html='<div class=\"percentage-label\" style=\"font-size: 12pt; background-color: rgba(255,255,255,0.8); border-radius: 12px; text-align: center;\">{:d}%</div>'.format(int(100 * percentage)),\n",
" )\n",
" ).add_to(group)\n",
" else:\n",
" polygon = folium.Polygon(coords_folium[gemeente], fill_color=None, fill_opacity=0, color='#555555')\n",
" polygon.add_to(group)\n",
" group.add_to(m)\n",
" pecentage_labels.add_to(m)\n",
" folium.map.LayerControl('topright', collapsed=False).add_to(m)\n",
" JsButton(\n",
" title='<i class=\"fas fa-tags\"></i>',\n",
" function=\"\"\"\n",
" function(btn, map){\n",
" $('.percentage-label').toggle();\n",
" }\n",
" \"\"\"\n",
" ).add_to(m)\n",
"# display(m)\n",
" m.save('maps/heatmaps/{}.html'.format(question))\n",
"# break"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"with open('maps/heatmaps/index.html', 'w') as f:\n",
" f.write('<html><head></head><body>' + \n",
" '<br/>\\n'.join(\n",
" '\\t<a href=\"http://herbertkruitbosch.com/pronunciation_maps/{}\">{}<a>'.format(fn[5:], fn[14:-5].replace('_', ' '))\n",
" for fn in sorted(\n",
" glob.glob('maps/heatmaps/*.html')\n",
" )\n",
" ) + \"</body></html>\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,312 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation statistics"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"import MySQLdb\n",
"import numpy\n",
"import json\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"%matplotlib notebook\n",
"from matplotlib import pyplot\n",
"import folium\n",
"from IPython.display import display\n",
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
"from jsbutton import JsButton\n",
"from shapely.geometry import LineString, MultiLineString\n",
"from jupyter_progressbar import ProgressBar\n",
"from collections import defaultdict, Counter\n",
"from ipy_table import make_table\n",
"from html import escape\n",
"\n",
"import numpy as np\n",
"from random import shuffle\n",
"import pickle\n",
"from jupyter_progressbar import ProgressBar"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"with open('friesland_wijken.p3', 'rb') as f:\n",
" wijken, wijk_shapes = pickle.load(f)\n",
"\n",
"wijk_names = [wijk['properties']['GM_NAAM'] + ', ' + wijk['properties'].get('WK_NAAM', '') for wijk in wijken['features']]\n",
"\n",
"def get_wijk(point):\n",
" for i, shape in enumerate(wijk_shapes):\n",
" if shape.contains(point):\n",
" return i\n",
" return -1"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"def listify(rd_multipolygon):\n",
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
" return list(rd_multipolygon)\n",
" return [\n",
" listify(element)\n",
" for element in rd_multipolygon\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced.\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"zero_latlng_questions = {\n",
" q\n",
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
"}\n",
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"def reverse(rd_multipolygon):\n",
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
" return rd_multipolygon[::-1]\n",
" return [\n",
" reverse(element)\n",
" for element in rd_multipolygon\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
}
],
"source": [
"# Takes approximately 2 minutes\n",
"points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
"\n",
"wijk_map = dict()\n",
"for lng, lat in points:\n",
" wijk_map[(lng, lat)] = get_wijk(Point(lng, lat))\n",
"\n",
"answers_filtered['wijk'] = [\n",
" wijk_map[(lng, lat)]\n",
" for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" \n"
]
}
],
"source": [
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ea89078b81da4daba82bcd4b1ddbe8c2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cmap = pyplot.get_cmap('YlOrRd')\n",
"\n",
"for question, rows in ProgressBar(\n",
" answers_filtered.groupby('question_text_url'),\n",
" size=len(answers_filtered['question_text_url'].unique())\n",
"):\n",
" m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles=None, zoom_start=9)\n",
" order = [a for _, a in sorted((\n",
" (r['user_lat'], answer)\n",
" for answer, r in rows.groupby('answer_text').count().iterrows()\n",
" ), reverse=True)]\n",
" wijk_normalizer = {\n",
" wijk: r['user_lat']\n",
" for wijk, r in rows.groupby('wijk').count().iterrows()\n",
" }\n",
" for answer_text in (order):\n",
" rows_ = rows[rows['answer_text'] == answer_text]\n",
" if (rows_['wijk'] >= 0).sum() <= 0:\n",
" continue\n",
"\n",
" spread = {\n",
" wijk: r['user_lat']\n",
" for wijk, r in rows_.groupby('wijk').count().iterrows()\n",
" if wijk >= 0\n",
" }\n",
" n_answers = sum(spread.values())\n",
" \n",
" name = '{} ({})'.format(answer_text, n_answers)\n",
" group = folium.FeatureGroup(name=name, overlay=False)\n",
" folium.TileLayer(tiles='stamentoner').add_to(group)\n",
" \n",
" max_value = max(value / wijk_normalizer[wijk] for wijk, value in spread.items())\n",
" \n",
" for wijk, wijk_name in enumerate(wijk_names):\n",
" coordinates = reverse(wijken['features'][wijk]['geometry']['coordinates'])\n",
" if wijk in spread:\n",
" value = spread[wijk]\n",
" percentage = value / wijk_normalizer[wijk]\n",
" color_value = percentage / max_value\n",
" color = '#%02x%02x%02x' % tuple(int(255 * c) for c in cmap(color_value)[:3])\n",
" \n",
" polygon = folium.Polygon(\n",
" coordinates, fill_color=color, fill_opacity=0.8,\n",
" color='#555555', popup='{} ({}, {: 3d}%)'.format(wijk_name, value, int(100*percentage))\n",
" \n",
" )\n",
" centroid = wijk_shapes[wijk].centroid\n",
" centroid = (centroid.y, centroid.x)\n",
" folium.map.Marker(\n",
" [wijk_shapes[wijk].centroid.y, wijk_shapes[wijk].centroid.x],\n",
" icon=folium.DivIcon(\n",
" icon_size=(30, 16),\n",
" icon_anchor=(15, 8),\n",
" html='<div class=\"percentage-label\" style=\"font-size: 8pt; background-color: rgba(255,255,255,0.8); border-radius: 4px; text-align: center;\">{:d}%</div>'.format(int(100 * percentage)),\n",
" )\n",
" ).add_to(group)\n",
" else:\n",
" polygon = folium.Polygon(coordinates, fill_color=None, fill_opacity=0, color='#555555')\n",
" polygon.add_to(group)\n",
" group.add_to(m)\n",
" JsButton(\n",
" title='<i class=\"fas fa-tags\"></i>',\n",
" function=\"\"\"\n",
" function(btn, map){\n",
" $('.percentage-label').toggle();\n",
" }\n",
" \"\"\"\n",
" ).add_to(m)\n",
" folium.map.LayerControl('topright', collapsed=False).add_to(m)\n",
"# display(m)\n",
" m.save('maps/heatmaps-wijk/{}.html'.format(question))\n",
"# break"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"with open('maps/heatmaps-wijk/index.html', 'w') as f:\n",
" f.write('<html><head></head><body>' + \n",
" '<br/>\\n'.join(\n",
" '\\t<a href=\"{}\">{}<a>'.format(fn, fn[:-5].replace('_', ' '))\n",
" for fn in sorted(\n",
" glob.glob('maps/heatmaps-wijk/*.html')\n",
" )\n",
" for fn in [fn[len('maps/heatmaps-wijk/'):]]\n",
" ) + \"</body></html>\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,327 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"import MySQLdb\n",
"import numpy\n",
"import json\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"%matplotlib notebook\n",
"from matplotlib import pyplot\n",
"import folium\n",
"from IPython.display import display\n",
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
"from jupyter_progressbar import ProgressBar\n",
"from collections import defaultdict\n",
"from ipy_table import make_table\n",
"from html import escape\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.colors import LogNorm\n",
"from sklearn import mixture\n",
"from skimage.measure import find_contours\n",
"from collections import Counter\n",
"from random import shuffle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Borders of Frysian municipalities\n",
"\n",
"with open('Friesland_AL8.GeoJson') as f:\n",
" gemeentes = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"shapes = [shape(feature['geometry']) for feature in gemeentes['features']]\n",
"gemeente_names = [feature['properties']['name'] for feature in gemeentes['features']]\n",
"\n",
"def get_gemeente(point):\n",
" for i, shape in enumerate(shapes):\n",
" if shape.contains(point):\n",
" return i\n",
" return -1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced.\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Takes approximately 2 minutes\n",
"\n",
"gemeente_map = {\n",
" (lng, lat): get_gemeente(Point(lng, lat))\n",
" for lng, lat in set(zip(answers['user_lng'], answers['user_lat']))\n",
"}\n",
"\n",
"answers['gemeente'] = [\n",
" gemeente_map[(lng, lat)]\n",
" for lat, lng in zip(answers['user_lat'], answers['user_lng'])\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Mapping pronunciations\n",
"\n",
"The idea is to plot each pronunciation as a point of a different color, now only seems to show participation density.\n",
"\n",
"Slow, so started with the first question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# cmap = pyplot.get_cmap('gist_rainbow')\n",
"\n",
"# std = (1.89, 1.35)\n",
"\n",
"# for _, (question, rows) in zip(range(3), answers.groupby('question_text')):\n",
"# plt.figure()\n",
"# n_answers = len(rows.groupby('answer_text').count())\n",
"# colors = cmap(range(256))[::256 // n_answers]\n",
"# for (answer, rows_), color in zip(rows.groupby('answer_text'), colors):\n",
"# if len(rows_) < 100:\n",
"# continue\n",
"# color = '#%02x%02x%02x' % tuple(int(c*255) for c in color[:3])\n",
"# X = rows_[['user_lat', 'user_lng']].as_matrix()\n",
"\n",
"# clf = mixture.GaussianMixture(n_components=5, covariance_type='full')\n",
"# clf.fit(X)\n",
"# xlim = numpy.percentile(X[:, 0], [1, 99.5])\n",
"# ylim = numpy.percentile(X[:, 1], [1, 99.5])\n",
"# xlim = [2*xlim[0] - xlim[1], 2*xlim[1] - xlim[0]]\n",
"# ylim = [2*ylim[0] - ylim[1], 2*ylim[1] - ylim[0]]\n",
" \n",
"# x = np.linspace(*xlim, 1000)\n",
"# y = np.linspace(*ylim, 1000)\n",
"# xx, yy = np.meshgrid(x, y)\n",
"# xxyy = np.array([xx.ravel(), yy.ravel()]).T\n",
"# z = np.exp(clf.score_samples(xxyy))\n",
"# z = z.reshape(xx.shape)\n",
" \n",
"# z_sorted = sorted(z.ravel(), reverse=True)\n",
"# z_sorted_cumsum = np.cumsum(z_sorted)\n",
"# split = np.where(z_sorted_cumsum > (z_sorted_cumsum[-1] * 0.5))[0][0]\n",
"# threshold = z_sorted[split]\n",
"# threshold\n",
"\n",
"# # p = list(range(0, 100, 5))\n",
"\n",
"# p = [80]\n",
"# plt.contour(xx, yy, z, levels=[threshold], colors=[color])\n",
"# plt.plot(X[:, 0], X[:, 1], '.', c=color)\n",
"# plt.xlim(*xlim)\n",
"# plt.ylim(*ylim)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"zero_latlng_questions = {\n",
" q\n",
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
"}\n",
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
" lambda x: x.replace('\"', '').replace('*', ''))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_palette(n, no_black=True, no_white=True):\n",
" with open('glasbey/{}_colors.txt'.format(n + no_black + no_white)) as f:\n",
" return [\n",
" '#%02x%02x%02x' % tuple(int(c) for c in line.replace('\\n', '').split(','))\n",
" for line in f\n",
" if not no_black or line != '0,0,0\\n'\n",
" if not no_white or line != '255,255,255\\n'\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"options = [x[1] for x in sorted([\n",
" (row['user_lng'], answer_text)\n",
" for answer_text, row in rows.groupby('answer_text').agg({'user_lng': 'count'}).iterrows()\n",
"], reverse=True)]\n",
"\n",
"groups = [options[:len(options) // 2], options[len(options) // 2:]]\n",
"groups"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"80000 / 350"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"with open('index.html', 'w') as f:\n",
" f.write('<html><head></head><body>' + \n",
" '<br/>\\n'.join(\n",
" '\\t<a href=\"http://herbertkruitbosch.com/pronunciation_maps/{}\">{}<a>'.format(fn, fn[:-4].replace('_', ' '))\n",
" for fn in sorted(\n",
" glob.glob('*_all.html') +\n",
" glob.glob('*_larger.html') +\n",
" glob.glob('*_smaller.html')\n",
" )\n",
" ) + \"</body></html>\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# cmap = pyplot.get_cmap('gist_rainbow')\n",
"# colors = pyplot.get_cmap('tab20')\n",
"# colors = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#008080', '#e6beff', '#aa6e28', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000080', '#808080']\n",
"\n",
"std = (1.89, 1.35)\n",
"\n",
"for question, rows in answers_filtered.groupby('question_text_url'):\n",
"# question = rows['question_text_url'][0]\n",
" n_answers = len(rows.groupby('answer_text').count())\n",
" \n",
" \n",
" options = [x[1] for x in sorted([\n",
" (row['user_lng'], answer_text)\n",
" for answer_text, row in rows.groupby('answer_text').agg({'user_lng': 'count'}).iterrows()\n",
" ], reverse=True)]\n",
" groups = [options]\n",
" if n_answers > 6:\n",
" groups.extend([options[:6], options[6:]])\n",
" \n",
" for group, group_name in zip(groups, ['all', 'larger', 'smaller']):\n",
" m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles='stamentoner', zoom_start=9)\n",
" # colors = cmap(range(256))[::256 // n_answers]\n",
" colors = get_palette(len(group))\n",
" for answer, color in zip(group, colors):\n",
" rows_ = rows[rows['answer_text'] == answer]\n",
" # color = '#%02x%02x%02x' % tuple(int(c*255) for c in color[:3])\n",
" name = '<span style=\\\\\"color:{}; \\\\\">{} ({})'.format(color, escape(answer), len(rows_))\n",
"\n",
" group = folium.FeatureGroup(name=name)\n",
" colormap[name] = color\n",
"\n",
" for point in zip(rows_['user_lat'], rows_['user_lng']):\n",
" point = tuple(p + 0.01 * s * numpy.random.randn() for p, s in zip(point, std))\n",
" folium.Circle(\n",
" point, color=None, fill_color=color,\n",
" radius=400*min(1, 100 / len(rows_)), fill_opacity=1 #1 - 0.5 * len(rows_) / len(rows)\n",
" ).add_to(group)\n",
" group.add_to(m)\n",
" folium.map.LayerControl('topright', collapsed=False).add_to(m)\n",
" \n",
" print(group_name, question)\n",
" if group_name == 'larger':\n",
" display(m)\n",
" m.save('{}_{}.html'.format(question, group_name))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,397 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation statistics"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"import MySQLdb\n",
"import numpy\n",
"import json\n",
"\n",
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
"\n",
"%matplotlib notebook\n",
"from matplotlib import pyplot\n",
"import folium\n",
"from IPython.display import display\n",
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
"from jsbutton import JsButton\n",
"from shapely.geometry import LineString, MultiLineString\n",
"from jupyter_progressbar import ProgressBar\n",
"from collections import defaultdict, Counter\n",
"from ipy_table import make_table\n",
"from html import escape\n",
"\n",
"import numpy as np\n",
"from random import shuffle\n",
"import pickle\n",
"from jupyter_progressbar import ProgressBar"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"with open('friesland_wijken_land_only.p3', 'rb') as f:\n",
" wijken, wijk_shapes = pickle.load(f)\n",
"\n",
"for x in wijken['features']:\n",
" x['type'] = 'Feature'\n",
"\n",
"with open('friesland_wijken_geojson.json', 'w') as f:\n",
" wijken['features'] = wijken['features']\n",
" json.dump(wijken, f, indent=1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from osgeo import gdal, ogr\n",
"\n",
"srcDS = gdal.OpenEx('friesland_wijken_geojson.json')\n",
"ds = gdal.VectorTranslate('friesland_wijken_geojson.kml', srcDS, format='kml')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'k4luâ7mWBAgDSKhCVaysNdr TjeoE85JzëGúcM.,IRtp2-bLû69Un0wZF3Hv1iOfô'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"''.join({\n",
" c\n",
" for wijk in wijken['features']\n",
" for c in wijk['properties']['gemeente_en_wijk_naam']\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"with open('friesland_wijken_land_only.p3', 'rb') as f:\n",
" wijken, wijk_shapes = pickle.load(f)\n",
"\n",
"wijk_names = [wijk['properties']['gemeente_en_wijk_naam'] for wijk in wijken['features']]\n",
"\n",
"def get_wijk(point):\n",
" for i, shape in enumerate(wijk_shapes):\n",
" if shape.contains(point):\n",
" return i\n",
" return -1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def listify(rd_multipolygon):\n",
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
" return list(rd_multipolygon)\n",
" return [\n",
" listify(element)\n",
" for element in rd_multipolygon\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Answers to how participants state a word should be pronounced.\n",
"\n",
"answers = pandas.read_sql('''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"''', db)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"zero_latlng_questions = {\n",
" q\n",
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
"}\n",
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def reverse(rd_multipolygon):\n",
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
" return rd_multipolygon[::-1]\n",
" return [\n",
" reverse(element)\n",
" for element in rd_multipolygon\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" # Remove the CWD from sys.path while we load stuff.\n"
]
}
],
"source": [
"# Takes approximately 2 minutes\n",
"points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
"\n",
"wijk_map = dict()\n",
"for lng, lat in points:\n",
" wijk_map[(lng, lat)] = get_wijk(Point(lng, lat))\n",
"\n",
"answers_filtered['wijk'] = [\n",
" wijk_map[(lng, lat)]\n",
" for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])\n",
"]"
]
},
{
 "cell_type": "code",
 "execution_count": 11,
 "metadata": {},
 "outputs": [],
 "source": [
  "# One .assign call instead of three slice assignments: avoids the three\n",
  "# SettingWithCopyWarnings; none of the new columns depends on another.\n",
  "answers_filtered = answers_filtered.assign(\n",
  "    # question text without quote/emphasis markup, safe for use in URLs\n",
  "    question_text_url=answers_filtered['question_text'].map(\n",
  "        lambda x: x.replace('\"', '').replace('*', '')),\n",
  "    wijk_name=answers_filtered['wijk'].map(\n",
  "        lambda x: wijk_names[x]),\n",
  "    # the text between the first '(' and the first ')'\n",
  "    answer_text_url=answers_filtered['answer_text'].map(\n",
  "        lambda x: x[x.find('('):x.find(')')][1:]))"
 ]
},
{
 "cell_type": "code",
 "execution_count": 12,
 "metadata": {},
 "outputs": [],
 "source": [
  "# One row per wijk: its name plus the centroid of its shape, used as the\n",
  "# representative coordinate for the Gabmap location file.\n",
  "records = [\n",
  "    {\n",
  "        '#name': name,\n",
  "        'longitude': shape.centroid.xy[0][0],\n",
  "        'latitude': shape.centroid.xy[1][0],\n",
  "    }\n",
  "    for name, shape in zip(wijk_names, wijk_shapes)\n",
  "]\n",
  "wijken = pandas.DataFrame(records).set_index('#name')"
 ]
},
{
 "cell_type": "code",
 "execution_count": 23,
 "metadata": {
  "scrolled": true
 },
 "outputs": [],
 "source": [
  "def merge_dicts(*args):\n",
  "    \"\"\"Return a new dict with all `args` merged; later keys win.\"\"\"\n",
  "    # Build a fresh dict instead of mutating args[0] in place, so callers'\n",
  "    # dict literals are never silently modified.\n",
  "    merged = {}\n",
  "    for arg in args:\n",
  "        merged.update(arg)\n",
  "    return merged\n",
  "\n",
  "\n",
  "# One row per wijk; one column per question, holding the list of\n",
  "# pronunciation variants with their counts.\n",
  "pronunciations = pandas.DataFrame([\n",
  "    merge_dicts(\n",
  "        {\n",
  "            question: answers['answer_text_url']\n",
  "            for question, answers in rows.groupby(\n",
  "                'question_text_url'\n",
  "            ).agg(\n",
  "                {\n",
  "                    'answer_text_url': lambda x: [\n",
  "                        {\n",
  "                            'pronunciation': answer_text,\n",
  "                            'count': answer_texts.count(answer_text)\n",
  "                        }\n",
  "                        for answer_texts in [list(x)]\n",
  "                        for answer_text in sorted(set(x))\n",
  "                    ]\n",
  "                }\n",
  "            ).iterrows()\n",
  "        },\n",
  "        {'wijk': wijk_names[wijk]})\n",
  "    for wijk, rows in answers_filtered.groupby('wijk')\n",
  "    if wijk >= 0  # negative wijk marks points outside every wijk shape\n",
  "])\n",
  "\n",
  "pronunciations.set_index('wijk', inplace=True)\n",
  "\n",
  "columns = list(pronunciations.columns)\n",
  "\n",
  "# Percentage of each pronunciation variant per wijk.\n",
  "counts = pandas.DataFrame([\n",
  "    merge_dicts({\n",
  "        column + \": \" + x['pronunciation']: 100 * x['count'] / total\n",
  "        for column in columns\n",
  "        for total in [sum(x['count'] for x in row[column])]\n",
  "        for x in row[column]\n",
  "    }, {'': wijk})\n",
  "    for wijk, row in pronunciations.iterrows()\n",
  "])\n",
  "\n",
  "# Collapse each variant list to a single ' / '-joined display string.\n",
  "pronunciations = pandas.DataFrame([\n",
  "    merge_dicts({\n",
  "        column: ' / '.join(str(x['pronunciation']) for x in row[column])\n",
  "        for column in columns\n",
  "    }, {'': wijk})\n",
  "    for wijk, row in pronunciations.iterrows()\n",
  "])\n",
  "\n",
  "pronunciations.set_index('', inplace=True)\n",
  "counts.set_index('', inplace=True)\n",
  "# Wijk/variant combinations that never occur are NaN; report them as 0%.\n",
  "# (Explicit fillna replaces the cryptic `counts[counts != counts] = 0`.)\n",
  "counts = counts.fillna(0)"
 ]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<function shapely.geometry.geo.shape(context)>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
 "# NOTE(review): leftover debug cell — the recorded output shows `shape`\n",
 "# resolves to the shapely.geometry.shape function here, not a computed\n",
 "# geometry; consider deleting this cell.\n",
 "shape"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"# Write the Gabmap input files: pronunciation variants per wijk, their\n",
"# percentages, and the centroid coordinates of each wijk.\n",
"pronunciations.to_csv('pronunciations_by_wijk.tsv', sep='\\t')\n",
"counts.to_csv('pronunciation_percentages_by_wijk.tsv', sep='\\t')\n",
"wijken.to_csv('wijk_centroid.tsv', sep='\\t', columns=['longitude', 'latitude'])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Read the just-written files back for a quick sanity check.\n",
"with open('pronunciations_by_wijk.tsv') as f:\n",
"    p = list(f)\n",
"\n",
"# Fixed filename: the percentages table is written as\n",
"# 'pronunciation_percentages_by_wijk.tsv' in the cell above; the old\n",
"# 'pronunciation_count_by_wijk.tsv' name would fail on a fresh run\n",
"# (it can only have existed as a leftover from an earlier version).\n",
"with open('pronunciation_percentages_by_wijk.tsv') as f:\n",
"    c = list(f)\n",
"\n",
"with open('wijk_centroid.tsv') as f:\n",
"    w = list(f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,14 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show province segmentation\n",
"\n",
"In gemeentes and wijken as calculated in `Segment Provinces in Wijken and Gemeentes.ipynb`."
]
},
{
"cell_type": "code",
"execution_count": 2,
@@ -34,7 +43,9 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",

File diff suppressed because one or more lines are too long

View File

@@ -1,102 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from glob import glob\n",
"\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Collect every wave filename under data/<location>/<date>/ and make sure\n",
"# no two recordings share a name before reorganising them.\n",
"waves = []\n",
"for location in os.listdir('data'):\n",
"    for date in os.listdir(os.path.join('data', location)):\n",
"        waves.extend(os.listdir(os.path.join('data', location, date)))\n",
"assert len(waves) == len(set(waves)), \"Not all filenames are unique :(\""
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Reorganise data/<location>/<date>/<wave> into per_word/<word>/<wave>,\n",
"# where the word is the second '_'-separated token of the filename.\n",
"# NOTE(review): destructive — os.rename moves the originals out of data/.\n",
"for location in os.listdir('data'):\n",
"    for date in os.listdir(os.path.join('data', location)):\n",
"        for wave in os.listdir(os.path.join('data', location, date)):\n",
"            source = os.path.join('data', location, date, wave)\n",
"            destination = os.path.join('per_word', wave.split('_')[1])\n",
"            if not os.path.isdir(destination):\n",
"                os.mkdir(destination)\n",
"            os.rename(source, os.path.join(destination, wave))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"import pandas\n",
"\n",
"# NOTE(review): hard-coded absolute path — breaks on any other machine;\n",
"# prefer a configurable data directory.\n",
"data = pandas.read_csv('/home/herbert/picture-game-result-export.csv', delimiter=';')\n",
"# Basename of the recording path/URL in the 'Opname' column.\n",
"data['Filename'] = [x.split('/')[-1] for x in data['Opname']]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"# Prefix recordings that are not in `relevant` with 'irrelevant_accent_'\n",
"# so they are set apart within each per_word/<word>/ directory.\n",
"# NOTE(review): `relevant` is not defined anywhere in this notebook —\n",
"# presumably built in a since-deleted cell; this cell fails with a\n",
"# NameError on a fresh Restart & Run All.\n",
"for word in os.listdir('per_word'):\n",
"    for wave in os.listdir(os.path.join('per_word', word)):\n",
"        source = os.path.join('per_word', word, wave)\n",
"        if wave not in relevant:\n",
"            destination = os.path.join('per_word', word, 'irrelevant_accent_' + wave)\n",
"            os.rename(source, destination)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"data.to_excel('/home/herbert/picture-game-result-export-filename.xlsx')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}