cleaned up gabmap file creation
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
83
notebooks/Gabmap Format.ipynb
Normal file
@@ -0,0 +1,83 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Gabmap format\n",
|
||||
"\n",
|
||||
"Exploration of the format of the lines in example Gabmap files Martijn had sent."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('../data/martijn_format/Dutch613-coordinates.txt') as f:\n",
|
||||
" coordinates = list(f)\n",
|
||||
" \n",
|
||||
"with open('../data/martijn_format/Nederlands-ipa.utxt') as f:\n",
|
||||
" table = list(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"coordinates[0].split('\\t')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"coordinates[1].split('\\t')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"table[0].split('\\t')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"table[1].split('\\t')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
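The cells above only split the raw lines on tabs. As a minimal parsing sketch, assuming the coordinates file holds one place per line as name, longitude and latitude with no header row, and the .utxt table a header row of words followed by one row of IPA transcriptions per place (these column names are assumptions, not taken from the files themselves):

import pandas

# Coordinates: one tab-separated place per line, e.g. "Aalsmeer NH\t4.76163\t52.2693"
coordinates = pandas.read_csv(
    '../data/martijn_format/Dutch613-coordinates.txt',
    sep='\t', names=['name', 'longitude', 'latitude'],  # assumes no header row
)

# Pronunciation table: first row lists the words, first column the place names,
# cells hold IPA transcriptions with variants separated by ' / '
table = pandas.read_csv(
    '../data/martijn_format/Nederlands-ipa.utxt',
    sep='\t', index_col=0,
)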
458
notebooks/Gabmap Pronunciation Tables, Simple Example.ipynb
Normal file
@@ -0,0 +1,458 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Geographical pronunciation tables, simple example\n",
|
||||
"\n",
|
||||
"Simple example to create gabmap files for two words with few pronunciations an two regions."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append('..')\n",
|
||||
"\n",
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import json\n",
|
||||
"import copy\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
|
||||
"\n",
|
||||
"from shapely.geometry import shape, Point\n",
|
||||
"\n",
|
||||
"from gabmap import create_gabmap_dataframes\n",
|
||||
"\n",
|
||||
"from stimmen.geojson import merge_features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('../data/Friesland_wijken.geojson') as f:\n",
|
||||
" regions = json.load(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load and simplify"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Answers to how participants state a word should be pronounced\n",
|
||||
"\n",
|
||||
"answers = pandas.read_sql('''\n",
|
||||
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
|
||||
" ON result.id = answer.prediction_quiz_id\n",
|
||||
"''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"regions_simple = merge_features(copy.deepcopy(regions),\n",
|
||||
" condition=lambda feature: feature['properties']['GM_NAAM'] == 'Heerenveen',\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"regions_simple = merge_features(\n",
|
||||
" regions_simple,\n",
|
||||
" condition=lambda feature: feature['properties']['GM_NAAM'] == 'Leeuwarden',\n",
|
||||
")\n",
|
||||
"regions_simple['features'] = regions_simple['features'][-2:]\n",
|
||||
"\n",
|
||||
"regions_simple['features'][0]['properties']['name'] = 'Heerenveen'\n",
|
||||
"regions_simple['features'][1]['properties']['name'] = 'Leeuwarden'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"answers_simple = answers[\n",
|
||||
" (answers['question_text'] == '\"blad\" (aan een boom)') |\n",
|
||||
" (answers['question_text'] == '\"vis\"')\n",
|
||||
"].copy()\n",
|
||||
"\n",
|
||||
"answers_simple['question_text'] = answers_simple['question_text'].map(\n",
|
||||
" lambda x: x.replace('\"', '').replace('*', ''))\n",
|
||||
"\n",
|
||||
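"# Keep only the text between the first '(' and ')' of the answer\n",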
"answers_simple['answer_text'] = answers_simple['answer_text'].map(\n",
|
||||
" lambda x: x[x.find('('):x.find(')')][1:])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Two words, boom and vis, with each 4 and 2 pronunciations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>answer_text</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>question_text</th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>blad (aan een boom)</th>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>vis</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" answer_text\n",
|
||||
"question_text \n",
|
||||
"blad (aan een boom) 4\n",
|
||||
"vis 2"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"answers_simple.groupby('question_text').agg({'answer_text': lambda x: len(set(x))})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"centroids_example, pronunciations_example, counts_example = create_gabmap_dataframes(\n",
|
||||
" regions_simple, answers_simple,\n",
|
||||
" latitude_column='user_lat', longitude_column='user_lng',\n",
|
||||
" word_column='question_text', pronunciation_column='answer_text',\n",
|
||||
" region_name_property='name'\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Resulting tables\n",
|
||||
"\n",
|
||||
"Stored as tab separated files for gabmap"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>latitude</th>\n",
|
||||
" <th>longitude</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>#name</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>Heerenveen</th>\n",
|
||||
" <td>52.996076</td>\n",
|
||||
" <td>5.977925</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Leeuwarden</th>\n",
|
||||
" <td>53.169940</td>\n",
|
||||
" <td>5.797613</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" latitude longitude\n",
|
||||
"#name \n",
|
||||
"Heerenveen 52.996076 5.977925\n",
|
||||
"Leeuwarden 53.169940 5.797613"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"centroids_example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>blad (aan een boom)</th>\n",
|
||||
" <th>vis</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>Heerenveen</th>\n",
|
||||
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
|
||||
" <td>fisk / fɪs</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Leeuwarden</th>\n",
|
||||
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
|
||||
" <td>fisk / fɪs</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" blad (aan een boom) vis\n",
|
||||
" \n",
|
||||
"Heerenveen blet / blɑt / blɔd / blɛ:t fisk / fɪs\n",
|
||||
"Leeuwarden blet / blɑt / blɔd / blɛ:t fisk / fɪs"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pronunciations_example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>blad (aan een boom): blet</th>\n",
|
||||
" <th>blad (aan een boom): blɑt</th>\n",
|
||||
" <th>blad (aan een boom): blɔd</th>\n",
|
||||
" <th>blad (aan een boom): blɛ:t</th>\n",
|
||||
" <th>vis: fisk</th>\n",
|
||||
" <th>vis: fɪs</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>Heerenveen</th>\n",
|
||||
" <td>31.654676</td>\n",
|
||||
" <td>2.158273</td>\n",
|
||||
" <td>2.158273</td>\n",
|
||||
" <td>64.028777</td>\n",
|
||||
" <td>52.517986</td>\n",
|
||||
" <td>47.482014</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Leeuwarden</th>\n",
|
||||
" <td>7.865169</td>\n",
|
||||
" <td>7.022472</td>\n",
|
||||
" <td>8.707865</td>\n",
|
||||
" <td>76.404494</td>\n",
|
||||
" <td>75.000000</td>\n",
|
||||
" <td>25.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" blad (aan een boom): blet blad (aan een boom): blɑt \\\n",
|
||||
" \n",
|
||||
"Heerenveen 31.654676 2.158273 \n",
|
||||
"Leeuwarden 7.865169 7.022472 \n",
|
||||
"\n",
|
||||
" blad (aan een boom): blɔd blad (aan een boom): blɛ:t vis: fisk \\\n",
|
||||
" \n",
|
||||
"Heerenveen 2.158273 64.028777 52.517986 \n",
|
||||
"Leeuwarden 8.707865 76.404494 75.000000 \n",
|
||||
"\n",
|
||||
" vis: fɪs \n",
|
||||
" \n",
|
||||
"Heerenveen 47.482014 \n",
|
||||
"Leeuwarden 25.000000 "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"counts_example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pronunciations_example.to_csv('../data/Pronunciations_example.gabmap.tsv', sep='\\t')\n",
|
||||
"counts_example.to_csv('../data/Pronunciation_percentages_example.gabmap.tsv', sep='\\t')\n",
|
||||
"centroids_example.to_csv('../data/Centroids_example.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])\n",
|
||||
"with open('../data/Gabmap_example.geojson', 'w') as f:\n",
|
||||
" json.dump(regions_simple, f)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
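The three tables above are produced by create_gabmap_dataframes from the repository's gabmap module, whose source is not part of this diff. As an illustration only, a sketch of the percentage table under the assumption that the helper assigns each answer to its containing region with shapely and then aggregates per region and word (the function pronunciation_percentages and its signature are made up here, not the module's API):

import pandas
from shapely.geometry import shape, Point

def pronunciation_percentages(regions_geojson, answers, lat='user_lat', lng='user_lng',
                              word='question_text', pron='answer_text', name='name'):
    # Point-in-polygon lookup: which GeoJSON feature contains an answer's coordinates?
    regions = [(feature['properties'][name], shape(feature['geometry']))
               for feature in regions_geojson['features']]

    def region_of(row):
        point = Point(row[lng], row[lat])  # shapely points are (x, y) = (longitude, latitude)
        return next((n for n, s in regions if s.contains(point)), None)

    located = answers.assign(region=answers.apply(region_of, axis=1)).dropna(subset=['region'])

    # Share of each pronunciation within its region and word, as a percentage
    counts = located.groupby(['region', word, pron]).size()
    percentages = 100 * counts / counts.groupby(['region', word]).transform('sum')
    return percentages.unstack([word, pron])

Under that assumption, pronunciation_percentages(regions_simple, answers_simple) would contain the same percentages as counts_example, though with a different column labelling.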
157
notebooks/Gabmap Pronunciation Tables.ipynb
Normal file
@@ -0,0 +1,157 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Geographical pronunciation tables\n",
|
||||
"\n",
|
||||
"Creates gabmap files with region centroids, percentages and pronunciations for wijken in Friesland."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"sys.path.append('..')\n",
|
||||
"\n",
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import json\n",
|
||||
"import copy\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
|
||||
"\n",
|
||||
"from shapely.geometry import shape, Point\n",
|
||||
"\n",
|
||||
"from gabmap import create_gabmap_dataframes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('../data/Friesland_wijken.geojson') as f:\n",
|
||||
" regions = json.load(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Answers to how participants state a word should be pronounced\n",
|
||||
"\n",
|
||||
"answers = pandas.read_sql('''\n",
|
||||
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
|
||||
" ON result.id = answer.prediction_quiz_id\n",
|
||||
"''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
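"# Drop questions whose answers show no variation in user_lat/user_lng (std == 0): they carry no location signal\n",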
"zero_latlng_questions = {\n",
|
||||
" q\n",
|
||||
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
|
||||
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
|
||||
"}\n",
|
||||
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)].copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array(['gegaan', 'avond', 'heel', 'dag', 'bij (insect)', 'sprak (toe)',\n",
|
||||
" 'oog', 'armen (lichaamsdeel)', 'kaas', 'deurtje', 'koken',\n",
|
||||
" 'borst (lichaamsdeel)', 'vis', 'zaterdag', 'trein', 'geel', 'tand',\n",
|
||||
" 'gezet', 'blad (aan een boom)'], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"answers_filtered['question_text'].unique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"answers_filtered['question_text'] = answers_filtered['question_text'].map(\n",
|
||||
" lambda x: x.replace('\"', '').replace('*', ''))\n",
|
||||
"\n",
|
||||
"answers_filtered['answer_text'] = answers_filtered['answer_text'].map(\n",
|
||||
" lambda x: x[x.find('('):x.find(')')][1:])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"centroids, pronunciations, counts = create_gabmap_dataframes(\n",
|
||||
" regions, answers_filtered,\n",
|
||||
" latitude_column='user_lat', longitude_column='user_lng',\n",
|
||||
" word_column='question_text', pronunciation_column='answer_text',\n",
|
||||
" region_name_property='gemeente_en_wijk_naam'\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pronunciations.to_csv('../data/Friesland_wijken_pronunciations.gabmap.tsv', sep='\\t')\n",
|
||||
"counts.to_csv('../data/Friesland_wijken_pronunciation_percentages.gabmap.tsv', sep='\\t')\n",
|
||||
"centroids.to_csv('../data/Friesland_wijken_centroids.gabmap.tsv', sep='\\t', columns=['longitude', 'latitude'])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
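The centroids file written above pairs each region name with one representative point. A minimal sketch of how such a table can be built from the same GeoJSON with shapely; the real computation happens inside create_gabmap_dataframes, which is not shown in this diff, and the '#name' label simply matches the centroid table of the simple example:

import json
import pandas
from shapely.geometry import shape

with open('../data/Friesland_wijken.geojson') as f:
    regions = json.load(f)

centroids = pandas.DataFrame([
    {'#name': feature['properties']['gemeente_en_wijk_naam'],
     'latitude': geometry.centroid.y,
     'longitude': geometry.centroid.x}
    for feature in regions['features']
    for geometry in [shape(feature['geometry'])]  # alias so shape() is built once per feature
]).set_index('#name')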
File diff suppressed because one or more lines are too long
@@ -18,7 +18,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -49,13 +49,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('dialect_regions.geojson', 'r') as f:\n",
|
||||
"with open('../data/fryslan_dialect_regions.geojson', 'r') as f:\n",
|
||||
" geojson = json.load(f)\n",
|
||||
"\n",
|
||||
"dialect_regions = [region['properties']['dialect'] for region in geojson['features']]"
|
||||
@@ -63,7 +63,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -97,7 +97,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -122,7 +122,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -143,13 +143,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "67ed3190256b447c81daf3df1f189318",
|
||||
"model_id": "5825449a737b4fcab38a4f4ac2adfd87",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@@ -167,7 +167,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -183,7 +183,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -202,13 +202,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "201b0aed64e8494db603de15b560d919",
|
||||
"model_id": "8afad9f71e544658b554b828932d7769",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
@@ -226,7 +226,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@@ -1,430 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('martijn_format/Dutch613-coordinates.txt') as f:\n",
|
||||
" coordinates = list(f)\n",
|
||||
" \n",
|
||||
"with open('martijn_format/Nederlands-ipa.utxt') as f:\n",
|
||||
" table = list(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Aalsmeer NH', '4.76163', '52.2693\\n']"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"coordinates[1].split('\\t')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['',\n",
|
||||
" 'kippen',\n",
|
||||
" 'mijn',\n",
|
||||
" 'vriend',\n",
|
||||
" 'bloemen',\n",
|
||||
" 'spinnen',\n",
|
||||
" 'machines',\n",
|
||||
" 'werk',\n",
|
||||
" 'op',\n",
|
||||
" 'schip',\n",
|
||||
" 'kregen',\n",
|
||||
" 'beschimmeld',\n",
|
||||
" 'brood',\n",
|
||||
" 'timmerman',\n",
|
||||
" 'splinter',\n",
|
||||
" 'vinger',\n",
|
||||
" 'fabriek',\n",
|
||||
" 'vier',\n",
|
||||
" 'bier',\n",
|
||||
" 'twee',\n",
|
||||
" 'drie',\n",
|
||||
" 'hij',\n",
|
||||
" 'knuppel',\n",
|
||||
" 'ik',\n",
|
||||
" 'knie',\n",
|
||||
" 'gezien',\n",
|
||||
" 'ragebol',\n",
|
||||
" 'pet',\n",
|
||||
" 'paddestoel',\n",
|
||||
" 'kerel',\n",
|
||||
" 'brede',\n",
|
||||
" 'stenen',\n",
|
||||
" 'breder',\n",
|
||||
" 'breedste',\n",
|
||||
" 'standbeeld',\n",
|
||||
" 'duivel',\n",
|
||||
" 'gebleven',\n",
|
||||
" 'meester',\n",
|
||||
" 'zee',\n",
|
||||
" 'graag',\n",
|
||||
" 'keelpijn',\n",
|
||||
" 'steel',\n",
|
||||
" 'bezem',\n",
|
||||
" 'neen',\n",
|
||||
" 'geroepen',\n",
|
||||
" 'peer',\n",
|
||||
" 'rijp',\n",
|
||||
" 'geld',\n",
|
||||
" 'ver',\n",
|
||||
" 'brengen',\n",
|
||||
" 'vrouw',\n",
|
||||
" 'zwemmen',\n",
|
||||
" 'sterk',\n",
|
||||
" 'bed',\n",
|
||||
" 'optillen',\n",
|
||||
" 'metselaar',\n",
|
||||
" 'springen',\n",
|
||||
" 'boterham',\n",
|
||||
" 'vader',\n",
|
||||
" 'zes',\n",
|
||||
" 'jaar',\n",
|
||||
" 'school',\n",
|
||||
" 'laten',\n",
|
||||
" 'gaan',\n",
|
||||
" 'water',\n",
|
||||
" 'potten',\n",
|
||||
" 'zijn',\n",
|
||||
" 'veel',\n",
|
||||
" 'maart',\n",
|
||||
" 'nog',\n",
|
||||
" 'koud',\n",
|
||||
" 'kaars',\n",
|
||||
" 'geeft',\n",
|
||||
" 'licht',\n",
|
||||
" 'paard',\n",
|
||||
" 'tegen',\n",
|
||||
" 'zwaluwen',\n",
|
||||
" 'kaas',\n",
|
||||
" 'motor',\n",
|
||||
" 'dag',\n",
|
||||
" 'avond',\n",
|
||||
" 'jongetje',\n",
|
||||
" 'barst',\n",
|
||||
" 'brief',\n",
|
||||
" 'hart',\n",
|
||||
" 'spannen',\n",
|
||||
" 'nieuwe',\n",
|
||||
" 'kar',\n",
|
||||
" 'zoon',\n",
|
||||
" 'koning',\n",
|
||||
" 'ook',\n",
|
||||
" 'geweest',\n",
|
||||
" 'rozen',\n",
|
||||
" 'lange',\n",
|
||||
" 'woord',\n",
|
||||
" 'kindje',\n",
|
||||
" 'was',\n",
|
||||
" 'dochtertje',\n",
|
||||
" 'bos',\n",
|
||||
" 'ladder',\n",
|
||||
" 'mond',\n",
|
||||
" 'droog',\n",
|
||||
" 'dorst',\n",
|
||||
" 'weg',\n",
|
||||
" 'krom',\n",
|
||||
" 'liedje',\n",
|
||||
" 'goed',\n",
|
||||
" 'kelder',\n",
|
||||
" 'voor',\n",
|
||||
" 'moest',\n",
|
||||
" 'ossenbloed',\n",
|
||||
" 'drinken',\n",
|
||||
" 'broer',\n",
|
||||
" 'moe',\n",
|
||||
" 'karnemelk',\n",
|
||||
" 'dun',\n",
|
||||
" 'zuur',\n",
|
||||
" 'put',\n",
|
||||
" 'uur',\n",
|
||||
" 'Italië',\n",
|
||||
" 'bergen',\n",
|
||||
" 'vuur',\n",
|
||||
" 'spuwen',\n",
|
||||
" 'duwen',\n",
|
||||
" 'hebben',\n",
|
||||
" 'stuk',\n",
|
||||
" 'brug',\n",
|
||||
" 'veulen',\n",
|
||||
" 'komen',\n",
|
||||
" 'deur',\n",
|
||||
" 'naaien',\n",
|
||||
" 'gras',\n",
|
||||
" 'brouwer',\n",
|
||||
" 'bakken',\n",
|
||||
" 'je',\n",
|
||||
" 'eieren',\n",
|
||||
" 'krijgen',\n",
|
||||
" 'markt',\n",
|
||||
" 'waren',\n",
|
||||
" 'vijf',\n",
|
||||
" 'eikels',\n",
|
||||
" 'hooi',\n",
|
||||
" 'is',\n",
|
||||
" 'groen',\n",
|
||||
" 'boompje',\n",
|
||||
" 'wijn',\n",
|
||||
" 'huis',\n",
|
||||
" 'melk',\n",
|
||||
" 'spuit',\n",
|
||||
" 'koe',\n",
|
||||
" 'koster',\n",
|
||||
" 'kruiwagen',\n",
|
||||
" 'buigen',\n",
|
||||
" 'Duitsers',\n",
|
||||
" 'blauw',\n",
|
||||
" 'geslagen',\n",
|
||||
" 'saus',\n",
|
||||
" 'flauw',\n",
|
||||
" 'sneeuw',\n",
|
||||
" 'stad',\n",
|
||||
" 'doen',\n",
|
||||
" 'dopen',\n",
|
||||
" 'doopvont',\n",
|
||||
" 'soldaten',\n",
|
||||
" 'dorsen',\n",
|
||||
" 'binden',\n",
|
||||
" 'gebonden\\n']"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"table[0].split('\\t')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['West-Terschelling',\n",
|
||||
" 'kipən',\n",
|
||||
" 'miŋ',\n",
|
||||
" 'kɑ̟mərɑ̟ːt',\n",
|
||||
" 'blʊmə',\n",
|
||||
" 'spɪnə',\n",
|
||||
" 'məsinəs / məʃinəs',\n",
|
||||
" 'ʋɔrə̆k',\n",
|
||||
" 'ʊp̬',\n",
|
||||
" 'sxɪp',\n",
|
||||
" 'kreːɣə̃ / krɪɣə̃',\n",
|
||||
" 'fəsxɪməlt / bəsxɪməlt',\n",
|
||||
" 'bro̝ˑə̆t',\n",
|
||||
" 'tɪmərmɑn',\n",
|
||||
" 'splɪntər',\n",
|
||||
" 'fɪŋər',\n",
|
||||
" 'fəbrik',\n",
|
||||
" 'fjɔŭwər',\n",
|
||||
" 'biˑə̆r',\n",
|
||||
" 'twɑ̟',\n",
|
||||
" 'treːĭjə',\n",
|
||||
" 'hɛĭ',\n",
|
||||
" 'knʏpəl / ɛinhɔŭt',\n",
|
||||
" 'ɪk',\n",
|
||||
" 'knɪbəl',\n",
|
||||
" 'siˑə̆n',\n",
|
||||
" 'rɑ̟ːɣəbɔl',\n",
|
||||
" 'pɛt',\n",
|
||||
" 'pɑ̟dəstuˑə̆l',\n",
|
||||
" 'mɑ̟n',\n",
|
||||
" 'breːdə / breːjə',\n",
|
||||
" 'stɪn̩ː',\n",
|
||||
" 'breːdər',\n",
|
||||
" 'breːstə',\n",
|
||||
" 'stɔndbeːlt',\n",
|
||||
" 'dyvəl',\n",
|
||||
" 'blɔŭn',\n",
|
||||
" 'meːstər',\n",
|
||||
" 'seˑ',\n",
|
||||
" 'xrɑːx',\n",
|
||||
" 'ətĩˑsĩkeːl',\n",
|
||||
" 'stɛːl',\n",
|
||||
" 'biːzəm',\n",
|
||||
" 'neː',\n",
|
||||
" 'rɔft',\n",
|
||||
" 'pɔˑə̆r',\n",
|
||||
" 'rip',\n",
|
||||
" 'jɪlt',\n",
|
||||
" 'fiˑə̆r',\n",
|
||||
" 'brɪŋə',\n",
|
||||
" 'ʋiːf',\n",
|
||||
" 'swʊmə',\n",
|
||||
" 'stɛrk',\n",
|
||||
" 'bɛˑə̆t',\n",
|
||||
" 'ʊptɪlən',\n",
|
||||
" '',\n",
|
||||
" 'sprɪŋə',\n",
|
||||
" '',\n",
|
||||
" 'tɔ̞ˑə̆',\n",
|
||||
" 'sɛks',\n",
|
||||
" 'jiə̆r',\n",
|
||||
" 'sxuˑəl',\n",
|
||||
" 'lɪtn̩̆',\n",
|
||||
" 'xeˑə̆̃',\n",
|
||||
" 'ʋɛtər',\n",
|
||||
" 'pɔtn̩̆',\n",
|
||||
" 'bɪn',\n",
|
||||
" 'fʊlə',\n",
|
||||
" 'mɑˑə̆t',\n",
|
||||
" 'nɔx',\n",
|
||||
" 'kɔˑə̆t',\n",
|
||||
" 'kɛs',\n",
|
||||
" 'jʊxt',\n",
|
||||
" 'jɛxt',\n",
|
||||
" 'hoĭsʲ',\n",
|
||||
" 'tsjɪ',\n",
|
||||
" 'swɑːlywə',\n",
|
||||
" 'tsiːs / tsjiːs',\n",
|
||||
" 'moˑtər',\n",
|
||||
" 'dɛĭ',\n",
|
||||
" 'ioŋ',\n",
|
||||
" 'jʊŋkjə',\n",
|
||||
" 'bœ̝st',\n",
|
||||
" 'briːf',\n",
|
||||
" 'hɔĭtʲ',\n",
|
||||
" 'spɔnə',\n",
|
||||
" 'niˑjə',\n",
|
||||
" 'kɑ̟rə',\n",
|
||||
" 'sɪn',\n",
|
||||
" 'koˑə̆nɪŋ',\n",
|
||||
" 'eˑə̆k',\n",
|
||||
" 'ʋɛn',\n",
|
||||
" 'roˑə̆zən',\n",
|
||||
" 'lɑ̟ŋə',\n",
|
||||
" 'ʋɔĭtʲ',\n",
|
||||
" 'bɔ̞nʲ',\n",
|
||||
" 'ʋɑ̟z',\n",
|
||||
" 'fɑ̟mkə / dɔxtərtsə',\n",
|
||||
" 'bʊs',\n",
|
||||
" 'leˑə̆rt',\n",
|
||||
" 'mylə',\n",
|
||||
" 'drux',\n",
|
||||
" 'toˑə̆st',\n",
|
||||
" 'ʋɛĭ',\n",
|
||||
" '',\n",
|
||||
" 'fɛsjə',\n",
|
||||
" 'xuˑət',\n",
|
||||
" 'kɛldər',\n",
|
||||
" 'fŭɑ̟r',\n",
|
||||
" 'mɔs',\n",
|
||||
" 'ɔsəbluˑət̬',\n",
|
||||
" 'drɪŋkə',\n",
|
||||
" 'bruər',\n",
|
||||
" 'muˑə̆t',\n",
|
||||
" 'suˑp / kɑrnəmoˑə̆lək',\n",
|
||||
" 'tɪn',\n",
|
||||
" 'suːr',\n",
|
||||
" 'pʏt',\n",
|
||||
" 'uːr',\n",
|
||||
" 'itɑ̟ːljə',\n",
|
||||
" 'bɑ̟rɣən',\n",
|
||||
" 'fjuːr',\n",
|
||||
" 'spiˑə̆n',\n",
|
||||
" 'drʏkən',\n",
|
||||
" 'hɑ̟bə',\n",
|
||||
" 'stɪk',\n",
|
||||
" 'brʏx',\n",
|
||||
" 'fɔlʲtsə',\n",
|
||||
" 'kʊmə',\n",
|
||||
" 'doˑə̆r',\n",
|
||||
" 'nɑːĭjə',\n",
|
||||
" 'xɛs',\n",
|
||||
" 'brɔŭwər',\n",
|
||||
" 'bɑ̟kə',\n",
|
||||
" 'do̞',\n",
|
||||
" 'ɑːə̆jən',\n",
|
||||
" 'kriˑjə',\n",
|
||||
" 'mɑ̟rək / mɑrəkt',\n",
|
||||
" 'wɑːrə̃',\n",
|
||||
" 'fiːf',\n",
|
||||
" 'ɛikəls',\n",
|
||||
" 'heˑə̆',\n",
|
||||
" 'ɪz',\n",
|
||||
" 'xriˑə̆n',\n",
|
||||
" 'boːmkə',\n",
|
||||
" 'ʋin',\n",
|
||||
" 'hyːs',\n",
|
||||
" 'mʊə̆lək',\n",
|
||||
" 'spœĭt',\n",
|
||||
" 'ku',\n",
|
||||
" 'kɔstər',\n",
|
||||
" 'krødʋɛin',\n",
|
||||
" 'buːɣə',\n",
|
||||
" 'dytsərs',\n",
|
||||
" 'blɑːŭ',\n",
|
||||
" 'slɛin',\n",
|
||||
" 'sjy',\n",
|
||||
" '',\n",
|
||||
" 'sneː',\n",
|
||||
" 'stɑ̟t',\n",
|
||||
" 'dweˑə̆n',\n",
|
||||
" 'doːpə',\n",
|
||||
" 'doːpfʊnt',\n",
|
||||
" 'sɔldɑːtən',\n",
|
||||
" 'tɛskjə',\n",
|
||||
" 'binə',\n",
|
||||
" 'boŋ\\n']"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"table[1].split('\\t')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,433 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Pronunciation-based location prediction confusion\n",
|
||||
"\n",
|
||||
"Setup a pandas dataframe with in each row\n",
|
||||
"\n",
|
||||
" * participant provided (actual) location,\n",
|
||||
" * 3 estimations made by Nanna's heuristic based in what the participant stated to be the correct pronunciation of a word\n",
|
||||
" * distance between the actual and heuristic predicted location\n",
|
||||
" \n",
|
||||
"Averages of the distances are exported for visualisation in QGIS."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pickle\n",
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import numpy\n",
|
||||
"import itertools\n",
|
||||
"import requests\n",
|
||||
"import json\n",
|
||||
"from vincenty import vincenty\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen')\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"from matplotlib import pyplot, rcParams\n",
|
||||
"from jupyter_progressbar import ProgressBar\n",
|
||||
"\n",
|
||||
"# rcParams['font.family'] = 'Lucinda Console'\n",
|
||||
"rcParams['font.size'] = '24'\n",
|
||||
"rcParams['figure.figsize'] = (20, 10)\n",
|
||||
"rcParams['figure.dpi'] = 100"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def simplify_area_name(x):\n",
|
||||
" return ' '.join(\n",
|
||||
" x.split('/') # split Dutch and Frysian name\n",
|
||||
" [0] # extract Dutch name\n",
|
||||
" .strip()\n",
|
||||
" .split(' ') # Split area name from province, mostly 'Fr'\n",
|
||||
" [:-1] # remove province\n",
|
||||
" ).strip().lower() # rejoin spaces in area name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"metadata = pandas.read_sql('''SELECT answer.* FROM core_surveyresultquestionanswer as answer''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"grouped = metadata.groupby(['survey_result_id', 'question_id']).agg({\n",
|
||||
" 'question_text': 'first',\n",
|
||||
" 'answer_text': lambda x: x if len(x) == 1 else ', '.join(x)\n",
|
||||
"})\n",
|
||||
"grouped.reset_index(inplace=True)\n",
|
||||
"\n",
|
||||
"grouped = grouped.pivot(index='survey_result_id', columns='question_text', values='answer_text')\n",
|
||||
"\n",
|
||||
"grouped = grouped.rename({\n",
|
||||
" 'Do you go to school?': 'school',\n",
|
||||
" 'Do you go to university?': 'university',\n",
|
||||
" 'What is your age bracket?': 'age_bracket',\n",
|
||||
" 'What is your age?': 'age',\n",
|
||||
" 'What is your gender?': 'gender',\n",
|
||||
" 'Which language are you the most proficient in?': 'language',\n",
|
||||
" 'Which languages do you actively use in your life?': 'active-languages'\n",
|
||||
"}, axis='columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"predictions = pandas.read_sql('''\n",
|
||||
"SELECT \n",
|
||||
" sr.id as id,\n",
|
||||
" sr.area_name as actual_area,\n",
|
||||
" area1_name as area_prediction_1,\n",
|
||||
" area2_name as area_prediction_2,\n",
|
||||
" area3_name as area_prediction_3\n",
|
||||
"FROM core_surveyresult as sr\n",
|
||||
"INNER JOIN core_predictionquizresult as pq\n",
|
||||
" ON sr.id = pq.survey_result_id\n",
|
||||
"''', db)\n",
|
||||
"\n",
|
||||
"predicted_areas = set(map(simplify_area_name,\n",
|
||||
" set(predictions['area_prediction_1']) |\n",
|
||||
" set(predictions['area_prediction_2']) |\n",
|
||||
" set(predictions['area_prediction_3'])\n",
|
||||
"))\n",
|
||||
"actual_areas = set(map(str.lower, predictions['actual_area']))\n",
|
||||
"\n",
|
||||
"areas = list(predicted_areas | actual_areas)\n",
|
||||
"location_to_number = {l: i for i, l in enumerate(areas)}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"simplified_predictions = pandas.DataFrame({\n",
|
||||
" 'id': list(predictions['id']),\n",
|
||||
" 'actual': list(map(str.lower, predictions['actual_area'])),\n",
|
||||
" 'prediction_1': list(map(simplify_area_name, predictions['area_prediction_1'])),\n",
|
||||
" 'prediction_2': list(map(simplify_area_name, predictions['area_prediction_2'])),\n",
|
||||
" 'prediction_3': list(map(simplify_area_name, predictions['area_prediction_3'])),\n",
|
||||
"})\n",
|
||||
"# simplified_predictions.set_index('id')\n",
|
||||
"simplified_predictions.to_excel('actual-predictions.xls')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"locations = {location for c in simplified_predictions.columns for location in simplified_predictions[c] if c != 'id'}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"names = pandas.read_csv('plaatsen_nl.csv')\n",
|
||||
"\n",
|
||||
"nonominatim = {\n",
|
||||
" name: [row['st_y'], row['x']]\n",
|
||||
" for _, row in names.iterrows()\n",
|
||||
" for column in ['bebouwdeko', 'naamoffici', 'naamnl', 'naamfries']\n",
|
||||
"# for _ in [ print(row[column]) ]\n",
|
||||
" if type(row[column]) == str\n",
|
||||
" for name in [row[column], row[column].lower().replace('-', ' ')]\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "21c016f24e23473e807ed3e9c2d942c6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nominatim = {\n",
|
||||
" l: json.loads(\n",
|
||||
" requests.get(\n",
|
||||
" 'https://nominatim.openstreetmap.org/search.php?q=Netherlands%20'\n",
|
||||
" '{}&polygon_geojson=1&viewbox=&format=json'.format(l)\n",
|
||||
" ).text\n",
|
||||
" )\n",
|
||||
" for l in ProgressBar(locations)\n",
|
||||
" if l not in nonominatim\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"latlons = {\n",
|
||||
" l: (float(v[0]['lat']), float(v[0]['lon']))\n",
|
||||
" for l, v in nominatim.items()\n",
|
||||
" if len(v) > 0\n",
|
||||
"}\n",
|
||||
"latlons.update(nonominatim)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for c in {'actual', 'prediction_1', 'prediction_2', 'prediction_3'}:\n",
|
||||
" simplified_predictions['{}_latlon'.format(c)] = [\n",
|
||||
" latlons.get(l, numpy.nan)\n",
|
||||
" for l in simplified_predictions['{}'.format(c)]\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for c in {'prediction_1_latlon', 'prediction_2_latlon', 'prediction_3_latlon'}:\n",
|
||||
" simplified_predictions['{}_distance'.format(c)] = [\n",
|
||||
" vincenty(x, y) if x == x and y == y else numpy.nan\n",
|
||||
" for x, y in zip(simplified_predictions['actual_latlon'], simplified_predictions[c])\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"simplified_predictions = simplified_predictions[[\n",
|
||||
" 'id', 'actual', 'actual_latlon', 'prediction_3_latlon_distance',\n",
|
||||
" 'prediction_1_latlon_distance', 'prediction_2_latlon_distance'\n",
|
||||
"]]\n",
|
||||
"\n",
|
||||
"simplified_predictions = simplified_predictions.rename({\n",
|
||||
" 'prediction_3_latlon_distance': 'distance3',\n",
|
||||
" 'prediction_1_latlon_distance': 'distance1',\n",
|
||||
" 'prediction_2_latlon_distance': 'distance2'\n",
|
||||
"}, axis='columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"simplified_predictions = simplified_predictions.join(grouped, on='id')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"simplified_predictions['latitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[0] if x == x else None)\n",
|
||||
"simplified_predictions['longitude'] = simplified_predictions['actual_latlon'].map(lambda x: x[1] if x == x else None)\n",
|
||||
"simplified_predictions = simplified_predictions.drop('actual_latlon', axis='columns')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"simplified_predictions['age_groups'] = [\n",
|
||||
" {'0-10': '0-20', '11-20': '0-20',\n",
|
||||
" '21-30': '21-50', '31-40': '21-50', '41-50': '21-50',\n",
|
||||
" '51-60': '51-100', '61-70': '51-100', '71-80': '51-100', '81-90': '51-100', '91-100': '51-100'}.get(b, None)\n",
|
||||
" for b in simplified_predictions['age_bracket']\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# age_groups = simplified_predictions.groupby(['age_groups', 'actual']).agg({\n",
|
||||
"# 'distance1': ['mean', 'min', 'max', 'count', 'size'],\n",
|
||||
"# 'latitude': 'first',\n",
|
||||
"# 'longitude': 'first'\n",
|
||||
"# })\n",
|
||||
"# age_groups.index.get_level_values('age_groups')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# gender_groups = simplified_predictions.groupby(['gender', 'actual']).agg({\n",
|
||||
"# 'distance1': ['min', 'mean', 'max', 'count', 'size'],\n",
|
||||
"# 'latitude': 'first',\n",
|
||||
"# 'longitude': 'first'\n",
|
||||
"# })\n",
|
||||
"# gender_groups"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summary = simplified_predictions[['latitude', 'longitude', 'distance1', 'distance2', 'distance3', 'actual']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summary.to_csv('points.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geojson = {\n",
|
||||
" \"type\": \"FeatureCollection\",\n",
|
||||
" \"features\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"Feature\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||||
" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||||
" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||||
" \"actual\": actual\n",
|
||||
" },\n",
|
||||
" \"geometry\": {\n",
|
||||
" \"type\": \"Point\",\n",
|
||||
" \"coordinates\": list( actual_lat_lon )[::-1]\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" for actual, row in simplified_predictions.groupby('actual')\n",
|
||||
" if actual != ''\n",
|
||||
"# for _ in [ print(row['actual_latlon']), print() ]\n",
|
||||
" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",
|
||||
" if actual_lat_lon == actual_lat_lon\n",
|
||||
" ]\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"geojson = {\n",
|
||||
" \"type\": \"FeatureCollection\",\n",
|
||||
" \"features\": [\n",
|
||||
" {\n",
|
||||
" \"type\": \"Feature\",\n",
|
||||
" \"properties\": {\n",
|
||||
" \"distance 1\": row['prediction_1_latlon_distance'].mean() if row['prediction_1_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||||
" \"distance 2\": row['prediction_2_latlon_distance'].mean() if row['prediction_2_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||||
" \"distance 3\": row['prediction_3_latlon_distance'].mean() if row['prediction_3_latlon_distance'].isnull().sum() == 0 else -0.0001,\n",
|
||||
" \"actual\": actual\n",
|
||||
" },\n",
|
||||
" \"geometry\": {\n",
|
||||
" \"type\": \"Point\",\n",
|
||||
" \"coordinates\": list( actual_lat_lon )[::-1]\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" for actual, row in simplified_predictions.groupby('actual')\n",
|
||||
" if actual != ''\n",
|
||||
"# for _ in [ print(row['actual_latlon']), print() ]\n",
|
||||
" for actual_lat_lon in [list(row['actual_latlon'])[0]] # alias \n",
|
||||
" if actual_lat_lon == actual_lat_lon\n",
|
||||
" ]\n",
|
||||
"}"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
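The distance columns in this removed notebook come from the vincenty package applied to (latitude, longitude) tuples; the x == x comparison is the usual NaN guard. A self-contained sketch of that step, using the Heerenveen and Leeuwarden centroids from the simple example above as sample points:

import numpy
from vincenty import vincenty  # PyPI package 'vincenty'; returns the distance in kilometres

heerenveen = (52.996076, 5.977925)  # (lat, lon)
leeuwarden = (53.169940, 5.797613)

def distance_km(a, b):
    # Mirror the notebook's NaN guard: a tuple equals itself, NaN does not
    return vincenty(a, b) if a == a and b == b else numpy.nan

distance_km(heerenveen, leeuwarden)  # about 23 km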
File diff suppressed because one or more lines are too long
@@ -1,293 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Geographical pronunciation statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import numpy\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
|
||||
"\n",
|
||||
"%matplotlib notebook\n",
|
||||
"from matplotlib import pyplot\n",
|
||||
"import folium\n",
|
||||
"from IPython.display import display\n",
|
||||
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
|
||||
"from jsbutton import JsButton\n",
|
||||
"from jupyter_progressbar import ProgressBar\n",
|
||||
"from collections import defaultdict\n",
|
||||
"from ipy_table import make_table\n",
|
||||
"from html import escape\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from matplotlib.colors import LogNorm\n",
|
||||
"from sklearn import mixture\n",
|
||||
"from skimage.measure import find_contours\n",
|
||||
"from collections import Counter\n",
|
||||
"from random import shuffle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Borders of Frysian municipalities\n",
|
||||
"\n",
|
||||
"with open('Friesland_AL8.GeoJson') as f:\n",
|
||||
" gemeentes = json.load(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"coords = [feature['geometry'] for feature in gemeentes['features']]\n",
|
||||
"coords_folium = [[[[c__[::-1] for c__ in c_] for c_ in c] for c in coords_['coordinates']] for coords_ in coords]\n",
|
||||
"shapes = [shape(coords_) for coords_ in coords]\n",
|
||||
"gemeente_names = [feature['properties']['name'] for feature in gemeentes['features']]\n",
|
||||
"\n",
|
||||
"def get_gemeente(point):\n",
|
||||
" for i, shape in enumerate(shapes):\n",
|
||||
" if shape.contains(point):\n",
|
||||
" return i\n",
|
||||
" return -1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Answers to how participants state a word should be pronounces.\n",
|
||||
"\n",
|
||||
"answers = pandas.read_sql('''\n",
|
||||
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
|
||||
" ON result.id = answer.prediction_quiz_id\n",
|
||||
"''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"zero_latlng_questions = {\n",
|
||||
" q\n",
|
||||
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
|
||||
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
|
||||
"}\n",
|
||||
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" # Remove the CWD from sys.path while we load stuff.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Takes approximately 2 minutes\n",
|
||||
"\n",
|
||||
"gemeente_map = {\n",
|
||||
" (lng, lat): get_gemeente(Point(lng, lat))\n",
|
||||
" for lng, lat in set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"answers_filtered['gemeente'] = [\n",
|
||||
" gemeente_map[(lng, lat)]\n",
|
||||
" for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
|
||||
" lambda x: x.replace('\"', '').replace('*', ''))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "e48eb24f5c43434bad4241d4bea53074",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cmap = pyplot.get_cmap('YlOrRd')\n",
|
||||
"\n",
|
||||
"for question, rows in ProgressBar(\n",
|
||||
" answers_filtered.groupby('question_text_url'),\n",
|
||||
" size=len(answers_filtered['question_text_url'].unique())\n",
|
||||
"):\n",
|
||||
" m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles=None, zoom_start=9)\n",
|
||||
" pecentage_labels = folium.FeatureGroup(name='pecentages', overlay=True)\n",
|
||||
" order = [a for _, a in sorted((\n",
|
||||
" (r['user_lat'], answer)\n",
|
||||
" for answer, r in rows.groupby('answer_text').count().iterrows()\n",
|
||||
" ), reverse=True)]\n",
|
||||
" gemeente_normalizer = {\n",
|
||||
" gemeente: r['user_lat']\n",
|
||||
" for gemeente, r in rows.groupby('gemeente').count().iterrows()\n",
|
||||
" }\n",
|
||||
" for answer_text in order:\n",
|
||||
" rows_ = rows[rows['answer_text'] == answer_text]\n",
|
||||
" if (rows_['gemeente'] >= 0).sum() <= 0:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" spread = {\n",
|
||||
" gemeente: r['user_lat']\n",
|
||||
" for gemeente, r in rows_.groupby('gemeente').count().iterrows()\n",
|
||||
" if gemeente >= 0\n",
|
||||
" }\n",
|
||||
" n_answers = sum(spread.values())\n",
|
||||
" \n",
|
||||
" name = '{} ({})'.format(answer_text, n_answers)\n",
|
||||
" group = folium.FeatureGroup(name=name, overlay=False)\n",
|
||||
" folium.TileLayer(tiles='stamentoner').add_to(group)\n",
|
||||
" \n",
|
||||
" max_value = max(value / gemeente_normalizer[gemeente] for gemeente, value in spread.items())\n",
|
||||
" for gemeente, gemeente_name in enumerate(gemeente_names):\n",
|
||||
" if gemeente in spread:\n",
|
||||
" value = spread[gemeente]\n",
|
||||
" percentage = value / gemeente_normalizer[gemeente]\n",
|
||||
" color_value = percentage / max_value\n",
|
||||
" color = '#%02x%02x%02x' % tuple(int(255 * c) for c in cmap(color_value)[:3])\n",
|
||||
" \n",
|
||||
" polygon = folium.Polygon(coords_folium[gemeente], fill_color=color, fill_opacity=0.8,\n",
|
||||
" color='#555555', popup='{} ({}, {}%)'.format(gemeente_name, value, round(100*percentage)))\n",
|
||||
" centroid = shapes[gemeente].centroid\n",
|
||||
" centroid = (centroid.y, centroid.x)\n",
|
||||
"# folium.Circle(centroid, color='green', radius=200).add_to(group)\n",
|
||||
" folium.map.Marker(\n",
|
||||
" [shapes[gemeente].centroid.y, shapes[gemeente].centroid.x],\n",
|
||||
" icon=folium.DivIcon(\n",
|
||||
" icon_size=(50, 24),\n",
|
||||
" icon_anchor=(25, 12),\n",
|
||||
" html='<div class=\"percentage-label\" style=\"font-size: 12pt; background-color: rgba(255,255,255,0.8); border-radius: 12px; text-align: center;\">{:d}%</div>'.format(int(100 * percentage)),\n",
|
||||
" )\n",
|
||||
" ).add_to(group)\n",
|
||||
" else:\n",
|
||||
" polygon = folium.Polygon(coords_folium[gemeente], fill_color=None, fill_opacity=0, color='#555555')\n",
|
||||
" polygon.add_to(group)\n",
|
||||
" group.add_to(m)\n",
|
||||
" pecentage_labels.add_to(m)\n",
|
||||
" folium.map.LayerControl('topright', collapsed=False).add_to(m)\n",
|
||||
" JsButton(\n",
|
||||
" title='<i class=\"fas fa-tags\"></i>',\n",
|
||||
" function=\"\"\"\n",
|
||||
" function(btn, map){\n",
|
||||
" $('.percentage-label').toggle();\n",
|
||||
" }\n",
|
||||
" \"\"\"\n",
|
||||
" ).add_to(m)\n",
|
||||
"# display(m)\n",
|
||||
" m.save('maps/heatmaps/{}.html'.format(question))\n",
|
||||
"# break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob\n",
|
||||
"with open('maps/heatmaps/index.html', 'w') as f:\n",
|
||||
" f.write('<html><head></head><body>' + \n",
|
||||
" '<br/>\\n'.join(\n",
|
||||
" '\\t<a href=\"http://herbertkruitbosch.com/pronunciation_maps/{}\">{}<a>'.format(fn[5:], fn[14:-5].replace('_', ' '))\n",
|
||||
" for fn in sorted(\n",
|
||||
" glob.glob('maps/heatmaps/*.html')\n",
|
||||
" )\n",
|
||||
" ) + \"</body></html>\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@@ -1,312 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Geographical pronunciation statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 128,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import numpy\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
|
||||
"\n",
|
||||
"%matplotlib notebook\n",
|
||||
"from matplotlib import pyplot\n",
|
||||
"import folium\n",
|
||||
"from IPython.display import display\n",
|
||||
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
|
||||
"from jsbutton import JsButton\n",
|
||||
"from shapely.geometry import LineString, MultiLineString\n",
|
||||
"from jupyter_progressbar import ProgressBar\n",
|
||||
"from collections import defaultdict, Counter\n",
|
||||
"from ipy_table import make_table\n",
|
||||
"from html import escape\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"from random import shuffle\n",
|
||||
"import pickle\n",
|
||||
"from jupyter_progressbar import ProgressBar"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 129,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('friesland_wijken.p3', 'rb') as f:\n",
|
||||
" wijken, wijk_shapes = pickle.load(f)\n",
|
||||
"\n",
|
||||
"wijk_names = [wijk['properties']['GM_NAAM'] + ', ' + wijk['properties'].get('WK_NAAM', '') for wijk in wijken['features']]\n",
|
||||
"\n",
|
||||
"def get_wijk(point):\n",
|
||||
" for i, shape in enumerate(wijk_shapes):\n",
|
||||
" if shape.contains(point):\n",
|
||||
" return i\n",
|
||||
" return -1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 130,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def listify(rd_multipolygon):\n",
|
||||
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
|
||||
" return list(rd_multipolygon)\n",
|
||||
" return [\n",
|
||||
" listify(element)\n",
|
||||
" for element in rd_multipolygon\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 131,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Answers to how participants state a word should be pronounces.\n",
|
||||
"\n",
|
||||
"answers = pandas.read_sql('''\n",
|
||||
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
|
||||
" ON result.id = answer.prediction_quiz_id\n",
|
||||
"''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 132,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
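"# Drop questions whose answers all share identical coordinates (zero variance\n",
"# in user_lat and user_lng), since those carry no usable location information.\n",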
"zero_latlng_questions = {\n",
|
||||
" q\n",
|
||||
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
|
||||
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
|
||||
"}\n",
|
||||
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 133,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
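"# Recursively swap (lng, lat) pairs to (lat, lng):\n",
"# GeoJSON stores longitude first, while folium expects latitude first.\n",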
"def reverse(rd_multipolygon):\n",
|
||||
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
|
||||
" return rd_multipolygon[::-1]\n",
|
||||
" return [\n",
|
||||
" reverse(element)\n",
|
||||
" for element in rd_multipolygon\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 134,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" # Remove the CWD from sys.path while we load stuff.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Takes approximately 2 minutes\n",
|
||||
"points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
|
||||
"\n",
|
||||
"wijk_map = dict()\n",
|
||||
"for lng, lat in points:\n",
|
||||
" wijk_map[(lng, lat)] = get_wijk(Point(lng, lat))\n",
|
||||
"\n",
|
||||
"answers_filtered['wijk'] = [\n",
|
||||
" wijk_map[(lng, lat)]\n",
|
||||
" for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 135,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
|
||||
" lambda x: x.replace('\"', '').replace('*', ''))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 137,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "ea89078b81da4daba82bcd4b1ddbe8c2",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
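"# For each question, build a folium map with one selectable layer per answer;\n",
"# every wijk polygon is coloured by the share of its respondents giving that answer.\n",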
"cmap = pyplot.get_cmap('YlOrRd')\n",
|
||||
"\n",
|
||||
"for question, rows in ProgressBar(\n",
|
||||
" answers_filtered.groupby('question_text_url'),\n",
|
||||
" size=len(answers_filtered['question_text_url'].unique())\n",
|
||||
"):\n",
|
||||
" m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles=None, zoom_start=9)\n",
|
||||
" order = [a for _, a in sorted((\n",
|
||||
" (r['user_lat'], answer)\n",
|
||||
" for answer, r in rows.groupby('answer_text').count().iterrows()\n",
|
||||
" ), reverse=True)]\n",
|
||||
" wijk_normalizer = {\n",
|
||||
" wijk: r['user_lat']\n",
|
||||
" for wijk, r in rows.groupby('wijk').count().iterrows()\n",
|
||||
" }\n",
|
||||
" for answer_text in (order):\n",
|
||||
" rows_ = rows[rows['answer_text'] == answer_text]\n",
|
||||
" if (rows_['wijk'] >= 0).sum() <= 0:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" spread = {\n",
|
||||
" wijk: r['user_lat']\n",
|
||||
" for wijk, r in rows_.groupby('wijk').count().iterrows()\n",
|
||||
" if wijk >= 0\n",
|
||||
" }\n",
|
||||
" n_answers = sum(spread.values())\n",
|
||||
" \n",
|
||||
" name = '{} ({})'.format(answer_text, n_answers)\n",
|
||||
" group = folium.FeatureGroup(name=name, overlay=False)\n",
|
||||
" folium.TileLayer(tiles='stamentoner').add_to(group)\n",
|
||||
" \n",
|
||||
" max_value = max(value / wijk_normalizer[wijk] for wijk, value in spread.items())\n",
|
||||
" \n",
|
||||
" for wijk, wijk_name in enumerate(wijk_names):\n",
|
||||
" coordinates = reverse(wijken['features'][wijk]['geometry']['coordinates'])\n",
|
||||
" if wijk in spread:\n",
|
||||
" value = spread[wijk]\n",
|
||||
" percentage = value / wijk_normalizer[wijk]\n",
|
||||
" color_value = percentage / max_value\n",
|
||||
" color = '#%02x%02x%02x' % tuple(int(255 * c) for c in cmap(color_value)[:3])\n",
|
||||
" \n",
|
||||
" polygon = folium.Polygon(\n",
|
||||
" coordinates, fill_color=color, fill_opacity=0.8,\n",
|
||||
" color='#555555', popup='{} ({}, {: 3d}%)'.format(wijk_name, value, int(100*percentage))\n",
|
||||
" \n",
|
||||
" )\n",
|
||||
" centroid = wijk_shapes[wijk].centroid\n",
|
||||
" centroid = (centroid.y, centroid.x)\n",
|
||||
" folium.map.Marker(\n",
|
||||
" [wijk_shapes[wijk].centroid.y, wijk_shapes[wijk].centroid.x],\n",
|
||||
" icon=folium.DivIcon(\n",
|
||||
" icon_size=(30, 16),\n",
|
||||
" icon_anchor=(15, 8),\n",
|
||||
" html='<div class=\"percentage-label\" style=\"font-size: 8pt; background-color: rgba(255,255,255,0.8); border-radius: 4px; text-align: center;\">{:d}%</div>'.format(int(100 * percentage)),\n",
|
||||
" )\n",
|
||||
" ).add_to(group)\n",
|
||||
" else:\n",
|
||||
" polygon = folium.Polygon(coordinates, fill_color=None, fill_opacity=0, color='#555555')\n",
|
||||
" polygon.add_to(group)\n",
|
||||
" group.add_to(m)\n",
|
||||
" JsButton(\n",
|
||||
" title='<i class=\"fas fa-tags\"></i>',\n",
|
||||
" function=\"\"\"\n",
|
||||
" function(btn, map){\n",
|
||||
" $('.percentage-label').toggle();\n",
|
||||
" }\n",
|
||||
" \"\"\"\n",
|
||||
" ).add_to(m)\n",
|
||||
" folium.map.LayerControl('topright', collapsed=False).add_to(m)\n",
|
||||
"# display(m)\n",
|
||||
" m.save('maps/heatmaps-wijk/{}.html'.format(question))\n",
|
||||
"# break"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 138,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob\n",
|
||||
"with open('maps/heatmaps-wijk/index.html', 'w') as f:\n",
|
||||
" f.write('<html><head></head><body>' + \n",
|
||||
" '<br/>\\n'.join(\n",
|
||||
" '\\t<a href=\"{}\">{}<a>'.format(fn, fn[:-5].replace('_', ' '))\n",
|
||||
" for fn in sorted(\n",
|
||||
" glob.glob('maps/heatmaps-wijk/*.html')\n",
|
||||
" )\n",
|
||||
" for fn in [fn[len('maps/heatmaps-wijk/'):]]\n",
|
||||
" ) + \"</body></html>\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@@ -1,327 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Geographical pronunciation statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import numpy\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
|
||||
"\n",
|
||||
"%matplotlib notebook\n",
|
||||
"from matplotlib import pyplot\n",
|
||||
"import folium\n",
|
||||
"from IPython.display import display\n",
|
||||
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
|
||||
"from jupyter_progressbar import ProgressBar\n",
|
||||
"from collections import defaultdict\n",
|
||||
"from ipy_table import make_table\n",
|
||||
"from html import escape\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from matplotlib.colors import LogNorm\n",
|
||||
"from sklearn import mixture\n",
|
||||
"from skimage.measure import find_contours\n",
|
||||
"from collections import Counter\n",
|
||||
"from random import shuffle"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Borders of Frysian municipalities\n",
|
||||
"\n",
|
||||
"with open('Friesland_AL8.GeoJson') as f:\n",
|
||||
" gemeentes = json.load(f)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"shapes = [shape(feature['geometry']) for feature in gemeentes['features']]\n",
|
||||
"gemeente_names = [feature['properties']['name'] for feature in gemeentes['features']]\n",
|
||||
"\n",
|
||||
"def get_gemeente(point):\n",
|
||||
" for i, shape in enumerate(shapes):\n",
|
||||
" if shape.contains(point):\n",
|
||||
" return i\n",
|
||||
" return -1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Answers to how participants state a word should be pronounces.\n",
|
||||
"\n",
|
||||
"answers = pandas.read_sql('''\n",
|
||||
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
|
||||
" ON result.id = answer.prediction_quiz_id\n",
|
||||
"''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Takes approximately 2 minutes\n",
|
||||
"\n",
|
||||
"gemeente_map = {\n",
|
||||
" (lng, lat): get_gemeente(Point(lng, lat))\n",
|
||||
" for lng, lat in set(zip(answers['user_lng'], answers['user_lat']))\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"answers['gemeente'] = [\n",
|
||||
" gemeente_map[(lng, lat)]\n",
|
||||
" for lat, lng in zip(answers['user_lat'], answers['user_lng'])\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Mapping pronunciations\n",
|
||||
"\n",
|
||||
"The idea is to plot each pronunciation as a point of a different color, now only seems to show participation density.\n",
|
||||
"\n",
|
||||
"Slow, so started with the first question."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# cmap = pyplot.get_cmap('gist_rainbow')\n",
|
||||
"\n",
|
||||
"# std = (1.89, 1.35)\n",
|
||||
"\n",
|
||||
"# for _, (question, rows) in zip(range(3), answers.groupby('question_text')):\n",
|
||||
"# plt.figure()\n",
|
||||
"# n_answers = len(rows.groupby('answer_text').count())\n",
|
||||
"# colors = cmap(range(256))[::256 // n_answers]\n",
|
||||
"# for (answer, rows_), color in zip(rows.groupby('answer_text'), colors):\n",
|
||||
"# if len(rows_) < 100:\n",
|
||||
"# continue\n",
|
||||
"# color = '#%02x%02x%02x' % tuple(int(c*255) for c in color[:3])\n",
|
||||
"# X = rows_[['user_lat', 'user_lng']].as_matrix()\n",
|
||||
"\n",
|
||||
"# clf = mixture.GaussianMixture(n_components=5, covariance_type='full')\n",
|
||||
"# clf.fit(X)\n",
|
||||
"# xlim = numpy.percentile(X[:, 0], [1, 99.5])\n",
|
||||
"# ylim = numpy.percentile(X[:, 1], [1, 99.5])\n",
|
||||
"# xlim = [2*xlim[0] - xlim[1], 2*xlim[1] - xlim[0]]\n",
|
||||
"# ylim = [2*ylim[0] - ylim[1], 2*ylim[1] - ylim[0]]\n",
|
||||
" \n",
|
||||
"# x = np.linspace(*xlim, 1000)\n",
|
||||
"# y = np.linspace(*ylim, 1000)\n",
|
||||
"# xx, yy = np.meshgrid(x, y)\n",
|
||||
"# xxyy = np.array([xx.ravel(), yy.ravel()]).T\n",
|
||||
"# z = np.exp(clf.score_samples(xxyy))\n",
|
||||
"# z = z.reshape(xx.shape)\n",
|
||||
" \n",
|
||||
"# z_sorted = sorted(z.ravel(), reverse=True)\n",
|
||||
"# z_sorted_cumsum = np.cumsum(z_sorted)\n",
|
||||
"# split = np.where(z_sorted_cumsum > (z_sorted_cumsum[-1] * 0.5))[0][0]\n",
|
||||
"# threshold = z_sorted[split]\n",
|
||||
"# threshold\n",
|
||||
"\n",
|
||||
"# # p = list(range(0, 100, 5))\n",
|
||||
"\n",
|
||||
"# p = [80]\n",
|
||||
"# plt.contour(xx, yy, z, levels=[threshold], colors=[color])\n",
|
||||
"# plt.plot(X[:, 0], X[:, 1], '.', c=color)\n",
|
||||
"# plt.xlim(*xlim)\n",
|
||||
"# plt.ylim(*ylim)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"zero_latlng_questions = {\n",
|
||||
" q\n",
|
||||
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
|
||||
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
|
||||
"}\n",
|
||||
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
|
||||
" lambda x: x.replace('\"', '').replace('*', ''))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
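"# Read a pre-generated Glasbey palette of n distinct colours from disk,\n",
"# optionally skipping pure black and pure white.\n",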
"def get_palette(n, no_black=True, no_white=True):\n",
|
||||
" with open('glasbey/{}_colors.txt'.format(n + no_black + no_white)) as f:\n",
|
||||
" return [\n",
|
||||
" '#%02x%02x%02x' % tuple(int(c) for c in line.replace('\\n', '').split(','))\n",
|
||||
" for line in f\n",
|
||||
" if not no_black or line != '0,0,0\\n'\n",
|
||||
" if not no_white or line != '255,255,255\\n'\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"options = [x[1] for x in sorted([\n",
|
||||
" (row['user_lng'], answer_text)\n",
|
||||
" for answer_text, row in rows.groupby('answer_text').agg({'user_lng': 'count'}).iterrows()\n",
|
||||
"], reverse=True)]\n",
|
||||
"\n",
|
||||
"groups = [options[:len(options) // 2], options[len(options) // 2:]]\n",
|
||||
"groups"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"80000 / 350"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import glob\n",
|
||||
"with open('index.html', 'w') as f:\n",
|
||||
" f.write('<html><head></head><body>' + \n",
|
||||
" '<br/>\\n'.join(\n",
|
||||
" '\\t<a href=\"http://herbertkruitbosch.com/pronunciation_maps/{}\">{}<a>'.format(fn, fn[:-4].replace('_', ' '))\n",
|
||||
" for fn in sorted(\n",
|
||||
" glob.glob('*_all.html') +\n",
|
||||
" glob.glob('*_larger.html') +\n",
|
||||
" glob.glob('*_smaller.html')\n",
|
||||
" )\n",
|
||||
" ) + \"</body></html>\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# cmap = pyplot.get_cmap('gist_rainbow')\n",
|
||||
"# colors = pyplot.get_cmap('tab20')\n",
|
||||
"# colors = ['#e6194b', '#3cb44b', '#ffe119', '#0082c8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#d2f53c', '#fabebe', '#008080', '#e6beff', '#aa6e28', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000080', '#808080']\n",
|
||||
"\n",
|
||||
"std = (1.89, 1.35)\n",
|
||||
"\n",
|
||||
"for question, rows in answers_filtered.groupby('question_text_url'):\n",
|
||||
"# question = rows['question_text_url'][0]\n",
|
||||
" n_answers = len(rows.groupby('answer_text').count())\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" options = [x[1] for x in sorted([\n",
|
||||
" (row['user_lng'], answer_text)\n",
|
||||
" for answer_text, row in rows.groupby('answer_text').agg({'user_lng': 'count'}).iterrows()\n",
|
||||
" ], reverse=True)]\n",
|
||||
" groups = [options]\n",
|
||||
" if n_answers > 6:\n",
|
||||
" groups.extend([options[:6], options[6:]])\n",
|
||||
" \n",
|
||||
" for group, group_name in zip(groups, ['all', 'larger', 'smaller']):\n",
|
||||
" m = folium.Map((rows['user_lat'].median(), rows['user_lng'].median()), tiles='stamentoner', zoom_start=9)\n",
|
||||
" # colors = cmap(range(256))[::256 // n_answers]\n",
|
||||
" colors = get_palette(len(group))\n",
|
||||
" for answer, color in zip(group, colors):\n",
|
||||
" rows_ = rows[rows['answer_text'] == answer]\n",
|
||||
" # color = '#%02x%02x%02x' % tuple(int(c*255) for c in color[:3])\n",
|
||||
" name = '<span style=\\\\\"color:{}; \\\\\">{} ({})'.format(color, escape(answer), len(rows_))\n",
|
||||
"\n",
|
||||
" group = folium.FeatureGroup(name=name)\n",
|
||||
" colormap[name] = color\n",
|
||||
"\n",
|
||||
" for point in zip(rows_['user_lat'], rows_['user_lng']):\n",
|
||||
" point = tuple(p + 0.01 * s * numpy.random.randn() for p, s in zip(point, std))\n",
|
||||
" folium.Circle(\n",
|
||||
" point, color=None, fill_color=color,\n",
|
||||
" radius=400*min(1, 100 / len(rows_)), fill_opacity=1 #1 - 0.5 * len(rows_) / len(rows)\n",
|
||||
" ).add_to(group)\n",
|
||||
" group.add_to(m)\n",
|
||||
" folium.map.LayerControl('topright', collapsed=False).add_to(m)\n",
|
||||
" \n",
|
||||
" print(group_name, question)\n",
|
||||
" if group_name == 'larger':\n",
|
||||
" display(m)\n",
|
||||
" m.save('{}_{}.html'.format(question, group_name))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@@ -1,397 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Geographical pronunciation statistics"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas\n",
|
||||
"import MySQLdb\n",
|
||||
"import numpy\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"db = MySQLdb.connect(user='root', passwd='Nmmxhjgt1@', db='stimmen', charset='utf8')\n",
|
||||
"\n",
|
||||
"%matplotlib notebook\n",
|
||||
"from matplotlib import pyplot\n",
|
||||
"import folium\n",
|
||||
"from IPython.display import display\n",
|
||||
"from shapely.geometry import Polygon, MultiPolygon, shape, Point\n",
|
||||
"from jsbutton import JsButton\n",
|
||||
"from shapely.geometry import LineString, MultiLineString\n",
|
||||
"from jupyter_progressbar import ProgressBar\n",
|
||||
"from collections import defaultdict, Counter\n",
|
||||
"from ipy_table import make_table\n",
|
||||
"from html import escape\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"from random import shuffle\n",
|
||||
"import pickle\n",
|
||||
"from jupyter_progressbar import ProgressBar"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('friesland_wijken_land_only.p3', 'rb') as f:\n",
|
||||
" wijken, wijk_shapes = pickle.load(f)\n",
|
||||
"\n",
|
||||
"for x in wijken['features']:\n",
|
||||
" x['type'] = 'Feature'\n",
|
||||
"\n",
|
||||
"with open('friesland_wijken_geojson.json', 'w') as f:\n",
|
||||
" wijken['features'] = wijken['features']\n",
|
||||
" json.dump(wijken, f, indent=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
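"# Convert the wijk GeoJSON written above to KML with GDAL's VectorTranslate.\n",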
"from osgeo import gdal, ogr\n",
|
||||
"\n",
|
||||
"srcDS = gdal.OpenEx('friesland_wijken_geojson.json')\n",
|
||||
"ds = gdal.VectorTranslate('friesland_wijken_geojson.kml', srcDS, format='kml')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'k4luâ7mWBAgDSKhCVaysNdr TjeoE85JzëGúcM.,IRtp2-bLû69Un0wZF3Hv1iOfô'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
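"# Collect the distinct characters occurring in the wijk names (note the diacritics in the output).\n",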
"''.join({\n",
|
||||
" c\n",
|
||||
" for wijk in wijken['features']\n",
|
||||
" for c in wijk['properties']['gemeente_en_wijk_naam']\n",
|
||||
"})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('friesland_wijken_land_only.p3', 'rb') as f:\n",
|
||||
" wijken, wijk_shapes = pickle.load(f)\n",
|
||||
"\n",
|
||||
"wijk_names = [wijk['properties']['gemeente_en_wijk_naam'] for wijk in wijken['features']]\n",
|
||||
"\n",
|
||||
"def get_wijk(point):\n",
|
||||
" for i, shape in enumerate(wijk_shapes):\n",
|
||||
" if shape.contains(point):\n",
|
||||
" return i\n",
|
||||
" return -1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def listify(rd_multipolygon):\n",
|
||||
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
|
||||
" return list(rd_multipolygon)\n",
|
||||
" return [\n",
|
||||
" listify(element)\n",
|
||||
" for element in rd_multipolygon\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Answers to how participants state a word should be pronounces.\n",
|
||||
"\n",
|
||||
"answers = pandas.read_sql('''\n",
|
||||
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
|
||||
"FROM core_surveyresult as survey\n",
|
||||
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
|
||||
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
|
||||
" ON result.id = answer.prediction_quiz_id\n",
|
||||
"''', db)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"zero_latlng_questions = {\n",
|
||||
" q\n",
|
||||
" for q, row in answers.groupby('question_text').agg('std').iterrows()\n",
|
||||
" if row['user_lat'] == 0 and row['user_lng'] == 0\n",
|
||||
"}\n",
|
||||
"answers_filtered = answers[answers['question_text'].map(lambda x: x not in zero_latlng_questions)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def reverse(rd_multipolygon):\n",
|
||||
" if len(rd_multipolygon) == 2 and tuple(map(type, rd_multipolygon)) == (float, float):\n",
|
||||
" return rd_multipolygon[::-1]\n",
|
||||
" return [\n",
|
||||
" reverse(element)\n",
|
||||
" for element in rd_multipolygon\n",
|
||||
" ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" # Remove the CWD from sys.path while we load stuff.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Takes approximately 2 minutes\n",
|
||||
"points = set(zip(answers_filtered['user_lng'], answers_filtered['user_lat']))\n",
|
||||
"\n",
|
||||
"wijk_map = dict()\n",
|
||||
"for lng, lat in points:\n",
|
||||
" wijk_map[(lng, lat)] = get_wijk(Point(lng, lat))\n",
|
||||
"\n",
|
||||
"answers_filtered['wijk'] = [\n",
|
||||
" wijk_map[(lng, lat)]\n",
|
||||
" for lat, lng in zip(answers_filtered['user_lat'], answers_filtered['user_lng'])\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" \n",
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" \"\"\"\n",
|
||||
"/home/herbert/.virtualenvs/stimmenfryslan/lib/python3.6/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"answers_filtered['question_text_url'] = answers_filtered['question_text'].map(\n",
|
||||
" lambda x: x.replace('\"', '').replace('*', ''))\n",
|
||||
"\n",
|
||||
"answers_filtered['wijk_name'] = answers_filtered['wijk'].map(\n",
|
||||
" lambda x: wijk_names[x])\n",
|
||||
"\n",
|
||||
"answers_filtered['answer_text_url'] = answers_filtered['answer_text'].map(\n",
|
||||
" lambda x: x[x.find('('):x.find(')')][1:])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wijken = pandas.DataFrame([\n",
|
||||
" {'#name': name, 'longitude': shape.centroid.xy[0][0], 'latitude': shape.centroid.xy[1][0]}\n",
|
||||
" for name, shape in zip(wijk_names, wijk_shapes)\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"wijken.set_index('#name', inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def merge_dicts(*args):\n",
|
||||
" for arg in args[1:]:\n",
|
||||
" args[0].update(arg)\n",
|
||||
" return args[0]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
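"# Pivot to one row per wijk: for every question, list each distinct\n",
"# pronunciation together with how often it was given in that wijk.\n",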
"pronunciations = pandas.DataFrame([\n",
|
||||
" merge_dicts(\n",
|
||||
" {\n",
|
||||
" question: answers['answer_text_url']\n",
|
||||
" for question, answers in rows.groupby(\n",
|
||||
" 'question_text_url'\n",
|
||||
" ).agg(\n",
|
||||
" {\n",
|
||||
" 'answer_text_url': lambda x: [\n",
|
||||
" {\n",
|
||||
" 'pronunciation': answer_text,\n",
|
||||
" 'count': answer_texts.count(answer_text)\n",
|
||||
" }\n",
|
||||
" for answer_texts in [list(x)]\n",
|
||||
" for answer_text in sorted(set(x))\n",
|
||||
" \n",
|
||||
" ] \n",
|
||||
" }\n",
|
||||
" ).iterrows()\n",
|
||||
" }, {\n",
|
||||
" 'wijk': wijk_names[wijk]\n",
|
||||
" })\n",
|
||||
" for wijk, rows in answers_filtered.groupby('wijk')\n",
|
||||
" if wijk >= 0\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"pronunciations.set_index('wijk', inplace=True)\n",
|
||||
"pronunciations\n",
|
||||
"\n",
|
||||
"columns = list(pronunciations.columns)\n",
|
||||
"\n",
|
||||
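"# Split the pivot into two tables: per-wijk percentages for every\n",
"# question/pronunciation pair, and a ' / '-joined pronunciation label per question.\n",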
"counts = pandas.DataFrame([\n",
|
||||
" merge_dicts({\n",
|
||||
" column + \": \" + x['pronunciation']: 100 * x['count'] / total\n",
|
||||
" for column in columns\n",
|
||||
" for total in [sum(x['count'] for x in row[column])]\n",
|
||||
" for x in row[column]\n",
|
||||
" }, {'': wijk})\n",
|
||||
" for wijk, row in pronunciations.iterrows()\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"pronunciations = pandas.DataFrame([\n",
|
||||
" merge_dicts({\n",
|
||||
" column: ' / '.join(str(x['pronunciation']) for x in row[column])\n",
|
||||
" for column in columns\n",
|
||||
" }, {'': wijk})\n",
|
||||
" for wijk, row in pronunciations.iterrows()\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"pronunciations.set_index('', inplace=True)\n",
|
||||
"counts.set_index('', inplace=True)\n",
|
||||
"counts[counts != counts] = 0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<function shapely.geometry.geo.shape(context)>"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pronunciations.to_csv('pronunciations_by_wijk.tsv', sep='\\t')\n",
|
||||
"counts.to_csv('pronunciation_percentages_by_wijk.tsv', sep='\\t')\n",
|
||||
"wijken.to_csv('wijk_centroid.tsv', sep='\\t', columns=['longitude', 'latitude'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open('pronunciations_by_wijk.tsv') as f:\n",
|
||||
" p = list(f)\n",
|
||||
" \n",
|
||||
"with open('pronunciation_count_by_wijk.tsv') as f:\n",
|
||||
" c = list(f)\n",
|
||||
"\n",
|
||||
"with open('wijk_centroid.tsv') as f:\n",
|
||||
" w = list(f)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
File diff suppressed because one or more lines are too long
@@ -1,5 +1,14 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Show province segmentation\n",
|
||||
"\n",
|
||||
"In gemeentes and wijken as calculated in `Segment Provinces in Wijken and Gemeentes.ipynb`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
@@ -34,7 +43,9 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
File diff suppressed because one or more lines are too long
@@ -1,102 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from glob import glob\n",
|
||||
"\n",
|
||||
"import os"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"waves = [\n",
|
||||
" wave\n",
|
||||
" for location in os.listdir('data')\n",
|
||||
" for date in os.listdir(os.path.join('data', location))\n",
|
||||
" for wave in os.listdir(os.path.join('data', location, date))\n",
|
||||
"]\n",
|
||||
"assert len(waves) == len(set(waves)), \"Not all filenames are unique :(\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
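"# Move every recording into a per-word folder; the word is taken from the\n",
"# second underscore-separated field of the filename.\n",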
"for location in os.listdir('data'):\n",
|
||||
" for date in os.listdir(os.path.join('data', location)):\n",
|
||||
" for wave in os.listdir(os.path.join('data', location, date)):\n",
|
||||
" source = os.path.join('data', location, date, wave)\n",
|
||||
" destination = os.path.join('per_word', wave.split('_')[1])\n",
|
||||
" if not os.path.isdir(destination):\n",
|
||||
" os.mkdir(destination)\n",
|
||||
" os.rename(source, os.path.join(destination, wave))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas\n",
|
||||
"\n",
|
||||
"data = pandas.read_csv('/home/herbert/picture-game-result-export.csv', delimiter=';')\n",
|
||||
"data['Filename'] = [x.split('/')[-1] for x in data['Opname']]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
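"# Rename recordings whose filename is not in the relevant set, prefixing them with 'irrelevant_accent_'.\n",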
"for word in os.listdir('per_word'):\n",
|
||||
" for wave in os.listdir(os.path.join('per_word', word)):\n",
|
||||
" source = os.path.join('per_word', word, wave)\n",
|
||||
" if wave not in relevant:\n",
|
||||
" destination = os.path.join('per_word', word, 'irrelevant_accent_' + wave)\n",
|
||||
" os.rename(source, destination)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data.to_excel('/home/herbert/picture-game-result-export-filename.xlsx')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.5.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|