{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation tables, simple example\n",
"\n",
"Simple example that creates Gabmap files for two words with a few pronunciations and two regions."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import getpass\n",
"import json\n",
"import os\n",
"import sys\n",
"\n",
"# Extend the module search path with the parent directory so that the `gabmap` and\n",
"# `stimmen` imports below can resolve (presumably project-local modules).\n",
"sys.path.append('..')\n",
"\n",
"import MySQLdb\n",
"import pandas\n",
"from shapely.geometry import shape, Point\n",
"\n",
"from gabmap import create_gabmap_dataframes\n",
"from stimmen.geojson import merge_features\n",
"\n",
"# Database credentials are read from the environment so that no secret is stored in\n",
"# the notebook. When STIMMEN_DB_PASSWORD is unset or empty, the password is asked for\n",
"# interactively. User and database name keep their previous values as defaults.\n",
"db = MySQLdb.connect(\n",
"    user=os.environ.get('STIMMEN_DB_USER', 'root'),\n",
"    passwd=os.environ.get('STIMMEN_DB_PASSWORD') or getpass.getpass('MySQL password: '),\n",
"    db=os.environ.get('STIMMEN_DB_NAME', 'stimmen'),\n",
"    charset='utf8',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# GeoJSON is UTF-8 by specification (RFC 7946). The encoding is stated explicitly so\n",
"# that reading the file does not depend on the platform's default locale encoding.\n",
"with open('../data/Friesland_wijken.geojson', encoding='utf-8') as f:\n",
"    regions = json.load(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and simplify"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# One row per answered quiz question: the question text (a word), the answer the\n",
"# participant chose for how it should be pronounced, and the participant's\n",
"# latitude/longitude as stored with the survey result.\n",
"answers_query = '''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"'''\n",
"\n",
"answers = pandas.read_sql(answers_query, db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"municipalities = ['Heerenveen', 'Leeuwarden']\n",
"\n",
"# Work on a deep copy so the loaded `regions` stay untouched, then merge, one\n",
"# municipality at a time, all features whose GM_NAAM property equals its name.\n",
"regions_simple = copy.deepcopy(regions)\n",
"for municipality in municipalities:\n",
"    regions_simple = merge_features(\n",
"        regions_simple,\n",
"        # Bind the current name as a default so the lambda does not depend on the loop variable.\n",
"        condition=lambda feature, wanted=municipality: feature['properties']['GM_NAAM'] == wanted,\n",
"    )\n",
"\n",
"# Keep only the last two features; this assumes merge_features puts each merged\n",
"# feature at the end of the list, in call order -- TODO confirm against stimmen.geojson.\n",
"regions_simple['features'] = regions_simple['features'][-2:]\n",
"\n",
"# Label the two remaining features with the municipality they represent.\n",
"for position, municipality in enumerate(municipalities):\n",
"    regions_simple['features'][position]['properties']['name'] = municipality"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"example_questions = ['\"blad\" (aan een boom)', '\"vis\"']\n",
"\n",
"\n",
"def _strip_markup(question):\n",
"    # Drop the double quotes and asterisks that decorate the question text.\n",
"    return question.replace('\"', '').replace('*', '')\n",
"\n",
"\n",
"def _between_parentheses(answer):\n",
"    # Text after the first '(' and before the first ')', when both are present.\n",
"    opening = answer.find('(')\n",
"    closing = answer.find(')')\n",
"    return answer[opening:closing][1:]\n",
"\n",
"\n",
"# Restrict the answers to the two example words, on a copy so `answers` is not modified.\n",
"answers_simple = answers[answers['question_text'].isin(example_questions)].copy()\n",
"\n",
"answers_simple['question_text'] = answers_simple['question_text'].map(_strip_markup)\n",
"\n",
"# Keep only the parenthesised part of each answer (presumably the phonetic transcription).\n",
"answers_simple['answer_text'] = answers_simple['answer_text'].map(_between_parentheses)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two words, *blad* and *vis*, with 4 and 2 pronunciations respectively"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" answer_text | \n",
"
\n",
" \n",
" question_text | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" blad (aan een boom) | \n",
" 4 | \n",
"
\n",
" \n",
" vis | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" answer_text\n",
"question_text \n",
"blad (aan een boom) 4\n",
"vis 2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct answer texts (pronunciations) recorded for each word.\n",
"pronunciations_per_word = answers_simple.groupby('question_text').agg(\n",
"    {'answer_text': lambda pronunciations: len(set(pronunciations))}\n",
")\n",
"pronunciations_per_word"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Tell create_gabmap_dataframes which answer columns and which region property to use.\n",
"gabmap_options = {\n",
"    'latitude_column': 'user_lat',\n",
"    'longitude_column': 'user_lng',\n",
"    'word_column': 'question_text',\n",
"    'pronunciation_column': 'answer_text',\n",
"    'region_name_property': 'name',\n",
"}\n",
"\n",
"gabmap_tables = create_gabmap_dataframes(regions_simple, answers_simple, **gabmap_options)\n",
"centroids_example, pronunciations_example, counts_example = gabmap_tables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Resulting tables\n",
"\n",
"Stored as tab-separated files for Gabmap"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" latitude | \n",
" longitude | \n",
"
\n",
" \n",
" #name | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Heerenveen | \n",
" 52.996076 | \n",
" 5.977925 | \n",
"
\n",
" \n",
" Leeuwarden | \n",
" 53.169940 | \n",
" 5.797613 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" latitude longitude\n",
"#name \n",
"Heerenveen 52.996076 5.977925\n",
"Leeuwarden 53.169940 5.797613"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"centroids_example"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" blad (aan een boom) | \n",
" vis | \n",
"
\n",
" \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Heerenveen | \n",
" blet / blɑt / blɔd / blɛ:t | \n",
" fisk / fɪs | \n",
"
\n",
" \n",
" Leeuwarden | \n",
" blet / blɑt / blɔd / blɛ:t | \n",
" fisk / fɪs | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" blad (aan een boom) vis\n",
" \n",
"Heerenveen blet / blɑt / blɔd / blɛ:t fisk / fɪs\n",
"Leeuwarden blet / blɑt / blɔd / blɛ:t fisk / fɪs"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pronunciations_example"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" blad (aan een boom): blet | \n",
" blad (aan een boom): blɑt | \n",
" blad (aan een boom): blɔd | \n",
" blad (aan een boom): blɛ:t | \n",
" vis: fisk | \n",
" vis: fɪs | \n",
"
\n",
" \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Heerenveen | \n",
" 31.654676 | \n",
" 2.158273 | \n",
" 2.158273 | \n",
" 64.028777 | \n",
" 52.517986 | \n",
" 47.482014 | \n",
"
\n",
" \n",
" Leeuwarden | \n",
" 7.865169 | \n",
" 7.022472 | \n",
" 8.707865 | \n",
" 76.404494 | \n",
" 75.000000 | \n",
" 25.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" blad (aan een boom): blet blad (aan een boom): blɑt \\\n",
" \n",
"Heerenveen 31.654676 2.158273 \n",
"Leeuwarden 7.865169 7.022472 \n",
"\n",
" blad (aan een boom): blɔd blad (aan een boom): blɛ:t vis: fisk \\\n",
" \n",
"Heerenveen 2.158273 64.028777 52.517986 \n",
"Leeuwarden 8.707865 76.404494 75.000000 \n",
"\n",
" vis: fɪs \n",
" \n",
"Heerenveen 47.482014 \n",
"Leeuwarden 25.000000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts_example"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# All three tables are written tab-separated.\n",
"tsv_options = {'sep': '\\t'}\n",
"\n",
"pronunciations_example.to_csv('../data/Pronunciations_example.gabmap.tsv', **tsv_options)\n",
"counts_example.to_csv('../data/Pronunciation_percentages_example.gabmap.tsv', **tsv_options)\n",
"\n",
"# The centroid table is written with longitude before latitude.\n",
"centroids_example.to_csv(\n",
"    '../data/Centroids_example.gabmap.tsv',\n",
"    columns=['longitude', 'latitude'],\n",
"    **tsv_options\n",
")\n",
"\n",
"# The simplified two-region map goes alongside the tables.\n",
"with open('../data/Gabmap_example.geojson', 'w') as geojson_file:\n",
"    json.dump(regions_simple, geojson_file, indent=1)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}