459 lines
12 KiB
Plaintext
459 lines
12 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Geographical pronunciation tables, simple example\n",
|
||
"\n",
|
||
"Simple example to create gabmap files for two words with few pronunciations and two regions."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import copy\n",
"import getpass\n",
"import json\n",
"import os\n",
"import sys\n",
"\n",
"# Make the packages in the parent directory importable\n",
"sys.path.append('..')\n",
"\n",
"import MySQLdb\n",
"import pandas\n",
"from shapely.geometry import shape, Point\n",
"\n",
"from gabmap import create_gabmap_dataframes\n",
"from stimmen.geojson import merge_features\n",
"\n",
"# Database credentials are read from the environment so that no secret is\n",
"# stored in the notebook; the password falls back to an interactive prompt.\n",
"# NOTE(review): a password was previously committed in this cell and should\n",
"# be treated as compromised and rotated.\n",
"db = MySQLdb.connect(\n",
"    user=os.environ.get('STIMMEN_DB_USER', 'root'),\n",
"    passwd=os.environ.get('STIMMEN_DB_PASSWORD') or getpass.getpass('MySQL password: '),\n",
"    db='stimmen',\n",
"    charset='utf8',\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Read the region polygons. The encoding is stated explicitly so that any\n",
"# non-ASCII property values decode the same way on every platform instead of\n",
"# depending on the locale's default encoding.\n",
"with open('../data/Friesland_wijken.geojson', encoding='utf-8') as f:\n",
"    regions = json.load(f)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Load and simplify"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# How participants state a word should be pronounced: one row per answer,\n",
"# with the quiz id, the user's coordinates, the question and the answer text\n",
"answers_query = '''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"'''\n",
"\n",
"answers = pandas.read_sql(sql=answers_query, con=db)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Merge the features of each municipality in turn, working on a copy so that\n",
"# `regions` stays untouched. Only the last two features are kept afterwards\n",
"# (presumably the merged ones, appended by merge_features - verify against\n",
"# stimmen.geojson) and they are labelled with the municipality name.\n",
"municipalities = ['Heerenveen', 'Leeuwarden']\n",
"\n",
"\n",
"def in_municipality(municipality):\n",
"    \"\"\"Return a predicate selecting features whose GM_NAAM equals `municipality`.\"\"\"\n",
"    return lambda feature: feature['properties']['GM_NAAM'] == municipality\n",
"\n",
"\n",
"regions_simple = copy.deepcopy(regions)\n",
"for municipality in municipalities:\n",
"    regions_simple = merge_features(\n",
"        regions_simple,\n",
"        condition=in_municipality(municipality),\n",
"    )\n",
"\n",
"regions_simple['features'] = regions_simple['features'][-2:]\n",
"\n",
"for position, municipality in enumerate(municipalities):\n",
"    regions_simple['features'][position]['properties']['name'] = municipality"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Restrict the answers to the two example words\n",
"example_questions = ['\"blad\" (aan een boom)', '\"vis\"']\n",
"answers_simple = answers[answers['question_text'].isin(example_questions)].copy()\n",
"\n",
"\n",
"def strip_markup(question):\n",
"    \"\"\"Remove double quotes and asterisks from a question text.\"\"\"\n",
"    return question.replace('\"', '').replace('*', '')\n",
"\n",
"\n",
"def text_in_parentheses(answer):\n",
"    \"\"\"Return what lies between the first '(' and the first ')' of an answer.\"\"\"\n",
"    start = answer.find('(')\n",
"    end = answer.find(')')\n",
"    return answer[start:end][1:]\n",
"\n",
"\n",
"answers_simple['question_text'] = answers_simple['question_text'].map(strip_markup)\n",
"answers_simple['answer_text'] = answers_simple['answer_text'].map(text_in_parentheses)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Two words, blad and vis, with 4 and 2 pronunciations respectively"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>answer_text</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>question_text</th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>blad (aan een boom)</th>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>vis</th>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" answer_text\n",
|
||
"question_text \n",
|
||
"blad (aan een boom) 4\n",
|
||
"vis 2"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Number of distinct pronunciations given for each word\n",
"answers_simple.groupby('question_text').agg(\n",
"    {'answer_text': lambda pronunciations: len(set(pronunciations))}\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Build the three gabmap tables from the simplified regions and answers,\n",
"# telling the helper which columns hold coordinates, words and pronunciations\n",
"gabmap_tables = create_gabmap_dataframes(\n",
"    regions_simple,\n",
"    answers_simple,\n",
"    latitude_column='user_lat',\n",
"    longitude_column='user_lng',\n",
"    word_column='question_text',\n",
"    pronunciation_column='answer_text',\n",
"    region_name_property='name',\n",
")\n",
"centroids_example, pronunciations_example, counts_example = gabmap_tables"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Resulting tables\n",
|
||
"\n",
|
||
"Stored as tab separated files for gabmap"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>latitude</th>\n",
|
||
" <th>longitude</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>#name</th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Heerenveen</th>\n",
|
||
" <td>52.996076</td>\n",
|
||
" <td>5.977925</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Leeuwarden</th>\n",
|
||
" <td>53.169940</td>\n",
|
||
" <td>5.797613</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" latitude longitude\n",
|
||
"#name \n",
|
||
"Heerenveen 52.996076 5.977925\n",
|
||
"Leeuwarden 53.169940 5.797613"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"centroids_example"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>blad (aan een boom)</th>\n",
|
||
" <th>vis</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Heerenveen</th>\n",
|
||
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
|
||
" <td>fisk / fɪs</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Leeuwarden</th>\n",
|
||
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
|
||
" <td>fisk / fɪs</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" blad (aan een boom) vis\n",
|
||
" \n",
|
||
"Heerenveen blet / blɑt / blɔd / blɛ:t fisk / fɪs\n",
|
||
"Leeuwarden blet / blɑt / blɔd / blɛ:t fisk / fɪs"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pronunciations_example"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>blad (aan een boom): blet</th>\n",
|
||
" <th>blad (aan een boom): blɑt</th>\n",
|
||
" <th>blad (aan een boom): blɔd</th>\n",
|
||
" <th>blad (aan een boom): blɛ:t</th>\n",
|
||
" <th>vis: fisk</th>\n",
|
||
" <th>vis: fɪs</th>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" <th></th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>Heerenveen</th>\n",
|
||
" <td>31.654676</td>\n",
|
||
" <td>2.158273</td>\n",
|
||
" <td>2.158273</td>\n",
|
||
" <td>64.028777</td>\n",
|
||
" <td>52.517986</td>\n",
|
||
" <td>47.482014</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>Leeuwarden</th>\n",
|
||
" <td>7.865169</td>\n",
|
||
" <td>7.022472</td>\n",
|
||
" <td>8.707865</td>\n",
|
||
" <td>76.404494</td>\n",
|
||
" <td>75.000000</td>\n",
|
||
" <td>25.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" blad (aan een boom): blet blad (aan een boom): blɑt \\\n",
|
||
" \n",
|
||
"Heerenveen 31.654676 2.158273 \n",
|
||
"Leeuwarden 7.865169 7.022472 \n",
|
||
"\n",
|
||
" blad (aan een boom): blɔd blad (aan een boom): blɛ:t vis: fisk \\\n",
|
||
" \n",
|
||
"Heerenveen 2.158273 64.028777 52.517986 \n",
|
||
"Leeuwarden 8.707865 76.404494 75.000000 \n",
|
||
"\n",
|
||
" vis: fɪs \n",
|
||
" \n",
|
||
"Heerenveen 47.482014 \n",
|
||
"Leeuwarden 25.000000 "
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"counts_example"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Write the three tables as tab separated files\n",
"tsv_options = {'sep': '\\t'}\n",
"\n",
"pronunciations_example.to_csv(\n",
"    '../data/Pronunciations_example.gabmap.tsv', **tsv_options)\n",
"counts_example.to_csv(\n",
"    '../data/Pronunciation_percentages_example.gabmap.tsv', **tsv_options)\n",
"# The centroids are written with longitude before latitude\n",
"centroids_example.to_csv(\n",
"    '../data/Centroids_example.gabmap.tsv',\n",
"    columns=['longitude', 'latitude'],\n",
"    **tsv_options\n",
")\n",
"\n",
"# Store the simplified regions next to the tables as GeoJSON\n",
"with open('../data/Gabmap_example.geojson', 'w') as geojson_file:\n",
"    json.dump(regions_simple, geojson_file)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|