stimmenfryslan/notebooks/Gabmap Pronunciation Tables, Simple Example.ipynb

459 lines
12 KiB
Plaintext
Raw Normal View History

2018-10-01 17:19:36 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Geographical pronunciation tables, simple example\n",
"\n",
"Simple example to create gabmap files for two words with few pronunciations and two regions."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import copy\n",
"import json\n",
"import os\n",
"import sys\n",
"\n",
"import MySQLdb\n",
"import pandas\n",
"from shapely.geometry import shape, Point\n",
"\n",
"# Extend the import path with the parent directory before the project-local\n",
"# imports below (presumably that is where `gabmap` and `stimmen` live).\n",
"sys.path.append('..')\n",
"\n",
"from gabmap import create_gabmap_dataframes\n",
"from stimmen.geojson import merge_features\n",
"\n",
"# The database password is read from the environment so that it is never stored\n",
"# in the notebook; a missing STIMMEN_DB_PASSWORD raises KeyError right here.\n",
"db = MySQLdb.connect(\n",
"    user='root',\n",
"    passwd=os.environ['STIMMEN_DB_PASSWORD'],\n",
"    db='stimmen',\n",
"    charset='utf8',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# GeoJSON is UTF-8 by specification (RFC 7946); state the encoding explicitly so\n",
"# non-ASCII place names load the same way regardless of the platform locale.\n",
"with open('../data/Friesland_wijken.geojson', encoding='utf-8') as f:\n",
"    regions = json.load(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and simplify"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# One row per answered question: the participant's coordinates, the word asked\n",
"# about and how the participant states it should be pronounced.\n",
"\n",
"answers_query = '''\n",
"SELECT prediction_quiz_id, user_lat, user_lng, question_text, answer_text\n",
"FROM core_surveyresult as survey\n",
"INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
"INNER JOIN core_predictionquizresultquestionanswer as answer\n",
" ON result.id = answer.prediction_quiz_id\n",
"'''\n",
"\n",
"answers = pandas.read_sql(answers_query, db)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# GM_NAAM values of the two example regions, in the order their names are assigned below.\n",
"municipalities = ['Heerenveen', 'Leeuwarden']\n",
"\n",
"\n",
"def in_municipality(name):\n",
"    # Predicate for merge_features: true for features whose GM_NAAM equals `name`.\n",
"    return lambda feature: feature['properties']['GM_NAAM'] == name\n",
"\n",
"\n",
"# Start from a deep copy rather than from the loaded `regions` object itself.\n",
"regions_simple = copy.deepcopy(regions)\n",
"for municipality in municipalities:\n",
"    regions_simple = merge_features(\n",
"        regions_simple,\n",
"        condition=in_municipality(municipality),\n",
"    )\n",
"\n",
"# Keep only the last two features -- presumably the merged regions that\n",
"# merge_features placed at the end (TODO confirm) -- and name them by position.\n",
"regions_simple['features'] = regions_simple['features'][-2:]\n",
"for position, municipality in enumerate(municipalities):\n",
"    regions_simple['features'][position]['properties']['name'] = municipality"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# The two questions (words) used in this example, exactly as stored in the database.\n",
"selected_questions = ['\"blad\" (aan een boom)', '\"vis\"']\n",
"\n",
"\n",
"def strip_markup(question):\n",
"    # Remove double quotes and asterisks from a question text.\n",
"    return question.replace('\"', '').replace('*', '')\n",
"\n",
"\n",
"def between_parentheses(answer):\n",
"    # Text after the first '(' up to the first ')'; empty when there is no '('.\n",
"    return answer[answer.find('('):answer.find(')')][1:]\n",
"\n",
"\n",
"is_selected = answers['question_text'].isin(selected_questions)\n",
"answers_simple = answers[is_selected].copy()\n",
"answers_simple['question_text'] = answers_simple['question_text'].map(strip_markup)\n",
"answers_simple['answer_text'] = answers_simple['answer_text'].map(between_parentheses)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two words, blad and vis, with 4 and 2 pronunciations respectively"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>answer_text</th>\n",
" </tr>\n",
" <tr>\n",
" <th>question_text</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>blad (aan een boom)</th>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>vis</th>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" answer_text\n",
"question_text \n",
"blad (aan een boom) 4\n",
"vis 2"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of distinct pronunciations given for each word.\n",
"answers_by_word = answers_simple.groupby('question_text')\n",
"answers_by_word.agg({'answer_text': lambda pronunciations: len(set(pronunciations))})"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Build the three gabmap tables from the simplified regions and answers,\n",
"# telling the helper which columns hold coordinates, words and pronunciations.\n",
"gabmap_tables = create_gabmap_dataframes(\n",
"    regions_simple,\n",
"    answers_simple,\n",
"    latitude_column='user_lat',\n",
"    longitude_column='user_lng',\n",
"    word_column='question_text',\n",
"    pronunciation_column='answer_text',\n",
"    region_name_property='name',\n",
")\n",
"centroids_example, pronunciations_example, counts_example = gabmap_tables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Resulting tables\n",
"\n",
"Stored as tab separated files for gabmap"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" <tr>\n",
" <th>#name</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>52.996076</td>\n",
" <td>5.977925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>53.169940</td>\n",
" <td>5.797613</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" latitude longitude\n",
"#name \n",
"Heerenveen 52.996076 5.977925\n",
"Leeuwarden 53.169940 5.797613"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"centroids_example"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blad (aan een boom)</th>\n",
" <th>vis</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
" <td>fisk / fɪs</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>blet / blɑt / blɔd / blɛ:t</td>\n",
" <td>fisk / fɪs</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blad (aan een boom) vis\n",
" \n",
"Heerenveen blet / blɑt / blɔd / blɛ:t fisk / fɪs\n",
"Leeuwarden blet / blɑt / blɔd / blɛ:t fisk / fɪs"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pronunciations_example"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>blad (aan een boom): blet</th>\n",
" <th>blad (aan een boom): blɑt</th>\n",
" <th>blad (aan een boom): blɔd</th>\n",
" <th>blad (aan een boom): blɛ:t</th>\n",
" <th>vis: fisk</th>\n",
" <th>vis: fɪs</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Heerenveen</th>\n",
" <td>31.654676</td>\n",
" <td>2.158273</td>\n",
" <td>2.158273</td>\n",
" <td>64.028777</td>\n",
" <td>52.517986</td>\n",
" <td>47.482014</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Leeuwarden</th>\n",
" <td>7.865169</td>\n",
" <td>7.022472</td>\n",
" <td>8.707865</td>\n",
" <td>76.404494</td>\n",
" <td>75.000000</td>\n",
" <td>25.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" blad (aan een boom): blet blad (aan een boom): blɑt \\\n",
" \n",
"Heerenveen 31.654676 2.158273 \n",
"Leeuwarden 7.865169 7.022472 \n",
"\n",
" blad (aan een boom): blɔd blad (aan een boom): blɛ:t vis: fisk \\\n",
" \n",
"Heerenveen 2.158273 64.028777 52.517986 \n",
"Leeuwarden 8.707865 76.404494 75.000000 \n",
"\n",
" vis: fɪs \n",
" \n",
"Heerenveen 47.482014 \n",
"Leeuwarden 25.000000 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"counts_example"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Tab-separated tables for gabmap.\n",
"pronunciations_example.to_csv(\n",
"    '../data/Pronunciations_example.gabmap.tsv',\n",
"    sep='\\t',\n",
")\n",
"counts_example.to_csv(\n",
"    '../data/Pronunciation_percentages_example.gabmap.tsv',\n",
"    sep='\\t',\n",
")\n",
"# The centroid columns are written longitude first, then latitude.\n",
"centroids_example.to_csv(\n",
"    '../data/Centroids_example.gabmap.tsv',\n",
"    sep='\\t',\n",
"    columns=['longitude', 'latitude'],\n",
")\n",
"\n",
"# The simplified regions themselves, as GeoJSON.\n",
"with open('../data/Gabmap_example.geojson', 'w') as geojson_file:\n",
"    json.dump(regions_simple, geojson_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}