stimmenfryslan/notebooks/Surveys with repeated prediction quizes.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Surveys with repeated prediction quizes\n",
    "\n",
    "Participants specify some demographics (approximate age, gender, etc) in a survey before they start the prediction quiz. Then each survey should contain one prediction quiz, where the participant provides his pronunciation for 19 words.\n",
    "\n",
    "In conclusion, this is only partly true. All surveys have at least one prediction quiz attached, with 19 pronunciation responses, but some surveys have more quizes attached. All off the quizes have completely answered all 19 words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "········\n"
     ]
    }
   ],
   "source": [
    "# Enable portforwording from 3307 locally to 3306 on the stimmen database machine\n",
    "# ssh -L 3307:127.0.0.1:3306 stimmen.housing.rug.nl\n",
    "\n",
    "import pandas\n",
    "import MySQLdb\n",
    "from collections import defaultdict, Counter\n",
    "from ipy_table import make_table, set_row_style\n",
    "from IPython.display import display\n",
    "\n",
    "from getpass import getpass\n",
    "\n",
    "if 'mysql_password' not in globals():\n",
    "    mysql_password = getpass()\n",
    "try:\n",
    "    db = MySQLdb.connect(host='127.0.0.1', port=3307, user='stimmen', passwd=mysql_password, db='stimmen', charset='utf8')\n",
    "except MySQLdb.OperationalError as e:\n",
    "    globals().pop('mysql_password')\n",
    "    raise"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "answers = pandas.read_sql('''\n",
    "SELECT\n",
    "    survey.id AS survey_id, \n",
    "    prediction_quiz_id,\n",
    "    user_lat, user_lng,\n",
    "    question_text, answer_text\n",
    "FROM       core_surveyresult as survey\n",
    "INNER JOIN core_predictionquizresult as result ON survey.id = result.survey_result_id\n",
    "INNER JOIN core_predictionquizresultquestionanswer as answer\n",
    "    ON result.id = answer.prediction_quiz_id\n",
    "WHERE\n",
    "    survey.submitted_at >= '2017-09-17'\n",
    "    AND result.submitted_at >= '2017-09-17'\n",
    "''', db)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Words\n",
    "\n",
    "Words for which prediction quiz participants provided pronunciations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " - \"borst\" (*lichaamsdeel)\n",
      " - \"gegaan\"\n",
      " - \"gezet\"\n",
      " - \"geel\"\n",
      " - \"bij\" (*insect)\n",
      " - \"avond\"\n",
      " - \"kaas\"\n",
      " - \"deurtje\"\n",
      " - \"koken\"\n",
      " - \"dag\"\n",
      " - \"heel\"\n",
      " - \"blad\" (aan een boom)\n",
      " - \"armen\" (*lichaamsdeel)\n",
      " - \"trein\"\n",
      " - \"oog\"\n",
      " - \"zaterdag\"\n",
      " - \"sprak (toe)\"\n",
      " - \"vis\"\n",
      " - \"tand\"\n"
     ]
    }
   ],
   "source": [
    "questions = answers['question_text'].unique()\n",
    "print(' - ' + '\\n - '.join(questions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Repeated quizes\n",
    "\n",
    "Some surveys repeated the predictions quiz."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# surveys \\w repeated pronunciations 197\n",
      "# surveys                            3104\n"
     ]
    }
   ],
   "source": [
    "survey_counts = answers.groupby('survey_id').count()['answer_text']\n",
    "\n",
    "print('# surveys \\w repeated pronunciations', (survey_counts != 19).sum())\n",
    "print('# surveys                           ', len(survey_counts))\n",
    "\n",
    "repeat_survey_ids = set(survey_counts[survey_counts != 19].index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# answers of all surveys with repeated quizes\n",
    "\n",
    "repeat_survey_answers = answers[[\n",
    "    survey_id in repeat_survey_ids\n",
    "    for survey_id in answers['survey_id']\n",
    "]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Sanity check (is it safe to use '|' as a token):\n",
    "assert not any('|' in x for x in answers['answer_text'])\n",
    "\n",
    "## How often one word has a reported pronunciation within one survey\n",
    "question_counts = repeat_survey_answers.groupby(['survey_id', 'question_text']).agg({\n",
    "    'prediction_quiz_id': len,\n",
    "    'answer_text': lambda x: '|'.join(set(x))    \n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "regarding surveys with repeated quizes\n",
      "# reported pronunciations, counting repeats 8398\n",
      "# reported pronunciations                   3743\n"
     ]
    }
   ],
   "source": [
    "print('regarding surveys with repeated quizes')\n",
    "\n",
    "print('# reported pronunciations, counting repeats', len(repeat_survey_answers))\n",
    "print('# reported pronunciations                  ', len(question_counts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check, each survey with repeated quizes, are all the words repeated\n",
    "# Conclusion: they are (which is good), since nothing was printed\n",
    "for survey_id, rows in repeat_survey_answers.groupby('survey_id'):\n",
    "    if len(set(\n",
    "        rows.groupby('question_text').count()['survey_id']\n",
    "    )) != 1:\n",
    "        print('for survey', survey_id, ', different words were provided a pronunciation a different number of times')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "how often different pronunciations for a word were given within one survey: 1203\n"
     ]
    }
   ],
   "source": [
    "print('how often different pronunciations for a word were given within one survey:', sum(\n",
    "    '|' in x # we used '|' as a pronunciation-seperator a few cells up, so it's existance equals > 1 pronunciation\n",
    "    for x in question_counts['answer_text']\n",
    "))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}