195 lines
111 KiB
Plaintext
195 lines
111 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Number of projects per Kickstarter category\n",
|
||
|
"\n",
|
||
|
"For each category and subcategory, find out how many projects there are in total, how many are or were successful, and how many are live."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 182,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import json\n",
|
||
|
"import time\n",
|
||
|
"import datetime\n",
|
||
|
"import selenium\n",
|
||
|
"from selenium import webdriver\n",
|
||
|
"from multiprocessing import Pool\n",
|
||
|
"from jupyter_progressbar import ProgressBar\n",
|
||
|
"from ipy_table import make_table, set_row_style\n",
|
||
|
"from IPython.display import display, Image, HTML"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Executed around:"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 191,
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"2018-01-03 17:56\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Timestamp the run: the project counts below are a snapshot taken at this moment.\n",
"d = datetime.datetime.now()\n",
"print('{:%Y-%m-%d %H:%M}'.format(d))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Open the Kickstarter landing page, click the first button of the global\n",
"# navigation and collect every link that points at a category discovery page.\n",
"root = 'https://www.kickstarter.com/'\n",
"\n",
"driver = webdriver.Chrome()\n",
"try:\n",
"    driver.get(root)\n",
"    driver.execute_script('$(\".section_global-nav-left > button:first-child\").click()')\n",
"    # Pause after the click, presumably so the menu can render before its\n",
"    # links are read -- TODO confirm 3 seconds is enough on slow connections.\n",
"    time.sleep(3)\n",
"    category_links = driver.execute_script(\"return $('a').map(function(i, x) { return $(x).attr('href'); }).filter(function(i, x) { return x.indexOf('/discover/categories') >= 0; })\")\n",
"finally:\n",
"    # Always shut the browser down, even when loading the page or running a\n",
"    # script fails; quit() closes every window and ends the session.\n",
"    driver.quit()\n",
"\n",
"# Drop duplicate links. set() does not preserve the order found on the page.\n",
"category_links = list(set(category_links))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 145,
|
||
|
"metadata": {
|
||
|
"scrolled": true
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def get_count(driver, url):\n",
"    \"\"\"Load ``url`` in ``driver`` and return the project count shown on the page.\n",
"\n",
"    The count is taken from the first element with class ``count`` whose text\n",
"    ends in ' projects'; thousands separators are stripped before conversion.\n",
"    Returns -1 when no such element is present.\n",
"    \"\"\"\n",
"    driver.get(url)\n",
"    suffix = ' projects'\n",
"    for element in driver.find_elements_by_class_name('count'):\n",
"        label = element.text\n",
"        if label.endswith(suffix):\n",
"            return int(label.replace(suffix, '').replace(',', ''))\n",
"    # No matching element: report the sentinel instead of raising.\n",
"    return -1\n",
|
||
|
"\n",
|
||
|
"def get_rows(urls):\n",
"    \"\"\"Scrape total, successful and live project counts for each category URL.\n",
"\n",
"    Returns one ``[category, subcategory, total, successful, live]`` row per\n",
"    URL, where ``subcategory`` is '' for a top-level category. One Chrome\n",
"    instance is shared by all URLs and is always shut down afterwards.\n",
"    \"\"\"\n",
"    prefix = 'https://www.kickstarter.com/discover/categories/'\n",
"    # Launch the browser before entering the try block: if Chrome fails to\n",
"    # start there is nothing to quit, and the launch error must surface rather\n",
"    # than an UnboundLocalError raised from the finally clause.\n",
"    driver = webdriver.Chrome()\n",
"    try:\n",
"        result = []\n",
"        for url in urls:\n",
"            # Strip the query string and the fixed URL prefix, then decode\n",
"            # single- and double-encoded spaces.\n",
"            category = url.split('?')[0][len(prefix):].replace('%20', ' ').replace('%2520', ' ')\n",
"\n",
"            # Pad so a top-level category (no '/') gets an empty subcategory.\n",
"            category, subcategory = (category.split('/') + ['', ''])[:2]\n",
"\n",
"            # Append the state filter with the right separator, so a URL\n",
"            # without an existing query string still produces a valid URL.\n",
"            separator = '&' if '?' in url else '?'\n",
"            all_projects = get_count(driver, url)\n",
"            live_projects = get_count(driver, url + separator + 'state=live')\n",
"            success_projects = get_count(driver, url + separator + 'state=successful')\n",
"\n",
"            result.append([category, subcategory, all_projects, success_projects, live_projects])\n",
"    finally:\n",
"        driver.quit()\n",
"    return result\n",
|
||
|
"\n",
|
||
|
"# Fan the category links out to worker processes in batches; each batch is\n",
"# handled by one get_rows call.\n",
"batch_size = 11\n",
"results = []\n",
"pool = Pool(8)\n",
"# Iterate over batch start offsets only. Slicing past the end of a list is\n",
"# safe, so the final, shorter batch is submitted too instead of being dropped.\n",
"for start in range(0, len(category_links), batch_size):\n",
"    results.append(pool.apply_async(get_rows, [category_links[start:start + batch_size]]))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 192,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Gather the rows from every batch and sort them under a fixed header row.\n",
"header = ['category', 'subcategory', 'total', 'successful', 'live']\n",
"\n",
"rows = []\n",
"for part in results:\n",
"    # get() blocks until the batch has finished and re-raises any error from\n",
"    # the worker, so this cell also works when run before scraping is done.\n",
"    rows.extend(part.get())\n",
"\n",
"table = [header] + sorted(rows)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Result\n",
|
||
|
"\n",
|
||
|
"Green marks a top-level category (not a subcategory); red marks a subcategory in which at least one of the counts exceeds 2400 projects, the limit to scrape successfully."
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 193,
|
||
|
"metadata": {
|
||
|
"scrolled": false
|
||
|
},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/html": [
|
||
|
"<table border=\"1\" cellpadding=\"3\" cellspacing=\"0\" style=\"border:black; border-collapse:collapse;\"><tr><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>category</b></td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>subcategory</b></td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>total</b></td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>successful</b></td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>live</b></td></tr><tr><td style=\"background-color:lightgreen;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>art</b></td><td style=\"background-color:lightgreen;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b></b></td><td style=\"background-color:lightgreen;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>28151</b></td><td style=\"background-color:lightgreen;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>11497</b></td><td style=\"background-color:lightgreen;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\"><b>207</b></td></tr><tr><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">art</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">ceramics</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">308</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px 
solid;\">134</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">5</td></tr><tr><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">art</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">conceptual art</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">1027</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">366</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">8</td></tr><tr><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">art</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">digital art</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">1348</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">374</td><td style=\"border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">13</td></tr><tr><td style=\"background-color:#ffcccc;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">art</td><td style=\"background-color:#ffcccc;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">illustration</td><td style=\"background-color:#ffcccc;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">3192</td><td style=\"background-color:#ffcccc;border-left: 1px solid;border-right: 1px solid;border-top: 1px solid;border-bottom: 1px solid;\">1636</td><td style=\"background-color:#ffcccc;border-left: 1px solid;border-right: 
1px solid;border-top: 1px solid;border-bottom: 1px solid;\
|
||
|
],
|
||
|
"text/plain": [
|
||
|
"<ipy_table.ipy_table.IpyTable at 0x7fe350161b38>"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 193,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Render the table: bold header, green top-level categories, red rows in\n",
"# which any count is above 2400.\n",
"tab = make_table(table)\n",
"set_row_style(0, bold=True)\n",
"for i, row in enumerate(table):\n",
"    if row[1] == '':\n",
"        set_row_style(i, bold=True, color='lightgreen')\n",
"    # i > 0 skips the header row, whose count cells are column titles and\n",
"    # cannot be converted with int().\n",
"    elif i > 0 and any(int(x) > 2400 for x in row[2:]):\n",
"        set_row_style(i, color='#ffcccc')\n",
"\n",
"tab"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.5.2"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2
|
||
|
}
|