# Number of projects per Kickstarter category

For each category and subcategory, find out how many projects there are in total, how many are (or were) successful, and how many are currently live.

In [182]:
import json
import time
import datetime
import selenium
from selenium import webdriver
from multiprocessing import Pool
from jupyter_progressbar import ProgressBar
from ipy_table import make_table, set_row_style
from IPython.display import display, Image, HTML

Executed around:

In [191]:
# Print the wall-clock time of this run (minute resolution) so the scraped
# counts further down can be dated.
d = datetime.datetime.now()
print('{:%Y-%m-%d %H:%M}'.format(d))

2018-01-03 17:56


In [3]:
# Collect every link on the Kickstarter front page that points at a
# '/discover/categories' listing.
root = 'https://www.kickstarter.com/'

driver = webdriver.Chrome()
try:
    driver.get(root)
    # Click the first button of the left global-nav section, then give the
    # page a moment to react before reading the links.
    driver.execute_script('$(".section_global-nav-left > button:first-child").click()')
    time.sleep(3)
    category_links = driver.execute_script("return $('a').map(function(i, x) { return $(x).attr('href'); }).filter(function(i, x) { return x.indexOf('/discover/categories') >= 0; })")
finally:
    # Always shut the browser down, even when loading or scripting the page
    # fails; quit() ends the session and closes its windows.
    driver.quit()

# De-duplicate, and sort so that the order (and therefore the chunking done
# later) is the same on every run.
category_links = sorted(set(category_links))

In [145]:
def get_count(driver, url):
    """Load ``url`` and return the project count shown on the page.

    The count is taken from the first element with class ``count`` whose
    text ends in ``' projects'``; thousands separators (``,``) are removed
    before converting to ``int``.

    Args:
        driver: WebDriver-like object providing ``get`` and
            ``find_elements_by_class_name``.
        url: Address of the page to load.

    Returns:
        The parsed count, or ``-1`` when no matching element is found.
    """
    suffix = ' projects'
    driver.get(url)
    for element in driver.find_elements_by_class_name('count'):
        if not element.text.endswith(suffix):
            continue
        digits = element.text.replace(suffix, '').replace(',', '')
        return int(digits)
    # No element carried a "<n> projects" label.
    return -1

def get_rows(urls):
    """Scrape total, successful and live project counts for category URLs.

    One Chrome instance is started for the whole batch and is always shut
    down afterwards, whether or not scraping succeeds.

    Args:
        urls: Iterable of category listing URLs starting with
            ``https://www.kickstarter.com/discover/categories/``.

    Returns:
        A list with one ``[category, subcategory, total, successful, live]``
        entry per URL. ``subcategory`` is ``''`` for a top-level category.
        The three counts are whatever ``get_count`` returns for the plain
        URL, the ``state=successful`` URL and the ``state=live`` URL.
    """
    prefix = 'https://www.kickstarter.com/discover/categories/'

    # Start the browser *before* the try block: if start-up fails there is
    # nothing to quit, and the start-up error must not be masked by an
    # UnboundLocalError raised from the finally clause.
    driver = webdriver.Chrome()
    try:
        result = []
        for url in urls:
            # Strip the query string and the fixed prefix, then undo the
            # (single or double) URL-encoding of spaces.
            category = url.split('?')[0][len(prefix):].replace('%20', ' ').replace('%2520', ' ')

            # Pad so that a top-level category yields an empty subcategory.
            category, subcategory = (category.split('/') + ['', ''])[:2]

            # Append the state filter with the right separator whether or
            # not the URL already carries a query string.
            separator = '&' if '?' in url else '?'

            all_projects = get_count(driver, url)
            live_projects = get_count(driver, url + separator + 'state=live')
            success_projects = get_count(driver, url + separator + 'state=successful')

            result.append([category, subcategory, all_projects, success_projects, live_projects])
    finally:
        driver.quit()
    return result

# Fan the category links out over a pool of worker processes; each task
# scrapes one chunk of links with its own browser (see get_rows).
# The pool is left open here because the async results are collected in a
# later cell.
results = []
pool = Pool(8)
chunk_size = 11
# Slice by start index only, so the final, shorter chunk is also submitted
# when the number of links is not a multiple of chunk_size.
for start in range(0, len(category_links), chunk_size):
    results.append(pool.apply_async(get_rows, [category_links[start:start + chunk_size]]))

In [192]:
# Assemble the result table: a header row followed by the scraped rows,
# sorted by category, then subcategory.
table = [['category', 'subcategory', 'total', 'successful', 'live']]

rows = []
for part in results:
    # get() blocks until the worker has finished and re-raises any exception
    # it hit, so this cell also works when run before scraping is complete.
    rows.extend(part.get())
table.extend(sorted(rows))

# Result

Green marks a top-level category (not a subcategory); red marks a row in which any count exceeds 2400 projects, the limit up to which a listing can be scraped successfully.

In [193]:
# Render the table: bold header, light-green bold rows for top-level
# categories (empty subcategory), light-red rows where any count is above
# 2400.
tab = make_table(table)
set_row_style(0, bold=True)
for row_index, row in enumerate(table):
    is_category = row[1] == ''
    if is_category:
        set_row_style(row_index, bold=True, color='lightgreen')
        continue
    # The header row (index 0) holds column names, not counts; skip it.
    if row_index > 0 and any(int(count) > 2400 for count in row[2:]):
        set_row_style(row_index, color='#ffcccc')

tab

0,1,2,3,4
category,subcategory,total,successful,live
art,,28151,11497,207
art,ceramics,308,134,5
art,conceptual art,1027,366,8
art,digital art,1348,374,13
art,illustration,3192,1636,44
art,installations,484,235,7
art,mixed media,2757,948,19
art,painting,3294,1145,19
art,performance art,2151,930,6
