kickstarter/Collect comments for one pa...

9.5 KiB

In [1]:
import selenium
import time
import datetime
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from bs4 import BeautifulSoup

from IPython.display import display, Image, HTML

from jupyter_progressbar import ProgressBar
import json
In [3]:
def remove_kickstarter_url_prefix(url):
    if url.startswith('https://www.kickstarter.com/'):
        return url[len('https://www.kickstarter.com'):]
    return url
In [7]:
driver = webdriver.Chrome()

root = 'https://www.kickstarter.com/'
driver.get(root)

discover_links = {
    link
    for link in driver.find_elements_by_tag_name('a')
    for link in [link.get_attribute('href')]
    for link in [remove_kickstarter_url_prefix(link)]
    if link.startswith("/discover/")
}

driver.close()
driver.quit()
In [26]:
try:
    driver.close()
    driver.quit()
except Exception as e:
    print(e)

driver = webdriver.Chrome()
Request-sent
In [27]:
projects = dict()

class get_all_projects:
    def __init__(self, driver):
        self.driver = driver
        self.total_comments = next(
            int(element.text.replace(' projects', '').replace(',', ''))
            for element in driver.find_elements_by_class_name('count')
            if element.text.endswith(' projects')
        )
    
    def __iter__(self):
        done = set()
        driver.execute_script("$('.load_more > a').click()")
        n_wait = 0
        
        while driver.execute_script("return $('.load_more > a').length") > 0:
            n_wait += 1
            n_projects = driver.execute_script("return $('*[data-project]').length")
            if n_projects > 0 or n_wait > 5:
                driver.execute_script("$('.load_more > a').click()")
        
                for item in driver.find_elements_by_css_selector('*[data-project]'):
                    project = json.loads(item.get_attribute('data-project'))
                    if project['id'] not in done:
                        done.add(project['id'])
                        driver.execute_script('$("*[data-project_pid=%d]").parent().remove()' % project['id'])
                        yield project
                n_wait = 0
            time.sleep(0.5)
    
    def __len__(self):
        return self.total_comments
    
for discover_link in discover_links:
    driver.get(root + discover_link)
    for project in ProgressBar(get_all_projects(driver)):
        projects[project['id']] = project
    break

Failed to display Jupyter Widget of type VBox.

If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.

If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-9f1242042551> in <module>()
     35 for discover_link in discover_links:
     36     driver.get(root + discover_link)
---> 37     for project in ProgressBar(get_all_projects(driver)):
     38         projects[project['id']] = project
     39     break

~/.virtualenvs/kickstarter/lib/python3.5/site-packages/jupyter_progressbar/__init__.py in ProgressBar(iter, size)
     30     tsq = 0
     31     i = 0
---> 32     for i, item in enumerate(iter, start=1):
     33         while i > size:
     34             size = size * 2

<ipython-input-27-9f1242042551> in __iter__(self)
     28                         yield project
     29                 n_wait = 0
---> 30             time.sleep(0.5)
     31 
     32     def __len__(self):

KeyboardInterrupt: 
In [121]:
print(root + discover_link)
https://www.kickstarter.com//discover/newest?ref=discovery_overlay