initial commit

This commit is contained in:
H.T. Kruitbosch 2018-01-10 10:42:38 +01:00
parent 033e935d5c
commit cba5305d4a
5 changed files with 1949 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
.idea
.ipynb_checkpoints
__pycache__
*.pyc
ghostdriver.log

View File

@ -0,0 +1,211 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import selenium\n",
"import time\n",
"import datetime\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
"from bs4 import BeautifulSoup\n",
"\n",
"from IPython.display import display, Image, HTML\n",
"\n",
"from jupyter_progressbar import ProgressBar\n",
"import json"
]
},
{
 "cell_type": "code",
 "execution_count": 3,
 "metadata": {},
 "outputs": [],
 "source": [
  "def remove_kickstarter_url_prefix(url):\n",
  "    \"\"\"Return `url` relative to the Kickstarter origin, keeping the leading slash.\n",
  "\n",
  "    URLs on other sites are returned unchanged. A missing href (``None``) or\n",
  "    an empty string yields ``''`` so callers can keep using ``str`` methods\n",
  "    on the result instead of crashing on ``None.startswith``.\n",
  "    \"\"\"\n",
  "    site = 'https://www.kickstarter.com'\n",
  "    if not url:\n",
  "        return ''\n",
  "    if url.startswith(site + '/'):\n",
  "        return url[len(site):]\n",
  "    return url"
 ]
},
{
 "cell_type": "code",
 "execution_count": 7,
 "metadata": {},
 "outputs": [],
 "source": [
  "root = 'https://www.kickstarter.com/'\n",
  "\n",
  "driver = webdriver.Chrome()\n",
  "try:\n",
  "    driver.get(root)\n",
  "    # Anchors without an href make get_attribute() return None; drop those\n",
  "    # before any string handling.\n",
  "    hrefs = [\n",
  "        anchor.get_attribute('href')\n",
  "        for anchor in driver.find_elements_by_tag_name('a')\n",
  "    ]\n",
  "    # Site-relative paths of every link into the /discover/ section.\n",
  "    discover_links = {\n",
  "        path\n",
  "        for path in map(remove_kickstarter_url_prefix, filter(None, hrefs))\n",
  "        if path.startswith('/discover/')\n",
  "    }\n",
  "finally:\n",
  "    # quit() closes all windows and ends the chromedriver session, even when\n",
  "    # loading or scanning the page failed.\n",
  "    driver.quit()"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Dispose of a browser left over from an earlier cell, then start a fresh one.\n",
  "# Only quit() is called: a failing close() would skip the quit() that follows\n",
  "# it and leave the chromedriver process running.\n",
  "try:\n",
  "    driver.quit()\n",
  "except Exception as e:\n",
  "    # Best effort: also reached when no `driver` exists yet in this kernel.\n",
  "    print(e)\n",
  "\n",
  "driver = webdriver.Chrome()"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "scrolled": false
 },
 "outputs": [],
 "source": [
  "# Scraped projects, keyed by project id.\n",
  "projects = {}\n",
  "\n",
  "\n",
  "class get_all_projects:\n",
  "    \"\"\"Iterable over the projects of the listing currently loaded in `driver`.\n",
  "\n",
  "    Iterating keeps clicking the page's \"load more\" link, yields the JSON\n",
  "    stored in each card's `data-project` attribute exactly once and removes\n",
  "    harvested cards from the page. `len()` is the project count that the\n",
  "    page itself displays.\n",
  "    \"\"\"\n",
  "\n",
  "    def __init__(self, driver):\n",
  "        self.driver = driver\n",
  "        # The result size is rendered as text ending in ' projects', with\n",
  "        # ',' as thousands separator.\n",
  "        texts = (element.text for element in driver.find_elements_by_class_name('count'))\n",
  "        self.total_projects = next(\n",
  "            int(text.replace(' projects', '').replace(',', ''))\n",
  "            for text in texts\n",
  "            if text.endswith(' projects')\n",
  "        )\n",
  "        # Original (misleading) attribute name, kept as an alias.\n",
  "        self.total_comments = self.total_projects\n",
  "\n",
  "    def _harvest(self, done):\n",
  "        \"\"\"Yield the projects in the DOM whose id is not in `done`; remove their cards.\"\"\"\n",
  "        for item in self.driver.find_elements_by_css_selector('*[data-project]'):\n",
  "            project = json.loads(item.get_attribute('data-project'))\n",
  "            if project['id'] in done:\n",
  "                continue\n",
  "            done.add(project['id'])\n",
  "            # Drop the card so the next `data-project` count only reflects\n",
  "            # cards that arrived since.\n",
  "            self.driver.execute_script(\n",
  "                '$(\"*[data-project_pid=%d]\").parent().remove()' % project['id']\n",
  "            )\n",
  "            yield project\n",
  "\n",
  "    def __iter__(self):\n",
  "        # Use the driver given to __init__, not whatever global happens to be\n",
  "        # called `driver`.\n",
  "        driver = self.driver\n",
  "        done = set()\n",
  "        driver.execute_script(\"$('.load_more > a').click()\")\n",
  "        n_wait = 0\n",
  "\n",
  "        while driver.execute_script(\"return $('.load_more > a').length\") > 0:\n",
  "            n_wait += 1\n",
  "            n_projects = driver.execute_script(\"return $('*[data-project]').length\")\n",
  "            # Ask for the next batch once new cards showed up, or after more\n",
  "            # than five polls without any progress.\n",
  "            if n_projects > 0 or n_wait > 5:\n",
  "                driver.execute_script(\"$('.load_more > a').click()\")\n",
  "\n",
  "            for project in self._harvest(done):\n",
  "                yield project\n",
  "                n_wait = 0\n",
  "            time.sleep(0.5)\n",
  "\n",
  "        # Cards still present once the \"load more\" link is gone (or on a page\n",
  "        # that never had one) would otherwise be skipped.\n",
  "        yield from self._harvest(done)\n",
  "\n",
  "    def __len__(self):\n",
  "        return self.total_projects\n",
  "\n",
  "\n",
  "for discover_link in discover_links:\n",
  "    # `root` ends with '/' and every discover link starts with '/'; strip one\n",
  "    # so the URL does not become '...com//discover/...'.\n",
  "    driver.get(root.rstrip('/') + discover_link)\n",
  "    for project in ProgressBar(get_all_projects(driver)):\n",
  "        projects[project['id']] = project\n",
  "    # Only the first discover listing is scraped for now.\n",
  "    break"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# `root` ends with '/' and `discover_link` starts with '/': strip one of them\n",
  "# so the printed URL has a single slash after the host.\n",
  "print(root.rstrip('/') + discover_link)"
 ]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large Load Diff

98
scrape.py Normal file
View File

@ -0,0 +1,98 @@
import base64
import scrapy
from scrapy_splash import SplashRequest
class ExploreSpider(scrapy.Spider):
    """Walk kickstarter.com through Splash: front page -> /discover/ listings -> projects.

    Every page is fetched with ``SplashRequest``. Listing and project pages
    run a Lua script (Splash ``execute`` endpoint) that clicks the page's
    "load more" / "older comments" control before the HTML is returned.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    # Site origin without trailing slash; links are relativised against it.
    site_root = 'https://www.kickstarter.com'

    # While True the spider keeps the original debugging behaviour: only the
    # first /discover/ link is requested and parse_discover stops after
    # printing the Lua diagnostics. Set to False to follow every discover
    # link and the project links found on those pages.
    debug_single_discover = True

    # Lua for a /discover/ listing: click "load more" twice, then report how
    # many load-more links / project cards exist together with the HTML.
    DISCOVER_LUA = (
        """
        function main(splash)
            assert(splash:go(splash.args.url))
            assert(splash:wait(1))
            local n_comments = -1
            splash:runjs("$('.load_more > a').click()")
            assert(splash:wait(8))
            splash:runjs("$('.load_more > a').click()")
            assert(splash:wait(8))
        """
        # Disabled experiment: keep clicking until the link disappears.
        # while splash:evaljs("$('.load_more > a:visible').length") > 0 do
        #     if (splash:evaljs("$('*[data-pid]').length") ~= n_comments) then
        #         splash:runjs("$('.load_more > a').click()")
        #     end
        #     n_comments = splash:evaljs("$('*[data-pid]').length")
        #     assert(splash:wait(0.5))
        #     break
        # end
        """
            return {
                n0 = splash:evaljs("$('.load_more > a:visible').length"),
                n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
                m = splash:evaljs("$('*[data-pid]').length"),
                html = splash:html(),
            }
        end
        """
    )

    # Lua for a project page: click "older comments" until the control is no
    # longer visible. `n_comments` is declared local (it used to be an
    # implicit global that was read before ever being assigned).
    PROJECT_LUA = """
        function main(splash)
            assert(splash:go(splash.args.url))
            assert(splash:wait(1))
            local n_comments = -1
            while splash:evaljs("$('.older_comments:visible').length") > 0 do
                print(splash:evaljs("$('.older_comments:visible').length"))
                if (splash:evaljs("$('li.comments').length") ~= n_comments) then
                    splash:runjs("$('.older_comments').click()")
                end
                n_comments = splash:evaljs("$('li.comments').length")
                assert(splash:wait(0.5))
            end
            return {
                html = splash:html(),
            }
        end
        """

    def _site_path(self, link):
        """Return `link` relative to `site_root` (leading slash kept); other links unchanged."""
        if link.startswith(self.site_root + '/'):
            return link[len(self.site_root):]
        return link

    def start_requests(self):
        """Render each start URL through Splash and hand it to `parse_explore`."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def parse_explore(self, response):
        """Request the /discover/ listings linked from the front page."""
        for link in response.xpath('//a/@href').extract():
            path = self._site_path(link)
            if not path.startswith('/discover/'):
                continue
            yield SplashRequest(
                self.site_root + path,
                self.parse_discover,
                endpoint='execute',
                args={'lua_source': self.DISCOVER_LUA},
            )
            if self.debug_single_discover:
                # Debug mode: one listing is enough.
                return

    def parse_discover(self, response):
        """Print the Lua diagnostics of a listing and request its project pages."""
        print('*' * 60)
        # Everything the Lua script returned except the (large) HTML.
        print({k: v for k, v in response.data.items() if k != 'html'})
        print('*' * 60)
        if self.debug_single_discover:
            return

        urls = set()
        for link in response.xpath('//a/@href').extract():
            path = self._site_path(link)
            if not path.startswith('/projects/'):
                continue
            url = self.site_root + path
            urls.add(url)
            yield SplashRequest(
                url,
                self.parse_project,
                # Without the 'execute' endpoint Splash ignores lua_source
                # and the comment-expanding script never runs.
                endpoint='execute',
                args={'lua_source': self.PROJECT_LUA},
            )
        print('*' * 20, response.url, len(urls), urls)

    def parse_project(self, response):
        """Placeholder: only report which project page was rendered."""
        print(response.url)

15
settings.py Normal file
View File

@ -0,0 +1,15 @@
# Scrapy settings that wire scrapy-splash into the project. The names, classes
# and priorities are the ones given in the scrapy-splash README.

# Base URL of the Splash HTTP API.
SPLASH_URL = 'http://localhost:8050/'

# Splash-aware duplicate filter and HTTP cache storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Mapping of downloader middleware path -> priority.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}