initial commit
This commit is contained in:
parent
033e935d5c
commit
cba5305d4a
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
.idea
|
||||||
|
.ipynb_checkpoints
|
||||||
|
__pycache__
|
||||||
|
*.pyc
|
||||||
|
ghostdriver.log
|
211
Collect comments for one page.ipynb
Normal file
211
Collect comments for one page.ipynb
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import selenium\n",
|
||||||
|
"import time\n",
|
||||||
|
"import datetime\n",
|
||||||
|
"from selenium import webdriver\n",
|
||||||
|
"from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
|
||||||
|
"from bs4 import BeautifulSoup\n",
|
||||||
|
"\n",
|
||||||
|
"from IPython.display import display, Image, HTML\n",
|
||||||
|
"\n",
|
||||||
|
"from jupyter_progressbar import ProgressBar\n",
|
||||||
|
"import json"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def remove_kickstarter_url_prefix(url):\n",
|
||||||
|
" if url.startswith('https://www.kickstarter.com/'):\n",
|
||||||
|
" return url[len('https://www.kickstarter.com'):]\n",
|
||||||
|
" return url"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"driver = webdriver.Chrome()\n",
|
||||||
|
"\n",
|
||||||
|
"root = 'https://www.kickstarter.com/'\n",
|
||||||
|
"driver.get(root)\n",
|
||||||
|
"\n",
|
||||||
|
"discover_links = {\n",
|
||||||
|
" link\n",
|
||||||
|
" for link in driver.find_elements_by_tag_name('a')\n",
|
||||||
|
" for link in [link.get_attribute('href')]\n",
|
||||||
|
" for link in [remove_kickstarter_url_prefix(link)]\n",
|
||||||
|
" if link.startswith(\"/discover/\")\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"driver.close()\n",
|
||||||
|
"driver.quit()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Request-sent\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"try:\n",
|
||||||
|
" driver.close()\n",
|
||||||
|
" driver.quit()\n",
|
||||||
|
"except Exception as e:\n",
|
||||||
|
" print(e)\n",
|
||||||
|
"\n",
|
||||||
|
"driver = webdriver.Chrome()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "990f33c96fef49aebb4caaf7df72e20f",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>VBox</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='<b>0</b>s passed', placeholder='0%'))), HTML(value='<b>0</b>% or <b>0</b> of <b>0</b> done', placeholder='0%')))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ename": "KeyboardInterrupt",
|
||||||
|
"evalue": "",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[0;32m<ipython-input-27-9f1242042551>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdiscover_link\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdiscover_links\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdiscover_link\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mproject\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mProgressBar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_all_projects\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0mprojects\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;32m~/.virtualenvs/kickstarter/lib/python3.5/site-packages/jupyter_progressbar/__init__.py\u001b[0m in \u001b[0;36mProgressBar\u001b[0;34m(iter, size)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mtsq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 32\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 33\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0msize\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msize\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;32m<ipython-input-27-9f1242042551>\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mn_wait\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||||
|
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"projects = dict()\n",
|
||||||
|
"\n",
|
||||||
|
"class get_all_projects:\n",
|
||||||
|
" def __init__(self, driver):\n",
|
||||||
|
" self.driver = driver\n",
|
||||||
|
" self.total_comments = next(\n",
|
||||||
|
" int(element.text.replace(' projects', '').replace(',', ''))\n",
|
||||||
|
" for element in driver.find_elements_by_class_name('count')\n",
|
||||||
|
" if element.text.endswith(' projects')\n",
|
||||||
|
" )\n",
|
||||||
|
" \n",
|
||||||
|
" def __iter__(self):\n",
|
||||||
|
" done = set()\n",
|
||||||
|
" driver.execute_script(\"$('.load_more > a').click()\")\n",
|
||||||
|
" n_wait = 0\n",
|
||||||
|
" \n",
|
||||||
|
" while driver.execute_script(\"return $('.load_more > a').length\") > 0:\n",
|
||||||
|
" n_wait += 1\n",
|
||||||
|
" n_projects = driver.execute_script(\"return $('*[data-project]').length\")\n",
|
||||||
|
" if n_projects > 0 or n_wait > 5:\n",
|
||||||
|
" driver.execute_script(\"$('.load_more > a').click()\")\n",
|
||||||
|
" \n",
|
||||||
|
" for item in driver.find_elements_by_css_selector('*[data-project]'):\n",
|
||||||
|
" project = json.loads(item.get_attribute('data-project'))\n",
|
||||||
|
" if project['id'] not in done:\n",
|
||||||
|
" done.add(project['id'])\n",
|
||||||
|
" driver.execute_script('$(\"*[data-project_pid=%d]\").parent().remove()' % project['id'])\n",
|
||||||
|
" yield project\n",
|
||||||
|
" n_wait = 0\n",
|
||||||
|
" time.sleep(0.5)\n",
|
||||||
|
" \n",
|
||||||
|
" def __len__(self):\n",
|
||||||
|
" return self.total_comments\n",
|
||||||
|
" \n",
|
||||||
|
"for discover_link in discover_links:\n",
|
||||||
|
" driver.get(root + discover_link)\n",
|
||||||
|
" for project in ProgressBar(get_all_projects(driver)):\n",
|
||||||
|
" projects[project['id']] = project\n",
|
||||||
|
" break"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 121,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"https://www.kickstarter.com//discover/newest?ref=discovery_overlay\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(root + discover_link)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.5.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
1620
Get all comments for a project.ipynb
Normal file
1620
Get all comments for a project.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
98
scrape.py
Normal file
98
scrape.py
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
import base64
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from scrapy_splash import SplashRequest
|
||||||
|
|
||||||
|
|
||||||
|
class ExploreSpider(scrapy.Spider):
    """Walk Kickstarter from the front page to discover listings to projects.

    Every page is fetched through Splash (``SplashRequest``) so that the
    site's JavaScript runs before the HTML is inspected:

    1. ``start_requests`` renders each URL in ``start_urls``.
    2. ``parse_explore`` schedules the ``/discover/`` links found there.
    3. ``parse_discover`` prints the counters returned by the Lua script
       and, when ``follow_projects`` is enabled, schedules every
       ``/projects/`` link on the listing.
    4. ``parse_project`` currently only prints the URL it received.

    NOTE(review): this class was reconstructed from a listing whose
    indentation was lost.  The defaults of ``max_discover_pages`` and
    ``follow_projects`` mirror the bare ``return`` statements that
    previously short-circuited the crawl; confirm that the first of them
    really sat inside the ``/discover/`` branch of ``parse_explore``.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    # Scheme and host without a trailing slash; site-relative links
    # (which start with "/") are appended to it.
    SITE_ROOT = 'https://www.kickstarter.com'

    # Stop after scheduling this many discover listings; None = no limit.
    max_discover_pages = 1

    # When False, parse_discover only prints its debug counters and does
    # not schedule any project page.
    follow_projects = False

    # Lua script for discover listings.  It is built from two adjacent
    # string literals so the disabled polling loop can live between them
    # as Python comments without being sent to Splash.
    DISCOVER_LUA = (
        """
        function main(splash)
            assert(splash:go(splash.args.url))
            assert(splash:wait(1))
            local n_comments = -1

            splash:runjs("$('.load_more > a').click()")
            assert(splash:wait(8))
            splash:runjs("$('.load_more > a').click()")
            assert(splash:wait(8))

        """
        # Disabled: keep clicking "load more" until the link disappears.
        # while splash:evaljs("$('.load_more > a:visible').length") > 0 do
        #     if (splash:evaljs("$('*[data-pid]').length") ~= n_comments) then
        #         splash:runjs("$('.load_more > a').click()")
        #     end
        #     n_comments = splash:evaljs("$('*[data-pid]').length")
        #     assert(splash:wait(0.5))
        #     break
        # end
        """
            return {
                n0 = splash:evaljs("$('.load_more > a:visible').length"),
                n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
                m = splash:evaljs("$('*[data-pid]').length"),
                html = splash:html(),
            }
        end
        """
    )

    # Lua script for project pages: click "older comments" whenever the
    # number of matched comment elements changed since the previous poll,
    # until the link is no longer visible, then return the HTML.
    PROJECT_LUA = """
        function main(splash)
            assert(splash:go(splash.args.url))
            assert(splash:wait(1))
            local n_comments = -1

            while splash:evaljs("$('.older_comments:visible').length") > 0 do
                print(splash:evaljs("$('.older_comments:visible').length"))
                if (splash:evaljs("$('li.comments').length") ~= n_comments) then
                    splash:runjs("$('.older_comments').click()")
                end
                n_comments = splash:evaljs("$('li.comments').length")
                assert(splash:wait(0.5))
            end
            return {
                html = splash:html(),
            }
        end
        """

    @classmethod
    def _site_relative(cls, link):
        """Return *link* without the site prefix, keeping its leading "/".

        Links that do not start with ``SITE_ROOT + '/'`` are returned
        unchanged.
        """
        if link.startswith(cls.SITE_ROOT + '/'):
            return link[len(cls.SITE_ROOT):]
        return link

    def start_requests(self):
        """Yield one Splash-rendered request per entry in ``start_urls``."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def parse_explore(self, response):
        """Yield Splash requests for the ``/discover/`` links in *response*.

        At most ``max_discover_pages`` requests are produced (all of them
        when that attribute is ``None``).
        """
        scheduled = 0
        for href in response.xpath('//a/@href').extract():
            link = self._site_relative(href)
            if not link.startswith("/discover/"):
                continue
            yield SplashRequest(
                self.SITE_ROOT + link,
                self.parse_discover,
                endpoint='execute',
                args={'lua_source': self.DISCOVER_LUA},
            )
            scheduled += 1
            if (self.max_discover_pages is not None
                    and scheduled >= self.max_discover_pages):
                return

    def parse_discover(self, response):
        """Print the script's counters; optionally follow project links.

        ``response.data`` is the table returned by ``DISCOVER_LUA``; every
        key except the bulky ``html`` one is printed.  Project requests
        are yielded only when ``follow_projects`` is true.
        """
        print('*' * 60)
        print({k: v for k, v in response.data.items() if k != 'html'})
        print('*' * 60)

        if not self.follow_projects:
            return

        urls = set()
        for href in response.xpath('//a/@href').extract():
            link = self._site_relative(href)
            if link.startswith("/projects/"):
                url = self.SITE_ROOT + link
                urls.add(url)
                yield SplashRequest(
                    url,
                    self.parse_project,
                    # Without the 'execute' endpoint the request goes to
                    # Splash's default rendering endpoint and lua_source
                    # is never run.
                    endpoint='execute',
                    args={'lua_source': self.PROJECT_LUA},
                )

        print('*' * 20, response.url, len(urls), urls)

    def parse_project(self, response):
        """Print the URL of the rendered project page (placeholder)."""
        print(response.url)
|
15
settings.py
Normal file
15
settings.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
# Scrapy settings that route requests through a Splash rendering service
# by way of the scrapy_splash package.

# Downloader middlewares and their order values.  The two scrapy_splash
# entries are placed before HttpCompressionMiddleware (lower number first).
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Address of the Splash HTTP API; expects a Splash instance listening
# locally on port 8050.
SPLASH_URL = 'http://localhost:8050/'

# Spider middleware provided by scrapy_splash.
# NOTE(review): presumably de-duplicates repeated Splash arguments such as
# lua_source between requests - confirm against the scrapy_splash docs.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Splash-aware replacements for Scrapy's duplicate filter and HTTP cache
# storage.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
|
||||||
|
|
Loading…
Reference in New Issue
Block a user