kickstarter/scrape.py

99 lines
3.0 KiB
Python

import base64
import scrapy
from scrapy_splash import SplashRequest
class ExploreSpider(scrapy.Spider):
    """Crawl Kickstarter through Splash: home page -> discover page -> projects.

    The flow is ``start_requests`` -> ``parse_explore`` -> ``parse_discover``
    -> ``parse_project``.  Every request is a ``SplashRequest``; the discover
    and project pages are driven by the Lua scripts below, which click the
    page's "load more" style links via jQuery before returning the HTML.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    # Site origin without a trailing slash; site-relative paths are appended.
    _BASE_URL = 'https://www.kickstarter.com'

    # Splash script for a /discover/ page.  It clicks the "load more" link
    # exactly twice, waiting 8 s after each click, then returns the rendered
    # HTML together with a few diagnostic counters (n0, n1, m).  A variant
    # that looped until the link was no longer visible had been commented
    # out by the author, so the fixed click count is kept here.
    _DISCOVER_LUA = """
    function main(splash)
        assert(splash:go(splash.args.url))
        assert(splash:wait(1))
        splash:runjs("$('.load_more > a').click()")
        assert(splash:wait(8))
        splash:runjs("$('.load_more > a').click()")
        assert(splash:wait(8))
        return {
            n0 = splash:evaljs("$('.load_more > a:visible').length"),
            n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
            m = splash:evaljs("$('*[data-pid]').length"),
            html = splash:html(),
        }
    end
    """

    # Splash script for a /projects/ page.  While an "older comments" control
    # is visible it clicks it, but only when the number of ``li.comments``
    # elements changed since the previous pass, and polls every 0.5 s.
    # ``n_comments`` is declared local so the script no longer relies on an
    # implicit global; -1 guarantees the first comparison triggers a click.
    _PROJECT_LUA = """
    function main(splash)
        assert(splash:go(splash.args.url))
        assert(splash:wait(1))
        local n_comments = -1
        while splash:evaljs("$('.older_comments:visible').length") > 0 do
            print(splash:evaljs("$('.older_comments:visible').length"))
            if (splash:evaljs("$('li.comments').length") ~= n_comments) then
                splash:runjs("$('.older_comments').click()")
            end
            n_comments = splash:evaljs("$('li.comments').length")
            assert(splash:wait(0.5))
        end
        return {
            html = splash:html(),
        }
    end
    """

    def start_requests(self):
        """Yield one Splash-rendered request per start URL.

        Responses are handled by ``parse_explore``.
        """
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def _site_path(self, link):
        """Return ``link`` with the Kickstarter origin stripped, if present.

        Absolute URLs under ``https://www.kickstarter.com/`` become
        site-relative paths (``/discover/...``); any other value is returned
        unchanged.
        """
        if link.startswith(self._BASE_URL + '/'):
            return link[len(self._BASE_URL):]
        return link

    def parse_explore(self, response):
        """Find a ``/discover/`` link on the page and request it via Splash.

        The request uses the ``execute`` endpoint so that ``_DISCOVER_LUA``
        runs; its response is handled by ``parse_discover``.
        """
        for link in response.xpath('//a/@href').extract():
            path = self._site_path(link)
            if not path.startswith('/discover/'):
                continue
            yield SplashRequest(
                self._BASE_URL + path,
                self.parse_discover,
                endpoint='execute',
                args={'lua_source': self._DISCOVER_LUA},
            )
            # NOTE(review): stops after the first /discover/ link, which
            # looks like a debugging limit -- confirm before widening the
            # crawl to every discover link on the page.
            return

    def parse_discover(self, response):
        """Log the script diagnostics and request every project on the page.

        ``response.data`` holds the table returned by ``_DISCOVER_LUA``; all
        of its fields except ``html`` are logged.  Each distinct
        ``/projects/`` link found in the page is then requested once, with
        ``parse_project`` as the callback.

        Previously a bare ``return`` right after the diagnostics made the
        whole project-following part unreachable, so ``parse_project`` was
        never called.
        """
        diagnostics = {k: v for k, v in response.data.items() if k != 'html'}
        self.logger.debug('discover %s diagnostics: %s', response.url, diagnostics)

        urls = set()
        for link in response.xpath('//a/@href').extract():
            path = self._site_path(link)
            if not path.startswith('/projects/'):
                continue
            url = self._BASE_URL + path
            if url in urls:
                # The same project is usually linked several times per card.
                continue
            urls.add(url)
            yield SplashRequest(
                url,
                self.parse_project,
                # Without endpoint='execute' scrapy-splash uses its default
                # render.html endpoint and the Lua script is never run.
                endpoint='execute',
                args={'lua_source': self._PROJECT_LUA},
            )
        self.logger.debug('%s: %d project urls: %s', response.url, len(urls), urls)

    def parse_project(self, response):
        """Handle a rendered project page; currently only logs its URL."""
        self.logger.info(response.url)