99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
|
import base64
|
||
|
|
||
|
import scrapy
|
||
|
from scrapy_splash import SplashRequest
|
||
|
|
||
|
|
||
|
class ExploreSpider(scrapy.Spider):
    """Crawl Kickstarter through Splash: home page -> discover -> projects.

    Every page is fetched with a ``SplashRequest`` so that JavaScript-driven
    content is rendered before link extraction. The discover and project
    pages are rendered through Splash's ``execute`` endpoint, which is the
    endpoint that runs a ``lua_source`` script.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    # Scheme + host without a trailing slash; prepended to site-relative paths.
    site_root = 'https://www.kickstarter.com'

    # Crawl-breadth switches. The defaults keep the narrow, debugging-sized
    # crawl: render one discover page, log its diagnostics, and stop.
    # NOTE(review): confirm these defaults before running a full crawl.
    first_discover_only = True   # stop after scheduling the first /discover/ link
    follow_projects = False      # when True, follow /projects/ links as well

    # Renders a discover page and presses "load more" twice, waiting for each
    # batch. Besides the HTML it returns counters (n0, n1, m) that
    # parse_discover logs.
    _DISCOVER_LUA = """
    function main(splash)
        assert(splash:go(splash.args.url))
        assert(splash:wait(1))

        for _ = 1, 2 do
            splash:runjs("$('.load_more > a').click()")
            assert(splash:wait(8))
        end

        return {
            n0 = splash:evaljs("$('.load_more > a:visible').length"),
            n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
            m = splash:evaljs("$('*[data-pid]').length"),
            html = splash:html(),
        }
    end
    """

    # Renders a project page and keeps pressing "older comments" while the
    # button is visible. A click is only issued once the comment count has
    # changed since the previous poll, so a pending load is not re-triggered.
    # The poll budget bounds the loop: a page whose button never disappears
    # returns the HTML loaded so far instead of spinning until Splash gives up.
    _PROJECT_LUA = """
    function main(splash)
        assert(splash:go(splash.args.url))
        assert(splash:wait(1))

        local n_comments = -1
        local polls_left = 40
        while polls_left > 0
              and splash:evaljs("$('.older_comments:visible').length") > 0 do
            if splash:evaljs("$('li.comments').length") ~= n_comments then
                splash:runjs("$('.older_comments').click()")
            end
            n_comments = splash:evaljs("$('li.comments').length")
            assert(splash:wait(0.5))
            polls_left = polls_left - 1
        end

        return {
            html = splash:html(),
        }
    end
    """

    def start_requests(self):
        """Yield a Splash-rendered request for each entry in ``start_urls``."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def _site_links(self, response, path_prefix):
        """Yield absolute URLs of on-site links whose path starts with a prefix.

        Hrefs that are already absolute under ``site_root`` are reduced to
        their path first, so absolute and root-relative links are treated
        alike. Each URL is yielded at most once per response.

        Args:
            response: response whose ``//a/@href`` values are scanned.
            path_prefix: required start of the path, e.g. ``'/discover/'``.
        """
        absolute_prefix = self.site_root + '/'
        seen = set()
        for href in response.xpath('//a/@href').extract():
            if href.startswith(absolute_prefix):
                href = href[len(self.site_root):]
            if not href.startswith(path_prefix):
                continue
            url = self.site_root + href
            if url not in seen:
                seen.add(url)
                yield url

    def parse_explore(self, response):
        """Schedule discover pages linked from the rendered home page.

        Yields one ``SplashRequest`` per ``/discover/`` link, or only the
        first one while ``first_discover_only`` is true.
        """
        for url in self._site_links(response, '/discover/'):
            yield SplashRequest(
                url,
                self.parse_discover,
                endpoint='execute',
                args={'lua_source': self._DISCOVER_LUA},
            )
            if self.first_discover_only:
                return

    def parse_discover(self, response):
        """Log a discover page's diagnostics and optionally follow projects.

        ``response.data`` is the table returned by the discover Lua script;
        everything except the bulky ``html`` entry is logged. When
        ``follow_projects`` is true, one ``SplashRequest`` is yielded per
        distinct ``/projects/`` link on the page.
        """
        diagnostics = {k: v for k, v in response.data.items() if k != 'html'}
        self.logger.info('discover page %s: %s', response.url, diagnostics)

        if not self.follow_projects:
            return

        project_urls = list(self._site_links(response, '/projects/'))
        self.logger.info(
            'discover page %s links to %d project(s): %s',
            response.url, len(project_urls), project_urls,
        )
        for url in project_urls:
            yield SplashRequest(
                url,
                self.parse_project,
                # The script in lua_source only runs on the execute endpoint.
                endpoint='execute',
                args={'lua_source': self._PROJECT_LUA},
            )

    def parse_project(self, response):
        """Log the URL of a rendered project page (no extraction yet)."""
        self.logger.info('project page %s', response.url)
|