"""Scrapy spider that walks kickstarter.com through Splash-rendered pages."""
import base64

import scrapy
from scrapy_splash import SplashRequest

# Scheme and host shared by every URL this spider builds.
BASE_URL = 'https://www.kickstarter.com'

# Lua script for the Splash ``execute`` endpoint, used for /discover/ pages.
# It opens the page, clicks the ``.load_more > a`` link twice (waiting 8 s
# after each click) and returns the rendered HTML plus three counters that
# ``parse_discover`` prints for debugging.
#
# A polling variant was tried in place of the two fixed clicks; it is kept
# here, disabled, for reference:
#     while splash:evaljs("$('.load_more > a:visible').length") > 0 do
#         if (splash:evaljs("$('*[data-pid]').length") ~= n_comments) then
#             splash:runjs("$('.load_more > a').click()")
#         end
#         n_comments = splash:evaljs("$('*[data-pid]').length")
#         assert(splash:wait(0.5))
#         break
#     end
_DISCOVER_LUA = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(1))
    local n_comments = -1
    splash:runjs("$('.load_more > a').click()")
    assert(splash:wait(8))
    splash:runjs("$('.load_more > a').click()")
    assert(splash:wait(8))
    return {
        n0 = splash:evaljs("$('.load_more > a:visible').length"),
        n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
        m = splash:evaljs("$('*[data-pid]').length"),
        html = splash:html(),
    }
end
"""

# Lua script for the Splash ``execute`` endpoint, used for /projects/ pages.
# While an ``.older_comments`` element is visible it clicks it whenever the
# number of ``li.comments`` elements changed since the previous pass, polling
# every 0.5 s, and finally returns the rendered HTML.
# ``n_comments`` is declared ``local`` so the script does not create a Lua
# global (the discover script above declares it the same way).
# NOTE(review): the loop has no iteration cap of its own; it ends only when
# the element stops being visible or Splash aborts the script.
_PROJECT_LUA = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(1))
    local n_comments = -1
    while splash:evaljs("$('.older_comments:visible').length") > 0 do
        print(splash:evaljs("$('.older_comments:visible').length"))
        if (splash:evaljs("$('li.comments').length") ~= n_comments) then
            splash:runjs("$('.older_comments').click()")
        end
        n_comments = splash:evaljs("$('li.comments').length")
        assert(splash:wait(0.5))
    end
    return {
        html = splash:html(),
    }
end
"""


class ExploreSpider(scrapy.Spider):
    """Follow front page -> ``/discover/`` pages -> ``/projects/`` pages.

    All requests are issued as ``SplashRequest`` objects. Discover and
    project pages are rendered by the Lua scripts defined at module level.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    # Debugging switches. The defaults reproduce the spider's previous
    # hard-coded behaviour: request a single discover page, print its
    # counters and stop there.
    #
    # NOTE(review): the original file was whitespace-collapsed, so the exact
    # nesting of the early ``return`` in ``parse_explore`` is assumed to be
    # inside the ``/discover/`` branch -- confirm against version control.
    first_discover_only = True   # stop after the first /discover/ request
    follow_projects = False      # when True, schedule /projects/ requests

    def start_requests(self):
        """Yield a Splash request for each start URL, handled by ``parse_explore``."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def _site_links(self, response, path_prefix):
        """Yield absolute URLs for anchors whose site path starts with ``path_prefix``.

        Hrefs that already begin with ``BASE_URL + '/'`` are reduced to their
        path first, so absolute and site-relative links are treated alike.
        Links are yielded in document order and are not de-duplicated.
        """
        for href in response.xpath('//a/@href').extract():
            if href.startswith(BASE_URL + '/'):
                href = href[len(BASE_URL):]
            if href.startswith(path_prefix):
                yield BASE_URL + href

    def parse_explore(self, response):
        """Schedule Splash ``execute`` requests for ``/discover/`` links.

        With ``first_discover_only`` set (the default) the generator ends
        right after the first request has been yielded.
        """
        for url in self._site_links(response, '/discover/'):
            yield SplashRequest(
                url,
                self.parse_discover,
                endpoint='execute',
                args={'lua_source': _DISCOVER_LUA},
            )
            if self.first_discover_only:
                return

    def parse_discover(self, response):
        """Print the discover script's counters and optionally follow projects.

        ``response.data`` is the table returned by ``_DISCOVER_LUA``; every
        key except ``html`` is printed. Unless ``follow_projects`` is set the
        method stops there, otherwise it yields one Splash request per
        ``/projects/`` link and prints a summary of the distinct URLs seen.
        """
        print('*' * 60)
        print({k: v for k, v in response.data.items() if k != 'html'})
        print('*' * 60)
        if not self.follow_projects:
            return

        urls = set()
        for url in self._site_links(response, '/projects/'):
            urls.add(url)
            yield SplashRequest(
                url,
                self.parse_project,
                # ``lua_source`` is only honoured by the ``execute`` endpoint;
                # the default ``render.html`` endpoint would ignore the script.
                endpoint='execute',
                args={'lua_source': _PROJECT_LUA},
            )
        print('*' * 20, response.url, len(urls), urls)

    def parse_project(self, response):
        """Print the URL of a rendered project page."""
        print(response.url)