def remove_kickstarter_url_prefix(url):
    """Turn an absolute kickstarter.com URL into a site-relative path.

    The scheme and host are stripped but the leading slash is kept, so
    ``'https://www.kickstarter.com/discover/x'`` becomes ``'/discover/x'``.
    Because the result starts with ``'/'``, join it to an origin that has
    no trailing slash to avoid a doubled ``//`` in the rebuilt URL.

    Args:
        url: The href to normalise. May be ``None`` or empty, which is what
            an anchor without an ``href`` attribute produces.

    Returns:
        The site-relative path for kickstarter.com URLs, the input
        unchanged for any other URL, and ``''`` when no URL was given.
    """
    origin = 'https://www.kickstarter.com'

    # Anchors without an href yield None; returning '' lets callers keep
    # using str methods such as .startswith() on the result.
    if not url:
        return ''

    # Require the slash after the host so look-alike hosts such as
    # 'https://www.kickstarter.com.example/' are left untouched.
    if url.startswith(origin + '/'):
        return url[len(origin):]
    return url
print(e)\n", + "\n", + "driver = webdriver.Chrome()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "990f33c96fef49aebb4caaf7df72e20f", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "

Failed to display Jupyter Widget of type VBox.

\n", + "

\n", + " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", + " that the widgets JavaScript is still loading. If this message persists, it\n", + " likely means that the widgets JavaScript library is either not installed or\n", + " not enabled. See the Jupyter\n", + " Widgets Documentation for setup instructions.\n", + "

\n", + "

\n", + " If you're reading this message in another frontend (for example, a static\n", + " rendering on GitHub or NBViewer),\n", + " it may mean that your frontend doesn't currently support widgets.\n", + "

\n" + ], + "text/plain": [ + "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='0s passed', placeholder='0%'))), HTML(value='0% or 0 of 0 done', placeholder='0%')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdiscover_link\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdiscover_links\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mroot\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdiscover_link\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 37\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mproject\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mProgressBar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget_all_projects\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 38\u001b[0m \u001b[0mprojects\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mproject\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mproject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/kickstarter/lib/python3.5/site-packages/jupyter_progressbar/__init__.py\u001b[0m in \u001b[0;36mProgressBar\u001b[0;34m(iter, size)\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mtsq\u001b[0m \u001b[0;34m=\u001b[0m 
class get_all_projects:
    """Iterable over every project listed on the currently loaded page.

    The page is driven through jQuery snippets run in the browser, so it
    is assumed to expose ``$`` (TODO confirm for every discover page).
    Each project is read from the JSON stored in an element's
    ``data-project`` attribute and is yielded at most once, keyed on its
    ``'id'`` field.

    ``len()`` reports the total announced by the page's "N projects"
    counter, which lets progress bars size themselves up front.

    Args:
        driver: A Selenium WebDriver already navigated to the listing page.

    Raises:
        StopIteration: from the constructor, when no element with class
            ``count`` has text ending in ``' projects'``.
    """

    _CLICK_LOAD_MORE = "$('.load_more > a').click()"
    _COUNT_LOAD_MORE = "return $('.load_more > a').length"
    _COUNT_PROJECTS = "return $('*[data-project]').length"

    def __init__(self, driver):
        self.driver = driver
        # Read each element's text once; every access to .text is a
        # separate call on the element.
        texts = (
            element.text
            for element in driver.find_elements_by_class_name('count')
        )
        # NOTE: the attribute name is historical -- it holds the number of
        # *projects*; it is kept so existing readers keep working.
        self.total_comments = next(
            int(text.replace(' projects', '').replace(',', ''))
            for text in texts
            if text.endswith(' projects')
        )

    def _harvest(self, done):
        """Yield projects currently in the DOM that are not yet in *done*.

        Every newly seen project id is added to *done*, and the parent of
        its ``data-project_pid`` element is removed from the page before
        the project is yielded, so later passes do not see it again.
        """
        driver = self.driver
        for item in driver.find_elements_by_css_selector('*[data-project]'):
            project = json.loads(item.get_attribute('data-project'))
            if project['id'] in done:
                continue
            done.add(project['id'])
            driver.execute_script(
                '$("*[data-project_pid=%d]").parent().remove()' % project['id']
            )
            yield project

    def __iter__(self):
        # Always use the driver this instance was built with rather than
        # whatever a global named ``driver`` happens to be bound to.
        driver = self.driver
        done = set()
        driver.execute_script(self._CLICK_LOAD_MORE)
        n_wait = 0

        while driver.execute_script(self._COUNT_LOAD_MORE) > 0:
            n_wait += 1
            n_projects = driver.execute_script(self._COUNT_PROJECTS)
            # Click again once projects are present, or after more than
            # five empty polls in a row.
            if n_projects > 0 or n_wait > 5:
                driver.execute_script(self._CLICK_LOAD_MORE)

            for project in self._harvest(done):
                n_wait = 0
                yield project
            time.sleep(0.5)

        # The loop stops as soon as the "load more" link is gone, but the
        # projects loaded last (or a listing that never had the link) are
        # still in the DOM -- collect them before finishing.
        yield from self._harvest(done)

    def __len__(self):
        return self.total_comments
comments for a project.ipynb new file mode 100644 index 0000000..81f33a2 --- /dev/null +++ b/Get all comments for a project.ipynb @@ -0,0 +1,1620 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "import selenium\n", + "import time\n", + "import datetime\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n", + "from bs4 import BeautifulSoup\n", + "\n", + "from IPython.display import display, Image, HTML\n", + "\n", + "from jupyter_progressbar import ProgressBar\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "driver = webdriver.Chrome()\n", + "\n", + "driver.get('https://www.kickstarter.com/projects/inspero/vinci-20-worlds-first-standalone-ai-sports-headpho/comments')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "64af8363727e4f95bd9a730f5c91acc5", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "

Failed to display Jupyter Widget of type VBox.

\n", + "

\n", + " If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n", + " that the widgets JavaScript is still loading. If this message persists, it\n", + " likely means that the widgets JavaScript library is either not installed or\n", + " not enabled. See the Jupyter\n", + " Widgets Documentation for setup instructions.\n", + "

\n", + "

\n", + " If you're reading this message in another frontend (for example, a static\n", + " rendering on GitHub or NBViewer),\n", + " it may mean that your frontend doesn't currently support widgets.\n", + "

\n" + ], + "text/plain": [ + "VBox(children=(HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='0s passed', placeholder='0%'))), HTML(value='0% or 0 of 0 done', placeholder='0%')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1833678544\n", + "839015508\n", + "458527266\n", + "2001994789\n", + "1217263212\n", + "889917255\n", + "395880877\n", + "395880877\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "495300557\n", + "495300557\n", + "205681801\n", + "643428\n", + "jamesvo66\n", + "1864830176\n", + "458527266\n", + "839015508\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "761485634\n", + "1608275255\n", + "1232829437\n", + "1232829437\n", + "688462716\n", + "1833678544\n", + "498306784\n", + "1923877728\n", + "1233568244\n", + "458527266\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1561880636\n", + "1369960534\n", + "1882731028\n", + "796017561\n", + "611328595\n", + "794959240\n", + "794959240\n", + "794959240\n", + "944540369\n", + "1635607059\n", + "1810929003\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "796017561\n", + "1196724163\n", + "588920077\n", + "237902823\n", + "1511452974\n", + "nyhotdog\n", + "1751366140\n", + "1090197121\n", + "1479259025\n", + "1882731028\n", + "1036267918\n", + "1196506039\n", + "1849808531\n", + "1082496859\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1668507980\n", + "1001248356\n", + "1699988054\n", + "794959240\n", + "794959240\n", + "794959240\n", + 
"794959240\n", + "794959240\n", + "794959240\n", + "1369960534\n", + "1882731028\n", + "796017561\n", + "611328595\n", + "794959240\n", + "794959240\n", + "794959240\n", + "944540369\n", + "1635607059\n", + "1810929003\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "796017561\n", + "1196724163\n", + "588920077\n", + "237902823\n", + "1511452974\n", + "nyhotdog\n", + "1751366140\n", + "1090197121\n", + "1479259025\n", + "1882731028\n", + "1036267918\n", + "1196506039\n", + "1849808531\n", + "1082496859\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1668507980\n", + "1001248356\n", + "1699988054\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "518370125\n", + "595774491\n", + "1001248356\n", + "728253404\n", + "410446349\n", + "1862429352\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1383084788\n", + "588920077\n", + "1383084788\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1082496859\n", + "1018335009\n", + "2043281165\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1695443028\n", + "1695443028\n", + "347211813\n", + "347211813\n", + "984721398\n", + "1864830176\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1402676220\n", + "355097478\n", + "588920077\n", + "1342165126\n", + "1337628871\n", + "794959240\n", + "794959240\n", + "794959240\n", + "2044662041\n", + "9214568\n", + "1635607059\n", + "794959240\n", + "794959240\n", + "1997448604\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "370126996\n", + "1161261191\n", + "1699992933\n", + "334387016\n", + "2098214317\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + 
"794959240\n", + "794959240\n", + "1018335009\n", + "665155733\n", + "1864830176\n", + "shivasiddharth\n", + "shivasiddharth\n", + "458322083\n", + "azirius\n", + "1633004206\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "minari\n", + "minari\n", + "minari\n", + "479933865\n", + "shivasiddharth\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "665155733\n", + "1051575684\n", + "1736145611\n", + "384930616\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "673029263\n", + "796052291\n", + "329203707\n", + "1208694063\n", + "329203707\n", + "794959240\n", + "794959240\n", + "31153046\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "shivasiddharth\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "383123477\n", + "383123477\n", + "383123477\n", + "383123477\n", + "383123477\n", + "301690636\n", + "301690636\n", + "301690636\n", + "677180350\n", + "677180350\n", + "677180350\n", + "677180350\n", + "677180350\n", + "677180350\n", + "301690636\n", + "301690636\n", + "301690636\n", + "383123477\n", + "383123477\n", + "383123477\n", + "671121685\n", + "671121685\n", + "671121685\n", + "671121685\n", + "247626573\n", + "shivasiddharth\n", + "873037190\n", + "796017561\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "344091852\n", + "114788583\n", + "shivasiddharth\n", + "787017813\n", + "1018199754\n", + "jessslead\n", + "1024738362\n", + "1494317641\n", + "1214658227\n", + "1471890730\n", + "inspero\n", + 
"jamesvo66\n", + "1208694063\n", + "1208694063\n", + "1957484080\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1890562474\n", + "1208694063\n", + "1862429352\n", + "1979910459\n", + "972291987\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "366980299\n", + "shivasiddharth\n", + "shivasiddharth\n", + "1930498551\n", + "212692344\n", + "1862429352\n", + "972291987\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "shivasiddharth\n", + "shivasiddharth\n", + "334387016\n", + "728253404\n", + "1862429352\n", + "95691339\n", + "1862429352\n", + "1862429352\n", + "344091852\n", + "653537333\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "796017561\n", + "1494317641\n", + "518370125\n", + "518370125\n", + "1979910459\n", + "819120336\n", + "1979910459\n", + "728253404\n", + "jaiaravindj\n", + "794959240\n", + "794959240\n", + "276311303\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1018335009\n", + "794959240\n", + "794959240\n", + "794959240\n", + "459875215\n", + "1614890600\n", + "518370125\n", + "1051575684\n", + "690276919\n", + "796017561\n", + "1077783542\n", + "794959240\n", + "794959240\n", + "794959240\n", + "75341017\n", + "75341017\n", + "796017561\n", + "1617342515\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1337628871\n", + "878996580\n", + "518370125\n", + "1488592791\n", + "1000694536\n", + "639982179\n", + "1979910459\n", + "1635607059\n", + "2112136110\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1184017324\n", + "1184017324\n", + "felixkim\n", + "2145345421\n", + 
"1233568244\n", + "1012380866\n", + "770423085\n", + "794959240\n", + "92748688\n", + "794959240\n", + "794959240\n", + "794959240\n", + "2044177248\n", + "907670727\n", + "1422458250\n", + "794959240\n", + "794959240\n", + "75341017\n", + "728253404\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1494317641\n", + "1979910459\n", + "2011132359\n", + "2043908614\n", + "344091852\n", + "16380284\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1494317641\n", + "jjl\n", + "643428\n", + "643428\n", + "971896604\n", + "344091852\n", + "1051575684\n", + "1614890600\n", + "794959240\n", + "794959240\n", + "794959240\n", + "643428\n", + "1614890600\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1024738362\n", + "643428\n", + "1635607059\n", + "1729640748\n", + "728253404\n", + "728253404\n", + "1979910459\n", + "907670727\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "95691339\n", + "2043908614\n", + "2043908614\n", + "1942686605\n", + "794959240\n", + "inspero\n", + "271031935\n", + "1742579183\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1882731028\n", + "301690636\n", + "383123477\n", + "383123477\n", + "383123477\n", + "383123477\n", + "677180350\n", + "677180350\n", + "671121685\n", + "671121685\n", + "671121685\n", + "334387016\n", + "philcassell\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1024738362\n", + "796017561\n", + "1284830330\n", + "794959240\n", + "728253404\n", + "794959240\n", + "794959240\n", + "794959240\n", + "344091852\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "464629808\n", + 
"671121685\n", + "796017561\n", + "95691339\n", + "677180350\n", + "383123477\n", + "653537333\n", + "722648632\n", + "728253404\n", + "344091852\n", + "1635607059\n", + "301690636\n", + "301690636\n", + "728253404\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1024738362\n", + "794959240\n", + "cassadee\n", + "1979910459\n", + "677180350\n", + "677180350\n", + "677180350\n", + "383123477\n", + "383123477\n", + "383123477\n", + "383123477\n", + "383123477\n", + "671121685\n", + "671121685\n", + "671121685\n", + "250887333\n", + "728253404\n", + "250887333\n", + "728253404\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "344091852\n", + "677180350\n", + "677180350\n", + "677180350\n", + "2044177248\n", + "301690636\n", + "301690636\n", + "677180350\n", + "677180350\n", + "677180350\n", + "383123477\n", + "383123477\n", + "383123477\n", + "383123477\n", + "671121685\n", + "671121685\n", + "671121685\n", + "794959240\n", + "794959240\n", + "671121685\n", + "671121685\n", + "794959240\n", + "671121685\n", + "794959240\n", + "794959240\n", + "794959240\n", + "873037190\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "301690636\n", + "301690636\n", + "250887333\n", + "794959240\n", + "794959240\n", + "383123477\n", + "383123477\n", + "794959240\n", + "383123477\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "1289035088\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + 
"250887333\n", + "250887333\n", + "250887333\n", + "873037190\n", + "301690636\n", + "383123477\n", + "383123477\n", + "671121685\n", + "671121685\n", + "677180350\n", + "677180350\n", + "677180350\n", + "301690636\n", + "301690636\n", + "301690636\n", + "301690636\n", + "383123477\n", + "383123477\n", + "671121685\n", + "671121685\n", + "1289035088\n", + "671121685\n", + "1289035088\n", + "464629808\n", + "873037190\n", + "1979910459\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "vegasved\n", + "1024738362\n", + "341983418\n", + "1430176509\n", + "344091852\n", + "794959240\n", + "794959240\n", + "464629808\n", + "464629808\n", + "1882731028\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "384930616\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "464629808\n", + "464629808\n", + "794959240\n", + "464629808\n", + "794959240\n", + "677180350\n", + "677180350\n", + "794959240\n", + "677180350\n", + "301690636\n", + "301690636\n", + "794959240\n", + "383123477\n", + "383123477\n", + "383123477\n", + "671121685\n", + "671121685\n", + "794959240\n", + "794959240\n", + "794959240\n", + "2044177248\n", + "inspero\n", + "728253404\n", + "1430176509\n", + "1430176509\n", + "1430176509\n", + "2043281165\n", + "1289035088\n", + "1430176509\n", + "1430176509\n", + "344091852\n", + "344091852\n", + "794959240\n", + "1472374881\n", + "1472374881\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "344091852\n", + "1289035088\n", + "659523922\n", + "158797475\n", + "inspero\n", + "1882731028\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1024738362\n", + "1119502946\n", + "344091852\n", + "1030938781\n", + "1936314241\n", + "2011132359\n", + 
"344091852\n", + "unicornwerewolf\n", + "1289035088\n", + "794959240\n", + "794959240\n", + "645845854\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1641264279\n", + "2011132359\n", + "2011132359\n", + "2011132359\n", + "653537333\n", + "653537333\n", + "231563443\n", + "231563443\n", + "794959240\n", + "1024738362\n", + "794959240\n", + "shivasiddharth\n", + "794959240\n", + "794959240\n", + "160648997\n", + "2011132359\n", + "794959240\n", + "shivasiddharth\n", + "794959240\n", + "1255315769\n", + "794959240\n", + "971896604\n", + "794959240\n", + "971896604\n", + "794959240\n", + "794959240\n", + "796017561\n", + "1255315769\n", + "794959240\n", + "794959240\n", + "1942686605\n", + "2011132359\n", + "2011132359\n", + "2011132359\n", + "794959240\n", + "794959240\n", + "18825015\n", + "sparkxster\n", + "794959240\n", + "vegasved\n", + "794959240\n", + "344091852\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "344091852\n", + "1979910459\n", + "636861889\n", + "344091852\n", + "794959240\n", + "1979910459\n", + "794959240\n", + "344091852\n", + "794959240\n", + "inspero\n", + "648827505\n", + "344091852\n", + "794959240\n", + "1641264279\n", + "794959240\n", + "794959240\n", + "794959240\n", + "636861889\n", + "1641264279\n", + "2044177248\n", + "794959240\n", + "794959240\n", + "1030938781\n", + "1742579183\n", + "572566317\n", + "inspero\n", + "2009017451\n", + "794959240\n", + "794959240\n", + "1640795193\n", + "244674104\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "2108752044\n", + "572566317\n", + "796017561\n", + "1742579183\n", + "794959240\n", + "794959240\n", + "344091852\n", + "1024738362\n", + "794959240\n", + "794959240\n", + "315925251\n", + "648827505\n", + "794959240\n", + "794959240\n", + "315925251\n", + "cassadee\n", + "inspero\n", + 
"cassadee\n", + "inspero\n", + "1979910459\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "143183704\n", + "1979910459\n", + "1255315769\n", + "1289035088\n", + "794959240\n", + "794959240\n", + "794959240\n", + "360443789\n", + "mssnglnk\n", + "796017561\n", + "1255315769\n", + "794959240\n", + "794959240\n", + "1063541354\n", + "1328029828\n", + "794959240\n", + "1328029828\n", + "794959240\n", + "794959240\n", + "794959240\n", + "143183704\n", + "347211813\n", + "2043281165\n", + "794959240\n", + "794959240\n", + "95691339\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "265819191\n", + "stelboe\n", + "1471890730\n", + "1024738362\n", + "143183704\n", + "29968412\n", + "794959240\n", + "1472374881\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1255315769\n", + "cassadee\n", + "388904050\n", + "1289035088\n", + "inspero\n", + "inspero\n", + "648827505\n", + "felixkim\n", + "1471890730\n", + "1471890730\n", + "143183704\n", + "794959240\n", + "143183704\n", + "inspero\n", + "212692344\n", + "sparkxster\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "525089662\n", + "102902226\n", + "cassadee\n", + "143183704\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1024738362\n", + "1687709187\n", + "830001776\n", + "830001776\n", + "830001776\n", + "794959240\n", + "1890562474\n", + "794959240\n", + "794959240\n", + "1890562474\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1677728595\n", + "1024738362\n", + "648827505\n", + "1328658675\n", + "1298879274\n", + "1289035088\n", + "794959240\n", + "794959240\n", + "inspero\n", + "572566317\n", + "1024738362\n", + "inspero\n", + "794959240\n", + "1024738362\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1289035088\n", + "648827505\n", + "648827505\n", + "shivasiddharth\n", 
+ "572566317\n", + "794959240\n", + "794959240\n", + "794959240\n", + "cassadee\n", + "shivasiddharth\n", + "inspero\n", + "1472374881\n", + "1472374881\n", + "525089662\n", + "794959240\n", + "1942686605\n", + "inspero\n", + "1472374881\n", + "inspero\n", + "237902823\n", + "212692344\n", + "1289035088\n", + "1472374881\n", + "inspero\n", + "1472374881\n", + "1149889731\n", + "shivasiddharth\n", + "1024738362\n", + "1289035088\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "1957484080\n", + "1957484080\n", + "878996580\n", + "1024738362\n", + "shivasiddharth\n", + "1202235094\n", + "1332825064\n", + "shivasiddharth\n", + "794959240\n", + "794959240\n", + "shivasiddharth\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "inspero\n", + "1298879274\n", + "1184017324\n", + "cassadee\n", + "inspero\n", + "inspero\n", + "cassadee\n", + "796017561\n", + "inspero\n", + "sparkxster\n", + "shivasiddharth\n", + "1024738362\n", + "shivasiddharth\n", + "1880305534\n", + "shivasiddharth\n", + "1530123874\n", + "1101737592\n", + "525089662\n", + "inspero\n", + "shivasiddharth\n", + "1880305534\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "143183704\n", + "unicornwerewolf\n", + "unicornwerewolf\n", + "1472374881\n", + "philcassell\n", + "979724834\n", + "794959240\n", + "felixkim\n", + "794959240\n", + "794959240\n", + "felixkim\n", + "18825015\n", + "inspero\n", + "1472374881\n", + "philcassell\n", + "inspero\n", + "1472374881\n", + "794959240\n", + "794959240\n", + "shivasiddharth\n", + "shivasiddharth\n", + "shivasiddharth\n", + "felixkim\n", + "794959240\n", + "250887333\n", + "866391821\n", + "1024738362\n", + "250887333\n", + "1060921719\n", + "794959240\n", + "1472374881\n", + "794959240\n", + "1472374881\n", + "250887333\n", + "2104848058\n", + "250887333\n", + "983397869\n", + "250887333\n", 
+ "983397869\n", + "250887333\n", + "250887333\n", + "250887333\n", + "2104848058\n", + "2104848058\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "250887333\n", + "524552396\n", + "794959240\n", + "524552396\n", + "794959240\n", + "794959240\n", + "983397869\n", + "983397869\n", + "794959240\n", + "794959240\n", + "794959240\n", + "2104848058\n", + "794959240\n", + "794959240\n", + "2104848058\n", + "794959240\n", + "794959240\n", + "794959240\n", + "2104848058\n", + "794959240\n", + "2104848058\n", + "983397869\n", + "983397869\n", + "794959240\n", + "794959240\n", + "524552396\n", + "794959240\n", + "524552396\n", + "794959240\n", + "felixkim\n", + "2104848058\n", + "333580554\n", + "250887333\n", + "inspero\n", + "1971952145\n", + "inspero\n", + "755417874\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "1472374881\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "250887333\n", + "1640795193\n", + "755417874\n", + "1024738362\n", + "1472374881\n", + "796017561\n", + "1277702896\n", + "inspero\n", + "1890562474\n", + "1060921719\n", + "250887333\n", + "inspero\n", + "1192200229\n", + "inspero\n", + "250887333\n", + "248194716\n", + "248194716\n", + "2030482438\n", + "794959240\n", + "inspero\n", + "542415372\n", + "inspero\n", + "794959240\n", + "955901347\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "794959240\n", + "678644647\n", + "1094742639\n", + "1971952145\n", + "1641855695\n", + "924561261\n", + "1024738362\n", + "1206590166\n", + "1206590166\n", + "inspero\n", + "678989421\n", + "inspero\n", + "2104848058\n", + "inspero\n", + "inspero\n", + 
"525089662\n", + "333580554\n", + "inspero\n", + "524552396\n", + "inspero\n", + "inspero\n", + "inspero\n", + "inspero\n", + "inspero\n", + "inspero\n", + "inspero\n", + "2117733844\n", + "1024738362\n", + "64313570\n", + "inspero\n", + "inspero\n", + "1024738362\n", + "1383695146\n", + "inspero\n", + "inspero\n", + "1060921719\n", + "inspero\n", + "715684691\n", + "159591613\n", + "inspero\n", + "inspero\n", + "1060921719\n", + "715684691\n", + "1060921719\n", + "794959240\n", + "inspero\n", + "1973698103\n", + "794959240\n", + "1973698103\n", + "794959240\n", + "301892085\n", + "794959240\n", + "1472374881\n", + "794959240\n", + "inspero\n", + "2135661401\n", + "794959240\n", + "794959240\n", + "794959240\n", + "likeapunpun\n", + "678644647\n", + "301892085\n", + "794959240\n", + "inspero\n", + "unicornwerewolf\n", + "417197042\n", + "inspero\n", + "525089662\n", + "inspero\n", + "inspero\n", + "inspero\n", + "878996580\n", + "878996580\n", + "301892085\n", + "384930616\n", + "inspero\n", + "384930616\n", + "inspero\n", + "1677728595\n", + "165654883\n", + "1161261191\n", + "27254771\n", + "shivasiddharth\n", + "inspero\n", + "255046417\n", + "794959240\n", + "1255315769\n", + "inspero\n", + "575839374\n", + "inspero\n", + "575839374\n", + "794959240\n", + "794959240\n", + "inspero\n", + "inspero\n", + "794959240\n", + "820909221\n", + "sparkxster\n", + "1144206602\n", + "416760475\n", + "95691339\n", + "inspero\n", + "983397869\n", + "794959240\n", + "575839374\n", + "inspero\n", + "794959240\n", + "333580554\n", + "983397869\n", + "794959240\n", + "416760475\n", + "sparkxster\n", + "95691339\n", + "inspero\n", + "794959240\n", + "1957484080\n", + "inspero\n", + "2044177248\n", + "794959240\n", + "1957484080\n", + "794959240\n", + "sparkxster\n", + "1957484080\n", + "inspero\n", + "skim620\n", + "794959240\n", + "1255315769\n", + "794959240\n", + "1255315769\n", + "794959240\n", + "1640795193\n", + "794959240\n", + "inspero\n", + "noodle4001\n", + 
# ---------------------------------------------------------------------------
# Collect comments for one page.ipynb
#
# The cells below rely on names created by earlier notebook cells:
# `time`, `ProgressBar`, `driver`, `root` and `discover_link`.
# ---------------------------------------------------------------------------

# In[27]:


class get_comments_of_project:
    """Iterable over the profile ids of the commenters of a project page.

    The driver is expected to already show the project's comment page and
    the page is expected to expose jQuery as ``$`` (every script executed
    here uses it).

    Parameters
    ----------
    driver
        Selenium WebDriver positioned on the comment page.

    Attributes
    ----------
    driver
        The WebDriver passed to the constructor.
    total_comments : int
        Comment count read from the page's ``data-value`` attribute; this is
        what ``len()`` reports.
    """

    def __init__(self, driver):
        self.driver = driver
        # Raw string: the selector contains backslash-escaped brackets, which
        # are invalid escape sequences in a regular string literal.
        val = driver.find_element_by_css_selector(
            r"[itemprop=Project\[comments_count\]]"
        ).get_attribute('data-value')
        self.total_comments = int(val)

    def __iter__(self):
        """Yield one profile id per comment, loading older comments on demand.

        The "older comments" link is clicked repeatedly while it is visible.
        Every comment that has been reported is removed from the DOM, so a
        non-zero ``li.comment`` count always means that new comments arrived.
        """
        # Use the driver this instance was built with, not a global one.
        driver = self.driver
        driver.execute_script("$('a.older_comments').click()")
        n_wait = 0

        while driver.execute_script("return $('a.older_comments:visible').length") > 0:
            n_wait += 1
            n_comments = driver.execute_script("return $('li.comment').length")
            # Ask for the next batch once the previous one has arrived, or
            # retry the click after more than five polls without progress.
            if n_comments > 0 or n_wait > 5:
                driver.execute_script("$('a.older_comments').click()")

            # NOTE(review): the original indentation was lost; the wait
            # counter is assumed to be reset for every harvested comment.
            for profile_id in self._drain_loaded_comments():
                yield profile_id
                n_wait = 0
            time.sleep(0.5)

        # The link disappears once the last batch has been requested (or was
        # never there for short threads); harvest what is still on the page.
        yield from self._drain_loaded_comments()

    def _drain_loaded_comments(self):
        """Yield the profile id of every comment currently in the DOM.

        Each comment is removed from the page before its id is yielded so it
        is never reported twice.
        """
        driver = self.driver
        for item in driver.find_elements_by_css_selector('li.comment .avatar.left a'):
            profile_id = item.get_attribute('href').split('/profile/')[1]
            # Third ancestor of the avatar link -- presumably the enclosing
            # `li.comment`; its id is used to delete the whole comment node.
            comment_id = item.find_element_by_xpath('../../..').get_attribute('id')
            driver.execute_script('$("#%s").remove()' % comment_id)
            yield profile_id

    def __len__(self):
        """Return the comment count advertised by the page."""
        return self.total_comments


for profile_id in ProgressBar(get_comments_of_project(driver)):
    print(profile_id)


# In[ ]:


driver.quit()


# In[121]:


# `root` ends with '/' and `discover_link` starts with '/'; strip one of them
# so the printed URL does not contain a double slash.
print(root.rstrip('/') + discover_link)


# ---------------------------------------------------------------------------
# scrape.py
# ---------------------------------------------------------------------------
import base64

import scrapy
from scrapy_splash import SplashRequest


BASE_URL = 'https://www.kickstarter.com'

# Lua script run by Splash for a discover page: clicks "load more" twice with
# a fixed 8 second pause after each click, then returns a few diagnostic
# counters together with the rendered HTML.
#
# A polling variant was drafted but left disabled in the original source:
#
#   while splash:evaljs("$('.load_more > a:visible').length") > 0 do
#       if (splash:evaljs("$('*[data-pid]').length") ~= n_comments) then
#           splash:runjs("$('.load_more > a').click()")
#       end
#       n_comments = splash:evaljs("$('*[data-pid]').length")
#       assert(splash:wait(0.5))
#       break
#   end
DISCOVER_LUA = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(1))
    local n_comments = -1

    splash:runjs("$('.load_more > a').click()")
    assert(splash:wait(8))
    splash:runjs("$('.load_more > a').click()")
    assert(splash:wait(8))

    return {
        n0 = splash:evaljs("$('.load_more > a:visible').length"),
        n1 = splash:evaljs("$('.load_more > a:visible').length") > 0,
        m = splash:evaljs("$('*[data-pid]').length"),
        html = splash:html(),
    }
end
"""

# Lua script run by Splash for a project page: keeps clicking the
# "older comments" link while it is visible, clicking again only when the
# number of loaded comments has changed since the previous poll.
# `li.comment` is the selector the notebook cell above uses for comments.
PROJECT_LUA = """
function main(splash)
    assert(splash:go(splash.args.url))
    assert(splash:wait(1))
    local n_comments = -1

    while splash:evaljs("$('.older_comments:visible').length") > 0 do
        print(splash:evaljs("$('.older_comments:visible').length"))
        local loaded = splash:evaljs("$('li.comment').length")
        if (loaded ~= n_comments) then
            splash:runjs("$('.older_comments').click()")
        end
        n_comments = loaded
        assert(splash:wait(0.5))
    end
    return {
        html = splash:html(),
    }
end
"""


def _site_path(link):
    """Return `link` without the Kickstarter origin, if it has one.

    Absolute Kickstarter URLs are reduced to their path (leading '/' kept);
    every other link is returned unchanged.
    """
    if link.startswith(BASE_URL + '/'):
        return link[len(BASE_URL):]
    return link


class ExploreSpider(scrapy.Spider):
    """Crawl Kickstarter: front page -> discover pages -> project pages.

    All pages are rendered through Splash.

    Attributes
    ----------
    max_discover_pages : int or None
        Maximum number of discover pages scheduled from the front page;
        ``None`` means no limit. Defaults to 1.
        NOTE(review): the original had a bare ``return`` after the request;
        its indentation was lost, and it is assumed to have stopped after the
        first discover link.
    follow_projects : bool
        When false (the default) `parse_discover` only prints its diagnostics
        and schedules nothing, matching the unconditional early ``return`` of
        the original.
    """

    name = 'explorespider'
    start_urls = ['https://www.kickstarter.com/']

    max_discover_pages = 1
    follow_projects = False

    def start_requests(self):
        """Request every start URL through Splash."""
        for url in self.start_urls:
            yield SplashRequest(url, self.parse_explore)

    def parse_explore(self, response):
        """Schedule a Splash `execute` request for the page's discover links."""
        limit = self.max_discover_pages
        if limit is not None:
            limit = int(limit)

        scheduled = 0
        for link in response.xpath('//a/@href').extract():
            path = _site_path(link)
            if not path.startswith("/discover/"):
                continue
            yield SplashRequest(
                BASE_URL + path,
                self.parse_discover,
                endpoint='execute',
                args={'lua_source': DISCOVER_LUA},
            )
            scheduled += 1
            if limit is not None and scheduled >= limit:
                return

    def parse_discover(self, response):
        """Print the script's diagnostics and optionally follow project links."""
        print('*' * 60)
        # Everything the Lua script returned except the (large) HTML.
        print({k: v for k, v in response.data.items() if k != 'html'})
        print('*' * 60)

        if not self.follow_projects:
            return

        urls = set()
        for link in response.xpath('//a/@href').extract():
            path = _site_path(link)
            if not path.startswith("/projects/"):
                continue
            url = BASE_URL + path
            if url in urls:
                # Already scheduled from this page.
                continue
            urls.add(url)
            yield SplashRequest(
                url,
                self.parse_project,
                # `lua_source` is only honoured by the `execute` endpoint.
                endpoint='execute',
                args={'lua_source': PROJECT_LUA},
            )

        print('*'*20, response.url, len(urls), urls)

    def parse_project(self, response):
        """Print the URL of the rendered project page."""
        print(response.url)


# ---------------------------------------------------------------------------
# settings.py -- Scrapy settings wiring in scrapy-splash
# ---------------------------------------------------------------------------
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Address of the Splash instance the requests are sent to.
SPLASH_URL = 'http://localhost:8050/'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'