
Commit

update
zaironjacobs committed Mar 26, 2021
1 parent 4585a62 commit fdb1890
Showing 20 changed files with 261 additions and 133 deletions.
18 changes: 11 additions & 7 deletions README.md
@@ -23,16 +23,16 @@ $ pip install igscraper --upgrade

Create a new directory and cd into it.

*Use --max to specify the maximum number of posts to scrape*

Scrape a profile:
```console
$ igscraper username1 username2 username3 --max 10
$ igscraper username1 username2 username3 --max 5
```

*With --max you can specify the maximum number of posts to download.*

To scrape stories you have to be logged in first:
```console
$ igscraper username1 username2 username3 --login-username username
$ igscraper username1 username2 username3 --max 5 --stories --login-username username
```

Scrape a tag:
@@ -41,7 +41,7 @@ $ igscraper --recent-tags tag1 tag2 --max 10
```

```console
$ igscraper --top-tags tag1 tag2
$ igscraper --top-tags tag1 tag2 --max 3
```

List all scraped users or tags:
@@ -86,8 +86,10 @@ $ igscraper --remove-tags-n 1 2
*Scraping the same profile again will only download new posts, provided that you are inside the same directory
when you run the program again.*

*Scraping too much will get your IP address temporarily restricted by Instagram; this means that you cannot
view any posts without being logged in.*
*Scraping too much will get your IP address temporarily restricted by Instagram, sometimes instantly. To get around
this, it's best to log in with a DUMMY account to scrape posts.*

*Scraping Instagram with Selenium is significantly slower than other approaches.*

## Options

@@ -104,6 +106,8 @@ view any posts without being logged in.*
--max Maximum number of posts to scrape.
--stories Scrape stories also.
--headful Display the browser UI.
--list-users List all scraped users.
2 changes: 1 addition & 1 deletion instagram_scraper/__init__.py
@@ -1 +1 @@
__version__ = '1.1.17'
__version__ = '1.1.18'
3 changes: 2 additions & 1 deletion instagram_scraper/actions/__init__.py
@@ -15,4 +15,5 @@
from .check_if_account_is_private import CheckIfAccountIsPrivate
from .grab_post_links import GrabPostLinks
from .go_to_link import GoToLink
from .get_id import GetId
from .get_user_id import GetUserId
from .get_vid_src_url import GetVidSrcUrl
1 change: 0 additions & 1 deletion instagram_scraper/actions/count_stories.py
@@ -16,7 +16,6 @@ def do(self):
""" Count amount of stories """

actions.GoToLink(self._scraper, self.__user.stories_link).do()
time.sleep(2)

try:
return len(self._web_driver.find_elements_by_css_selector(constants.STORIES_BAR_CSS))
instagram_scraper/actions/{get_id.py → get_user_id.py}
@@ -1,5 +1,6 @@
import logging
import json
import time

from bs4 import BeautifulSoup
from json.decoder import JSONDecodeError
@@ -10,23 +11,37 @@
logger = logging.getLogger(__name__)


class GetId(actions.Action):
class GetUserId(actions.Action):
def __init__(self, scraper, username):
super().__init__(scraper)
self.__username = username

def do(self):
""" Get the id of a username """

# Open new tab and load the link
link = constants.INSTAGRAM_USER_INFO_URL_DEFAULT.format(self.__username)
actions.GoToLink(self._scraper, link).do()
self._web_driver.execute_script("window.open('" + link + "','_blank');")
first_tab_handle = self._web_driver.current_window_handle

# Switch to the new tab
self._web_driver.switch_to.window(self._web_driver.window_handles[1])
time.sleep(2)

# Get data
result = self._scraper.web_driver.page_source
soup = BeautifulSoup(result, 'html.parser')

# Close the new tab
self._web_driver.close()
self._web_driver.switch_to.window(first_tab_handle)
time.sleep(2)

try:
data = json.loads(soup.text)
return data['graphql']['user']['id']
except (JSONDecodeError, KeyError) as err:
logger.error('could not retrieve user id: %s', str(err))
logger.error('could not retrieve user id: %s' % str(err))

def on_fail(self):
pass
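
For reference, the open-tab / scrape / close-tab pattern that `GetUserId` now uses, as a minimal standalone sketch. It assumes chromedriver is on PATH, and the URL is only a placeholder for `constants.INSTAGRAM_USER_INFO_URL_DEFAULT`, not the real constant:

```python
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()

# Placeholder URL; the scraper's real endpoint lives in
# constants.INSTAGRAM_USER_INFO_URL_DEFAULT (an assumption here)
link = 'https://www.instagram.com/{}/?__a=1'.format('some_username')

# Open the link in a new tab and remember the original handle
driver.execute_script("window.open('" + link + "','_blank');")
first_tab_handle = driver.current_window_handle

# Switch to the new tab and give the small JSON page time to load
driver.switch_to.window(driver.window_handles[1])
time.sleep(2)

# The body is JSON wrapped in HTML; strip the markup and parse it
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = json.loads(soup.text)

# Close the new tab and return to the original one
driver.close()
driver.switch_to.window(first_tab_handle)

print(data['graphql']['user']['id'])
```

Switching back to the first tab matters because later actions assume the original window handle is still active.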
55 changes: 55 additions & 0 deletions instagram_scraper/actions/get_vid_src_url.py
@@ -0,0 +1,55 @@
import logging
import json
import time

from bs4 import BeautifulSoup
from json.decoder import JSONDecodeError

from .. import constants
from .. import actions

logger = logging.getLogger(__name__)


class GetVidSrcUrl(actions.Action):
def __init__(self, scraper, post_id, is_multiple, post_index=0):
super().__init__(scraper)
self.__post_id = post_id
self.__is_multiple = is_multiple
self.__post_index = post_index

def do(self):
""" Get the video source url """

# Open new tab and load the link
link = constants.INSTAGRAM_POST_INFO.format(self.__post_id)
self._web_driver.execute_script("window.open('" + link + "','_blank');")
first_tab_handle = self._web_driver.current_window_handle

# Switch to the new tab
self._web_driver.switch_to.window(self._web_driver.window_handles[1])
time.sleep(2)

# Get data
result = self._scraper.web_driver.page_source
soup = BeautifulSoup(result, 'html.parser')

# Close the new tab
self._web_driver.close()
self._web_driver.switch_to.window(first_tab_handle)
time.sleep(2)

try:
post_info = json.loads(soup.text)
if self.__is_multiple:
vid_url = post_info['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'][
self.__post_index]['node']['video_url']
return vid_url
else:
vid_url = post_info['graphql']['shortcode_media']['video_url']
return vid_url
except (JSONDecodeError, KeyError) as err:
logger.error('Unable to get video source url: %s' % str(err))

def on_fail(self):
pass
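
The JSON traversal above is easier to test in isolation. A hedged sketch of the same key paths as a pure function; `post_info` stands in for the parsed response of the post-info endpoint:

```python
def extract_video_url(post_info, is_multiple, post_index=0):
    """Return the video source URL from a parsed post-info payload."""
    media = post_info['graphql']['shortcode_media']
    if is_multiple:
        # Carousel post: pick the video at post_index from the sidecar edges
        edges = media['edge_sidecar_to_children']['edges']
        return edges[post_index]['node']['video_url']
    # Single-content post: the video URL sits directly on the media node
    return media['video_url']
```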
8 changes: 5 additions & 3 deletions instagram_scraper/actions/go_to_link.py
@@ -1,4 +1,5 @@
import logging
import time

from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
@@ -37,17 +38,18 @@ def do(self):
try:
self._web_driver.get(link)

WebDriverWait(self._web_driver, 10).until(
lambda d: d.execute_script('return document.readyState') == 'complete')
WebDriverWait(
self._web_driver, 10).until(lambda d: d.execute_script('return document.readyState') == 'complete')
time.sleep(3)

# Check for page load failure
try:
self._web_driver.find_element_by_id(constants.CHROME_RELOAD_BUTTON_ID)
self.__page_reload_tries += 1
logger.warning('chrome could not load page')
self.do()
except (NoSuchElementException, StaleElementReferenceException):
pass

try:
self._web_driver.find_element_by_id(constants.SORRY_ID)
self.__page_reload_tries += 1
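
The `WebDriverWait` call above polls `document.readyState` rather than waiting for a specific element. A minimal sketch of that readiness check, assuming chromedriver is installed:

```python
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.instagram.com')

# Block for up to 10 seconds until the browser reports a fully loaded document
WebDriverWait(driver, 10).until(
    lambda d: d.execute_script('return document.readyState') == 'complete')
```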
22 changes: 14 additions & 8 deletions instagram_scraper/actions/init_scrape.py
@@ -22,13 +22,19 @@ def do(self):
Load the post page and check whether to start scraping a post with single content or multiple content.
"""

actions.GoToLink(self._scraper, self.__link).do()

try:
self._web_driver.find_element_by_css_selector(constants.PAGE_USERNAME)
except (NoSuchElementException, StaleElementReferenceException):
logger.warning('page not available at %s', self.__link)
self.on_fail()
# Load the post page, trigger on_fail after 3 unsuccessful tries
post_page_load_success = False
post_page_load_count = 0
while not post_page_load_success and post_page_load_count < 3:
actions.GoToLink(self._scraper, self.__link, force=True).do()
post_page_load_count += 1
try:
self._web_driver.find_element_by_css_selector(constants.PAGE_USERNAME)
post_page_load_success = True
except (NoSuchElementException, StaleElementReferenceException):
if post_page_load_count >= 3:
logger.warning('error loading page: post not found at %s' % self.__link)
self.on_fail()

if actions.PostHasMultipleContent(self._scraper, self.__link).do():
actions.ScrapeMultipleContent(self._scraper, self.__link, self.__output_path).do()
@@ -38,5 +44,5 @@
self.__database.insert_post(self.__link, False, self.__userid)

def on_fail(self):
print('\npage not available at %s', self.__link)
print('\nerror loading post')
self._scraper.stop()
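
The new loop in `InitScrape` is an instance of a bounded-retry pattern. A generic sketch, with `action`, `check`, and `on_fail` as hypothetical stand-ins for the scraper's own callables:

```python
def try_with_retries(action, check, on_fail, max_tries=3):
    """Run action() up to max_tries times until check() passes."""
    for attempt in range(1, max_tries + 1):
        action()
        if check():
            return True
        if attempt >= max_tries:
            on_fail()
    return False


# Toy usage: a flaky step that only succeeds on the third try
attempts = []
try_with_retries(
    action=lambda: attempts.append(1),
    check=lambda: len(attempts) >= 3,
    on_fail=lambda: print('giving up'),
)
```

`InitScrape` inlines this with a limit of 3 tries and `GoToLink(..., force=True)` as the action.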
42 changes: 20 additions & 22 deletions instagram_scraper/actions/login.py
@@ -22,7 +22,6 @@ def do(self):
""" Login """

actions.GoToLink(self._scraper, constants.INSTAGRAM_URL).do()
time.sleep(3)

# Enter username and password and click login
try:
@@ -38,32 +37,31 @@

self._web_driver.find_element_by_css_selector(constants.LOGIN_BUTTON_CSS).click()

time.sleep(4)
time.sleep(5)
except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException) as err:
logger.error(err)
print(err)
self.on_fail()

# Enter security code if asked for
try:
security_input_box = self._web_driver.find_element_by_css_selector(constants.SECURITY_INPUT_BOX_CSS)
security_input_box.click()
except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException):
pass
else:
time.sleep(2)

try:
security_code = getpass.getpass(prompt='enter security code: ')
security_input_box.send_keys(security_code)
self._web_driver.find_element_by_css_selector(constants.SECURITY_BOX_BUTTON).click()
except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException) as err:
logger.info(err)
self.on_fail()
else:
time.sleep(4)

# Click no if asked to save login info
# try:
# security_input_box = self._web_driver.find_element_by_css_selector(constants.SECURITY_BOX_BUTTON)
# security_input_box.click()
# except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException):
# pass
# else:
# time.sleep(2)
#
# try:
# security_code = getpass.getpass(prompt='enter security code: ')
# security_input_box.send_keys(security_code)
# self._web_driver.find_element_by_css_selector(constants.SECURITY_BOX_BUTTON).click()
# except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException) as err:
# logger.info(err)
# self.on_fail()
# else:
# time.sleep(4)

# Click not now if asked to save login info
try:
self._web_driver.find_element_by_css_selector(constants.BUTTON_NO_SAVE_LOGIN_INFO_CSS).click()
except(NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException):
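
Condensed, the login flow above reduces to: submit credentials, wait, then dismiss the save-login prompt if it appears. A sketch under the assumption that chromedriver is available; the CSS selectors are placeholders, not the scraper's real constants:

```python
import getpass
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Placeholder selectors; the scraper keeps the real ones in constants
USERNAME_CSS = 'input[name="username"]'
PASSWORD_CSS = 'input[name="password"]'
LOGIN_BUTTON_CSS = 'button[type="submit"]'
NOT_NOW_CSS = 'button.not-now'  # hypothetical

driver = webdriver.Chrome()
driver.get('https://www.instagram.com')
time.sleep(3)

# Enter username and password and click login
driver.find_element_by_css_selector(USERNAME_CSS).send_keys('my_username')
driver.find_element_by_css_selector(PASSWORD_CSS).send_keys(getpass.getpass())
driver.find_element_by_css_selector(LOGIN_BUTTON_CSS).click()
time.sleep(5)

# Click "Not Now" if asked to save login info
try:
    driver.find_element_by_css_selector(NOT_NOW_CSS).click()
except NoSuchElementException:
    pass
```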
