
Commit

update
zaironjacobs committed Mar 26, 2021
1 parent 4585a62 commit fdb1890
Showing 20 changed files with 261 additions and 133 deletions.
18 changes: 11 additions & 7 deletions README.md
@@ -23,16 +23,16 @@ $ pip install igscraper --upgrade

Create a new directory and cd into it.

*Use --max to specify the maximum number of posts to scrape*

Scrape a profile:
```console
$ igscraper username1 username2 username3 --max 10
$ igscraper username1 username2 username3 --max 5
```

*With --max you can specify the maximum number of posts to download.*

To scrape stories you have to be logged in first:
```console
$ igscraper username1 username2 username3 --login-username username
$ igscraper username1 username2 username3 --max 5 --stories --login-username username
```

Scrape a tag:
@@ -41,7 +41,7 @@ $ igscraper --recent-tags tag1 tag2 --max 10
```

```console
$ igscraper --top-tags tag1 tag2
$ igscraper --top-tags tag1 tag2 --max 3
```

List all scraped users or tags:
@@ -86,8 +86,10 @@ $ igscraper --remove-tags-n 1 2
*Scraping the same profile again will only download new posts, provided that you are inside the same directory
when you run the program again.*

*Scraping too much will get your IP address temporarily restricted by Instagram; this means that you cannot
view any posts without being logged in.*
*Scraping too much will get your IP address temporarily restricted by Instagram, sometimes instantly. To get around
this, it's best to log in with a DUMMY account to scrape posts.*

*Scraping Instagram with Selenium is significantly slower than other approaches.*

## Options

@@ -104,6 +106,8 @@ view any posts without being logged in.*
--max Maximum number of posts to scrape.
--stories Scrape stories also.
--headful Display the browser UI.
--list-users List all scraped users.
2 changes: 1 addition & 1 deletion instagram_scraper/__init__.py
@@ -1 +1 @@
__version__ = '1.1.17'
__version__ = '1.1.18'
3 changes: 2 additions & 1 deletion instagram_scraper/actions/__init__.py
@@ -15,4 +15,5 @@
from .check_if_account_is_private import CheckIfAccountIsPrivate
from .grab_post_links import GrabPostLinks
from .go_to_link import GoToLink
from .get_id import GetId
from .get_user_id import GetUserId
from .get_vid_src_url import GetVidSrcUrl
1 change: 0 additions & 1 deletion instagram_scraper/actions/count_stories.py
@@ -16,7 +16,6 @@ def do(self):
""" Count amount of stories """

actions.GoToLink(self._scraper, self.__user.stories_link).do()
time.sleep(2)

try:
return len(self._web_driver.find_elements_by_css_selector(constants.STORIES_BAR_CSS))
instagram_scraper/actions/{get_id.py → get_user_id.py}
@@ -1,5 +1,6 @@
import logging
import json
import time

from bs4 import BeautifulSoup
from json.decoder import JSONDecodeError
@@ -10,23 +11,37 @@
logger = logging.getLogger(__name__)


class GetId(actions.Action):
class GetUserId(actions.Action):
def __init__(self, scraper, username):
super().__init__(scraper)
self.__username = username

def do(self):
""" Get the id of a username """

# Open new tab and load the link
link = constants.INSTAGRAM_USER_INFO_URL_DEFAULT.format(self.__username)
actions.GoToLink(self._scraper, link).do()
self._web_driver.execute_script("window.open('" + link + "','_blank');")
first_tab_handle = self._web_driver.current_window_handle

# Switch to the new tab
self._web_driver.switch_to.window(self._web_driver.window_handles[1])
time.sleep(2)

# Get data
result = self._scraper.web_driver.page_source
soup = BeautifulSoup(result, 'html.parser')

# Close the new tab
self._web_driver.close()
self._web_driver.switch_to.window(first_tab_handle)
time.sleep(2)

try:
data = json.loads(soup.text)
return data['graphql']['user']['id']
except (JSONDecodeError, KeyError) as err:
logger.error('could not retrieve user id: %s', str(err))
logger.error('could not retrieve user id: %s' % str(err))

def on_fail(self):
pass
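
For reference, the open-tab / scrape / close-tab pattern that `GetUserId` now uses, as a minimal standalone sketch. It assumes chromedriver is on PATH, and the URL is only a placeholder for `constants.INSTAGRAM_USER_INFO_URL_DEFAULT`, not the real constant:

```python
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()

# Placeholder URL; the scraper's real endpoint lives in
# constants.INSTAGRAM_USER_INFO_URL_DEFAULT (an assumption here)
link = 'https://www.instagram.com/{}/?__a=1'.format('some_username')

# Open the link in a new tab and remember the original handle
driver.execute_script("window.open('" + link + "','_blank');")
first_tab_handle = driver.current_window_handle

# Switch to the new tab and give the small JSON page time to load
driver.switch_to.window(driver.window_handles[1])
time.sleep(2)

# The body is JSON wrapped in HTML; strip the markup and parse it
soup = BeautifulSoup(driver.page_source, 'html.parser')
data = json.loads(soup.text)

# Close the new tab and return to the original one
driver.close()
driver.switch_to.window(first_tab_handle)

print(data['graphql']['user']['id'])
```

Switching back to the first tab matters because later actions assume the original window handle is still active.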
55 changes: 55 additions & 0 deletions instagram_scraper/actions/get_vid_src_url.py
@@ -0,0 +1,55 @@
import logging
import json
import time

from bs4 import BeautifulSoup
from json.decoder import JSONDecodeError

from .. import constants
from .. import actions

logger = logging.getLogger(__name__)


class GetVidSrcUrl(actions.Action):
def __init__(self, scraper, post_id, is_multiple, post_index=0):
super().__init__(scraper)
self.__post_id = post_id
self.__is_multiple = is_multiple
self.__post_index = post_index

def do(self):
""" Get the video source url """

# Open new tab and load the link
link = constants.INSTAGRAM_POST_INFO.format(self.__post_id)
self._web_driver.execute_script("window.open('" + link + "','_blank');")
first_tab_handle = self._web_driver.current_window_handle

# Switch to the new tab
self._web_driver.switch_to.window(self._web_driver.window_handles[1])
time.sleep(2)

# Get data
result = self._scraper.web_driver.page_source
soup = BeautifulSoup(result, 'html.parser')

# Close the new tab
self._web_driver.close()
self._web_driver.switch_to.window(first_tab_handle)
time.sleep(2)

try:
post_info = json.loads(soup.text)
if self.__is_multiple:
vid_url = post_info['graphql']['shortcode_media']['edge_sidecar_to_children']['edges'][
self.__post_index]['node']['video_url']
return vid_url
else:
vid_url = post_info['graphql']['shortcode_media']['video_url']
return vid_url
except (JSONDecodeError, KeyError) as err:
logger.error('Unable to get video source url: %s' % str(err))

def on_fail(self):
pass
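
The JSON traversal above is easier to test in isolation. A hedged sketch of the same key paths as a pure function; `post_info` stands in for the parsed response of the post-info endpoint:

```python
def extract_video_url(post_info, is_multiple, post_index=0):
    """Return the video source URL from a parsed post-info payload."""
    media = post_info['graphql']['shortcode_media']
    if is_multiple:
        # Carousel post: pick the video at post_index from the sidecar edges
        edges = media['edge_sidecar_to_children']['edges']
        return edges[post_index]['node']['video_url']
    # Single-content post: the video URL sits directly on the media node
    return media['video_url']
```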
8 changes: 5 additions & 3 deletions instagram_scraper/actions/go_to_link.py
@@ -1,4 +1,5 @@
import logging
import time

from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
@@ -37,17 +38,18 @@ def do(self):
try:
self._web_driver.get(link)

WebDriverWait(self._web_driver, 10).until(
lambda d: d.execute_script('return document.readyState') == 'complete')
WebDriverWait(
self._web_driver, 10).until(lambda d: d.execute_script('return document.readyState') == 'complete')
time.sleep(3)

# Check for page load failure
try:
self._web_driver.find_element_by_id(constants.CHROME_RELOAD_BUTTON_ID)
self.__page_reload_tries += 1
logger.warning('chrome could not load page')
self.do()
except (NoSuchElementException, StaleElementReferenceException):
pass

try:
self._web_driver.find_element_by_id(constants.SORRY_ID)
self.__page_reload_tries += 1
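
The `WebDriverWait` call above polls `document.readyState` rather than waiting for a specific element. A minimal sketch of that readiness check, assuming chromedriver is installed:

```python
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.instagram.com')

# Block for up to 10 seconds until the browser reports a fully loaded document
WebDriverWait(driver, 10).until(
    lambda d: d.execute_script('return document.readyState') == 'complete')
```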
22 changes: 14 additions & 8 deletions instagram_scraper/actions/init_scrape.py
@@ -22,13 +22,19 @@ def do(self):
Load the post page and check whether to start scraping a post with single content or multiple content.
"""

actions.GoToLink(self._scraper, self.__link).do()

try:
self._web_driver.find_element_by_css_selector(constants.PAGE_USERNAME)
except (NoSuchElementException, StaleElementReferenceException):
logger.warning('page not available at %s', self.__link)
self.on_fail()
# Load the post page, trigger on_fail after 3 unsuccessful tries
post_page_load_success = False
post_page_load_count = 0
while not post_page_load_success and post_page_load_count < 3:
actions.GoToLink(self._scraper, self.__link, force=True).do()
post_page_load_count += 1
try:
self._web_driver.find_element_by_css_selector(constants.PAGE_USERNAME)
post_page_load_success = True
except (NoSuchElementException, StaleElementReferenceException):
if post_page_load_count >= 3:
logger.warning('error loading page: post not found at %s' % self.__link)
self.on_fail()

if actions.PostHasMultipleContent(self._scraper, self.__link).do():
actions.ScrapeMultipleContent(self._scraper, self.__link, self.__output_path).do()
@@ -38,5 +44,5 @@
self.__database.insert_post(self.__link, False, self.__userid)

def on_fail(self):
print('\npage not available at %s', self.__link)
print('\nerror loading post')
self._scraper.stop()
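
The new loop in `InitScrape` is an instance of a bounded-retry pattern. A generic sketch, with `action`, `check`, and `on_fail` as hypothetical stand-ins for the scraper's own callables:

```python
def try_with_retries(action, check, on_fail, max_tries=3):
    """Run action() up to max_tries times until check() passes."""
    for attempt in range(1, max_tries + 1):
        action()
        if check():
            return True
        if attempt >= max_tries:
            on_fail()
    return False


# Toy usage: a flaky step that only succeeds on the third try
attempts = []
try_with_retries(
    action=lambda: attempts.append(1),
    check=lambda: len(attempts) >= 3,
    on_fail=lambda: print('giving up'),
)
```

`InitScrape` inlines this with a limit of 3 tries and `GoToLink(..., force=True)` as the action.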
42 changes: 20 additions & 22 deletions instagram_scraper/actions/login.py
@@ -22,7 +22,6 @@ def do(self):
""" Login """

actions.GoToLink(self._scraper, constants.INSTAGRAM_URL).do()
time.sleep(3)

# Enter username and password and click login
try:
@@ -38,32 +37,31 @@

self._web_driver.find_element_by_css_selector(constants.LOGIN_BUTTON_CSS).click()

time.sleep(4)
time.sleep(5)
except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException) as err:
logger.error(err)
print(err)
self.on_fail()

# Enter security code if asked for
try:
security_input_box = self._web_driver.find_element_by_css_selector(constants.SECURITY_INPUT_BOX_CSS)
security_input_box.click()
except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException):
pass
else:
time.sleep(2)

try:
security_code = getpass.getpass(prompt='enter security code: ')
security_input_box.send_keys(security_code)
self._web_driver.find_element_by_css_selector(constants.SECURITY_BOX_BUTTON).click()
except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException) as err:
logger.info(err)
self.on_fail()
else:
time.sleep(4)

# Click no if asked to save login info
# try:
# security_input_box = self._web_driver.find_element_by_css_selector(constants.SECURITY_BOX_BUTTON)
# security_input_box.click()
# except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException):
# pass
# else:
# time.sleep(2)
#
# try:
# security_code = getpass.getpass(prompt='enter security code: ')
# security_input_box.send_keys(security_code)
# self._web_driver.find_element_by_css_selector(constants.SECURITY_BOX_BUTTON).click()
# except (NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException) as err:
# logger.info(err)
# self.on_fail()
# else:
# time.sleep(4)

# Click not now if asked to save login info
try:
self._web_driver.find_element_by_css_selector(constants.BUTTON_NO_SAVE_LOGIN_INFO_CSS).click()
except(NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException):
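
Condensed, the login flow above reduces to: submit credentials, wait, then dismiss the save-login prompt if it appears. A sketch under the assumption that chromedriver is available; the CSS selectors are placeholders, not the scraper's real constants:

```python
import getpass
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

# Placeholder selectors; the scraper keeps the real ones in constants
USERNAME_CSS = 'input[name="username"]'
PASSWORD_CSS = 'input[name="password"]'
LOGIN_BUTTON_CSS = 'button[type="submit"]'
NOT_NOW_CSS = 'button.not-now'  # hypothetical

driver = webdriver.Chrome()
driver.get('https://www.instagram.com')
time.sleep(3)

# Enter username and password and click login
driver.find_element_by_css_selector(USERNAME_CSS).send_keys('my_username')
driver.find_element_by_css_selector(PASSWORD_CSS).send_keys(getpass.getpass())
driver.find_element_by_css_selector(LOGIN_BUTTON_CSS).click()
time.sleep(5)

# Click "Not Now" if asked to save login info
try:
    driver.find_element_by_css_selector(NOT_NOW_CSS).click()
except NoSuchElementException:
    pass
```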
