module Instagram::Ripper

Constants

INSTAGRAM_URI

Instagram site

VERSION

Public Class Methods

authenticate(username, password) click to toggle source

Authenticate against the Instagram service.

@param username [String] the username to authenticate against the Instagram service @param password [String] the password to authenticate against the Instagram service @return session [Capybara::Session]

# File lib/instagram/ripper.rb, line 21
# Logs in to Instagram through the web UI and keeps the resulting session in
# @session so that later calls can reuse it.
#
# @param username [String] the account's username
# @param password [String] the account's password
# @return [Capybara::Session] the session that submitted the login form
def self.authenticate(username, password)
  login_fields = { 'Username' => username, 'Password' => password }

  @session = new_session
  @session.visit(INSTAGRAM_URI)
  @session.click_link('Log in')

  # Fill the form in the same order as the fields are declared above.
  login_fields.each do |label, value|
    @session.fill_in(label, with: value)
  end

  @session.click_button('Log in')
  @session
end
reaper(dump_type = :json, persist = false, profile) click to toggle source

Loads the Instagram profile page of a user. If the profile is private, authenticate first and then try again.

@param dump_type [Symbol] the format in which the page will be dumped (`:json` or `:html`) @param persist [Boolean] whether to save the page in the current directory @param profile [String] the user profile identity @return [JSON/HTML] the page

# File lib/instagram/ripper.rb, line 40
# Loads a user's profile page, scrolls until all of its media thumbnails are
# present in the page, and returns the result.
#
# Reuses the session already stored in @session (for example the one left by
# +authenticate+) when there is one, otherwise opens a new session. The
# session's driver is quit and @session is cleared on every exit path,
# including when an error is raised part-way through.
#
# @param dump_type [Symbol] +:json+ returns the JSON built by
#   +mount_json_content+; any other value returns the parsed HTML document
# @param persist [Boolean] when true, +save_page+ is called on the session
#   before it is closed
# @param profile [String] the profile identity, appended to +INSTAGRAM_URI+
# @return [String, Nokogiri::HTML::Document] the page in the requested form
# @raise [JSON::ParserError] if the embedded profile metadata is not valid JSON
def self.reaper(dump_type = :json, persist = false, profile)
  @session ||= new_session

  @session.visit INSTAGRAM_URI + profile

  page_body = @session.body
  has_media = page_body && !page_body.empty?

  # The first extra batch is requested through a link, when the page offers one.
  @session.click_link 'Load more' if has_media && @session.has_link?('Load more')

  total_media_count = 0
  current_media_count = 0
  private_user = false

  if has_media
    html_content = Nokogiri::HTML(@session.body)

    # The profile metadata is embedded in the page text as
    # `window._sharedData = <json>` followed by one trailing character
    # (presumably the statement's `;`), which is dropped before parsing.
    # A page without that assignment is treated as having no profile data
    # instead of crashing on nil.
    assignment = html_content.content.match('window._sharedData = .*').to_s
    raw_json = assignment.split('window._sharedData = ')[1]
    json_content = raw_json ? JSON.parse(raw_json.chop) : {}

    entry_data = json_content['entry_data'] || {}
    profile_page = entry_data['ProfilePage']

    if profile_page
      media = profile_page[0]['user']['media']

      # a profile whose media has no 'nodes' entry is treated as private
      private_user = media['nodes'].nil?
      total_media_count = media['count'].to_i
      current_media_count = count_loaded_media(html_content)
    end
  end

  # Scroll to the bottom until every thumbnail is loaded. The loop tracks
  # progress and gives up after too many consecutive polls that load nothing
  # new, so a count that can never reach the total cannot hang the caller.
  max_idle_polls = 30
  idle_polls = 0

  until private_user || current_media_count >= total_media_count || idle_polls >= max_idle_polls
    @session.execute_script('window.scrollTo(0,document.body.scrollHeight)')

    # give the page a moment to append the next batch before counting again
    sleep 1

    loaded = count_loaded_media(Nokogiri::HTML(@session.body))
    idle_polls = loaded > current_media_count ? 0 : idle_polls + 1
    current_media_count = loaded
  end

  @session.save_page if persist

  html_content = Nokogiri::HTML(@session.body)

  dump_type == :json ? mount_json_content(html_content) : html_content
ensure
  # Always release the browser, even when something above raised.
  if @session
    @session.driver.quit
    @session = nil
  end
end

Private Class Methods

configuration() click to toggle source

Configures Capybara to use the Poltergeist driver.

# File lib/instagram/ripper.rb, line 124
# Makes Poltergeist the driver Capybara uses and registers the driver factory.
#
# Sets the default max wait time to 10, selects +:poltergeist+ as both the
# default and the JavaScript driver, then registers a factory that builds the
# driver with the options below.
#
# @return the result of +Capybara.register_driver+
def self.configuration
  Capybara.default_max_wait_time = 10
  Capybara.default_driver = :poltergeist
  Capybara.javascript_driver = :poltergeist

  Capybara.register_driver(:poltergeist) do |app|
    # Built inside the block so that every driver instance gets its own hash.
    driver_options = {
      debug: false,
      js_errors: false,
      timeout: 180,
      phantomjs_options: %w[--load-images=no --ignore-ssl-errors=yes --ssl-protocol=any]
    }

    Capybara::Poltergeist::Driver.new(app, driver_options)
  end
end
count_loaded_media(html_content) click to toggle source

Counts loaded media

@param html_content [Nokogiri::XML] the HTML content @return [int] the quantity of images already loaded

# File lib/instagram/ripper.rb, line 142
# Counts the media thumbnails currently present in the document.
#
# A thumbnail is an <img> element whose class attribute is exactly "_icyx7".
#
# @param html_content [#xpath] the parsed HTML document
# @return [Integer] the number of matching <img> elements
def self.count_loaded_media(html_content)
  html_content.xpath('//img').count do |node|
    node.has_attribute?('class') && node.attributes['class'].value == '_icyx7'
  end
end
extract_info(img) click to toggle source

Extract src and alt information about the image

@param img [Nokogiri::XML] the image @return [Hash] {image: {src: 'content', caption: 'content'} }

# File lib/instagram/ripper.rb, line 179
# Builds the image descriptor for one <img> element.
#
# @param img [#has_attribute?, #attributes] the image element
# @return [Hash, nil] {image: {src: ..., caption: ...}} where src is the
#   element's src passed through +treat_url+ and caption is its alt text;
#   nil when the element lacks either attribute
def self.extract_info(img)
  return nil unless img.has_attribute?('src') && img.has_attribute?('alt')

  source = img.attributes['src'].value
  caption = img.attributes['alt'].value

  { image: { src: treat_url(source), caption: caption } }
end
mount_json_content(html_content) click to toggle source

Mount the JSON content through the HTML content:

{images: [

{image:
  {src: 'http://xyz', caption: 'photo xyz'}},
{image:
  {src: 'http://abc', caption: 'photo abc'}}

]}

@param html_content [Nokogiri::XML] the HTML content @return [JSON] the JSON representing the html_content

# File lib/instagram/ripper.rb, line 164
# Serialises the media thumbnails found in the document as JSON:
#
#   {"images": [{"image": {"src": "...", "caption": "..."}}, ...]}
#
# Only <img> elements whose class attribute is exactly "_icyx7" are
# considered; those for which +extract_info+ returns nil are left out.
#
# @param html_content [#xpath] the parsed HTML document
# @return [String] the JSON text; "images" is always an array, possibly empty
def self.mount_json_content(html_content)
  thumbnails = html_content.xpath('//img').select do |img|
    img.has_attribute?('class') && img.attributes['class'].value == '_icyx7'
  end

  # Non-destructive compact: compact! returns nil when there was nothing to
  # remove, which would serialise "images" as null in the normal case.
  images = thumbnails.map { |img| extract_info(img) }.compact

  { images: images }.to_json
end
new_session() click to toggle source

Creates a new Capybara/Poltergeist session

# File lib/instagram/ripper.rb, line 118
# Applies the Capybara/Poltergeist configuration and opens a session on the
# +:poltergeist+ driver.
#
# @return [Capybara::Session] the newly created session
def self.new_session
  configuration

  session = Capybara::Session.new(:poltergeist)
  session
end
treat_url(url) click to toggle source

Treat the image url from Instagram

@param url [String] the url @return [String] url without the squared-cutting context and the cache part

# File lib/instagram/ripper.rb, line 189
# Cleans an Instagram image URL.
#
# Drops everything from the first "?ig_cache" onwards, then collapses the
# path segment that follows "e35/" (when it contains at least three dots and
# ends in a slash, e.g. a crop context) down to plain "e35/".
#
# @param url [String] the image URL
# @return [String] the cleaned URL; an empty string when nothing precedes
#   the cache suffix or when url is empty
def self.treat_url(url)
  # partition always yields a string head, whereas split(...)[0] is nil for
  # "" and for a URL that consists only of the cache suffix.
  without_cache = url.partition('?ig_cache').first

  without_cache.sub(%r{e35/.*\..*\..*\..*/}, 'e35/')
end