class InstagramCrawler::Parser::Html

Attributes

html[R]

Public Class Methods

new(url) click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 6
# Builds a parser for a single page: fetches +url+ through get_html and
# keeps the response body in @html (exposed via the +html+ reader).
#
# url - address of the page to download.
#
# Raises Errors::HttpError (from get_html) when the response code is not 200.
def initialize(url)
  @html = get_html(url)
end

Public Instance Methods

parsing() click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 10
# Parses a profile page: pulls the JSON assigned to window._sharedData out of
# the page's script tag, hands the timeline edges to loop_edges, and returns
# the data the caller needs afterwards.
#
# Returns a two-element Array: [page_info, user_id], where page_info is the
# 'page_info' entry of edge_owner_to_timeline_media and user_id is
# logging_page_id with its "profilePage_" prefix removed.
#
# Raises JSON::ParserError when no JSON object follows the assignment.
def parsing
  doc     = Nokogiri::HTML(html)
  js_data = doc.at_xpath("//script[contains(text(),'window._sharedData')]")

  # Capture the object literal itself instead of slicing at fixed offsets, so
  # a trailing newline, a missing ';' or different spacing around '=' does not
  # corrupt the payload. to_s turns "no match" into a JSON::ParserError.
  payload  = js_data.text[/window\._sharedData\s*=\s*(\{.*\})/m, 1].to_s
  json     = JSON.parse(payload)
  profile  = json["entry_data"]["ProfilePage"][0]
  timeline = profile["graphql"]["user"]["edge_owner_to_timeline_media"]

  # String#delete would strip every character of the set "profilePage_";
  # an anchored sub removes only the literal prefix.
  user_id = profile["logging_page_id"].sub(/\AprofilePage_/, "")

  loop_edges(timeline["edges"])

  [timeline["page_info"], user_id]
end
parsing_photo_page() click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 30
# Parses a single post page and returns its image data, taken from the JSON
# assigned to window._sharedData in the page's script tag.
#
# Returns the "edges" Array of edge_sidecar_to_children when the post has
# that entry, otherwise the post's "display_url" value.
#
# Raises JSON::ParserError when no JSON object follows the assignment.
def parsing_photo_page
  doc     = Nokogiri::HTML(html)
  js_data = doc.at_xpath("//script[contains(text(),'window._sharedData')]")

  # Capture the object literal itself instead of slicing at fixed offsets, so
  # a trailing newline, a missing ';' or different spacing around '=' does not
  # corrupt the payload. to_s turns "no match" into a JSON::ParserError.
  payload = js_data.text[/window\._sharedData\s*=\s*(\{.*\})/m, 1].to_s
  json    = JSON.parse(payload)
  shortcode_media = json["entry_data"]["PostPage"][0]["graphql"]["shortcode_media"]

  sidecar = shortcode_media["edge_sidecar_to_children"]
  sidecar ? sidecar["edges"] : shortcode_media["display_url"]
end
parsing_video_page() click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 24
# Parses a video post page and returns the value of the +content+ attribute
# of its <meta property="og:video"> tag.
#
# The attribute is read by name rather than by position, so the result does
# not depend on the order in which the tag's attributes are written.
#
# Returns the attribute value, or nil when the tag has no +content+ attribute.
def parsing_video_page
  doc    = Nokogiri::HTML(html)
  meta_v = doc.at_xpath("//meta[@property='og:video']")
  meta_v["content"]
end

Private Instance Methods

get_html(url) click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 73
# Downloads +url+ and returns the response body as a String.
#
# The request goes through HTTP.via(Config.proxyname, Config.port) when
# Config.proxyname is set, and straight through HTTP otherwise.
#
# Raises Errors::HttpError ("<code> <reason>") for any response code other
# than 200.
def get_html(url)
  client =
    if Config.proxyname
      HTTP.via(Config.proxyname, Config.port)
    else
      HTTP
    end

  response = client.get(url)
  unless response.code == 200
    raise Errors::HttpError, "#{response.code} #{response.reason}"
  end

  response.to_s
end
loop_edges(edges) click to toggle source
# File lib/instagram_crawler/parser/html.rb, line 45
# Walks the timeline +edges+ and downloads the media behind each one.
#
# For every edge's "node":
# * it is skipped when Config.before_date is set and Config.parse_before_date
#   is earlier than the node's "taken_at_timestamp";
# * otherwise check_after_time is called with the timestamp, the post page
#   is fetched through a new Html instance, and the result is handled as a
#   video, a multi-item post (Array result) or a single photo.
def loop_edges(edges)
  edges.each do |edge|
    node      = edge["node"]
    timestamp = node["taken_at_timestamp"]
    next if Config.before_date && (Config.parse_before_date < timestamp)

    check_after_time(timestamp)
    time     = parse_to_date(timestamp)
    page_url = "https://www.instagram.com/p/#{node["shortcode"]}/"

    if node["is_video"]
      Logger.info "========VIDEO========".light_yellow
      video_url = Html.new(page_url).parsing_video_page
      output(time, video_url)
      File.download(video_url, 'video', time)
      next
    end

    # Photo pages yield either an Array of child edges or one display URL.
    media = Html.new(page_url).parsing_photo_page
    if media.is_a?(Array)
      Logger.info "========POST========".light_magenta
      parse_post(media, time)
    else
      Logger.info "========PHOTO========".light_green
      output(time, media)
      File.download(media, 'photo', time)
    end
  end
end