class InstagramScraper
Public Class Methods
new(brands, options = {})
# File lib/instagram_scraper.rb, line 8
def initialize(brands, options = {})
  @brands = brands
  @min_likes = options[:min_likes] || 500
  @output_file = options[:output_file] || "./Instagram Data (#{brands.sort.join(', ')}).csv"
  @proxies = options[:proxies] || []
  @data = []
end
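A minimal construction sketch, assuming the gem is loaded with require "instagram_scraper"; the brand handles, option values, and proxy address below are hypothetical:

  require "instagram_scraper"

  scraper = InstagramScraper.new(
    %w[nike adidas],
    min_likes: 1_000,                 # defaults to 500
    output_file: "./nike_adidas.csv", # defaults to "./Instagram Data (adidas, nike).csv"
    proxies: ["203.0.113.10:8080"]    # defaults to []; entries are "host:port" strings
  )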
Public Instance Methods
perform()
# File lib/instagram_scraper.rb, line 16
def perform
  scrape_brands
  store_data_in_csv unless @data.empty?
end
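A usage sketch continuing the hypothetical scraper instance above: perform scrapes every configured brand and writes the CSV only when at least one post met the :min_likes threshold:

  scraper.perform
  # Writes "./nike_adidas.csv" if any scraped post had at least 1,000 likes.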
Private Instance Methods
build_query_params(query_hash, brand_id, end_cursor)
# File lib/instagram_scraper.rb, line 63
def build_query_params(query_hash, brand_id, end_cursor)
  {
    query_hash: query_hash,
    variables: {
      id: brand_id,
      first: 50,
      after: end_cursor,
    }.to_json,
  }
end
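For illustration, the hash this helper returns for hypothetical arguments; the GraphQL variables are serialized to JSON so they travel as a single query-string parameter:

  build_query_params("abc123", "42", "XYZ==")
  # => {
  #      query_hash: "abc123",
  #      variables: "{\"id\":\"42\",\"first\":50,\"after\":\"XYZ==\"}",
  #    }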
parse_post_data(post_data)
# File lib/instagram_scraper.rb, line 94
def parse_post_data(post_data) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  publisher = post_data["owner"]["username"]
  likes = post_data["edge_liked_by"]["count"]
  return if likes < @min_likes

  {
    publisher: publisher,
    publisher_url: "#{BASE_URL}/#{publisher}",
    post_url: "#{BASE_URL}/p/#{post_data['shortcode']}",
    likes: likes,
    comments: post_data["edge_media_to_comment"]["count"],
    date: Time.zone.at(post_data["taken_at_timestamp"]).strftime("%d/%m/%Y"),
    caption: post_data["edge_media_to_caption"]["edges"]&.first&.[]("node")&.[]("text")&.gsub(/\n/, " "),
  }
end
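A sketch of the hash returned for a qualifying post node, with hypothetical values and assuming BASE_URL is Instagram's root URL; nil is returned when the like count is below @min_likes:

  # {
  #   publisher: "somebrand",
  #   publisher_url: "https://www.instagram.com/somebrand",
  #   post_url: "https://www.instagram.com/p/AbCdEfG",
  #   likes: 1234,
  #   comments: 56,
  #   date: "31/12/2020",
  #   caption: "Post caption with newlines replaced by spaces",
  # }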
scrape_brand_data(brand)
# File lib/instagram_scraper.rb, line 33
def scrape_brand_data(brand)
  brand_url = "#{BASE_URL}/#{brand}"
  brand_data = JSON.parse(URI.open("#{brand_url}/?__a=1").read)["graphql"]["user"]
  {
    id: brand_data["id"],
    brand: brand_data["full_name"],
    brand_url: brand_url,
  }
end
scrape_brand_posts(brand_data, end_cursor = "")
# File lib/instagram_scraper.rb, line 43
def scrape_brand_posts(brand_data, end_cursor = "")
  query_hash = scrape_query_hash
  while end_cursor
    query_params = build_query_params(query_hash, brand_data[:id], end_cursor)
    posts_data = scrape_posts_data(query_params)
    end_cursor = posts_data["page_info"]["end_cursor"]
    posts_data["edges"].each do |post_data|
      post = parse_post_data(post_data["node"])
      @data << brand_data.slice(:brand, :brand_url).merge(post) if post
    end
    puts("Scraped #{@data.count} posts") unless @data.empty?
  end
end
scrape_brands()
# File lib/instagram_scraper.rb, line 23
def scrape_brands
  @brands.each do |brand|
    brand_data = scrape_brand_data(brand)
  rescue OpenURI::HTTPError
    next
  else
    scrape_brand_posts(brand_data)
  end
end
scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
# File lib/instagram_scraper.rb, line 74
def scrape_posts_data(query_params, posts_data = [], proxy_index = 0)
  agent = Mechanize.new
  url = "#{BASE_URL}/graphql/query/?#{URI.encode_www_form(query_params)}"
  while posts_data.empty?
    proxy = @proxies[proxy_index]
    unless proxy
      puts("No more proxies available")
      break # bail out instead of calling split on nil once the proxy list is exhausted
    end
    ip, port = proxy.split(":")
    agent.set_proxy(ip, port.to_i)
    begin
      posts_data = JSON.parse(agent.get(url).body)["data"]["user"]["edge_user_to_photos_of_you"]
    rescue StandardError
      proxy_index += 1
    end
  end
  posts_data
end
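Each :proxies entry is expected to be a "host:port" string. A standalone sketch of how one entry is applied to the Mechanize agent (the address is hypothetical):

  require "mechanize"

  agent = Mechanize.new
  ip, port = "203.0.113.10:8080".split(":")
  agent.set_proxy(ip, port.to_i) # Mechanize takes the port as an Integer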
scrape_query_hash()
# File lib/instagram_scraper.rb, line 57
def scrape_query_hash
  # TODO: scrape bundle name
  bundle_url = "#{BASE_URL}/static/bundles/es6/ProfilePageContainer.js/b10d8b1b32fc.js"
  URI.open(bundle_url).read.match(QUERY_ID_PATTERN)[1]
end
store_data_in_csv()
# File lib/instagram_scraper.rb, line 110
def store_data_in_csv
  headers = @data.first.keys.map { |key| key.to_s.tr("_", " ").capitalize }
  CSV.open(@output_file, "wb", write_headers: true, headers: headers) do |csv|
    @data.each { |post| csv << post.values }
  end
end
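The header row is derived from the keys of the first scraped post, with underscores becoming spaces and only the first letter capitalized; for example:

  :publisher_url.to_s.tr("_", " ").capitalize
  # => "Publisher url"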