class IML::IMDB

IMDB scraping

Attributes

doc[RW]
result[RW]

Public Class Methods

new(query) click to toggle source

Start an IMDB query. @return [Array<Media::Movie, Media::TVSeries>] Array of Media objects

# File lib/iml/imdb.rb, line 10
# Run an IMDB title search for +query+.
#
# Downloads the IMDB "find" page for the CGI-escaped query, parses the HTML
# with Nokogiri into @doc, resets @result to an empty array and finally
# calls #search (defined elsewhere; presumably it fills @result).
#
# @param query [String] free-text search term; it is URL-escaped here
def initialize(query)
  url = "https://www.imdb.com/find?q=#{CGI.escape(query)}&ref_=nv_sr_fn"
  # URI.open (from open-uri) is used instead of Kernel#open, which stopped
  # opening URLs in Ruby 3.0. The block form closes the response IO as soon
  # as the body has been read.
  html = URI.open(url, 'X-Forwarded-For' => '35.228.112.200', &:read)
  @doc = Nokogiri::HTML(html)
  @result = []
  search
end

Private Instance Methods

fetch_type(elem) click to toggle source
# File lib/iml/imdb.rb, line 32
# Return the third child node of +elem+ as a whitespace-stripped String.
#
# A missing third child yields an empty String, because nil.to_s is "".
#
# @param elem [#children] search-result element
# @return [String]
def fetch_type(elem)
  type_node = elem.children[2]
  type_text = type_node.to_s
  type_text.strip
end
game?(elem) click to toggle source
# File lib/iml/imdb.rb, line 44
# Tell whether the type text of +elem+ mentions "Video Game".
#
# @param elem [Object] element accepted by #fetch_type
# @return [MatchData, false] the match when present, otherwise false
def game?(elem)
  hit = fetch_type(elem).match(/Video Game/)
  hit ? hit : false
end
href(elem) click to toggle source
# File lib/iml/imdb.rb, line 52
# Read the +href+ attribute of the second child node of +elem+.
#
# @param elem [#children] search-result element
# @return [Object] whatever the child's #attr returns for :href
def href(elem)
  link = elem.children[1]
  link.attr(:href)
end
movie?(elem) click to toggle source
# File lib/iml/imdb.rb, line 40
# Tell whether the type text of +elem+ contains a four-digit year in
# parentheses, e.g. "(1994)".
#
# @param elem [Object] element accepted by #fetch_type
# @return [MatchData, false] the match (with a +year+ capture) or false
def movie?(elem)
  year_match = fetch_type(elem).match(/\((?<year>\d{4})\)/)
  return false unless year_match

  year_match
end
parsable_element(elem) click to toggle source
# File lib/iml/imdb.rb, line 20
# Decide whether +elem+ looks like a title result that can be parsed.
#
# The element needs a second child node whose +href+ attribute matches
# /title/. The title lookup in between is kept from the original code, but
# its fallback is a String and therefore always truthy, so it never rejects
# an element.
#
# @param elem [#children, #css] search-result element
# @return [Integer, nil] match offset of "title" in the href, or nil
def parsable_element(elem)
  anchor = elem.children[1]
  return anchor unless anchor

  title_source = elem.css('i').first || anchor.child.to_s
  title_source && anchor.attr(:href) =~ /title/
end
parse_title(elem) click to toggle source
# File lib/iml/imdb.rb, line 28
# Extract the title of a search-result element.
#
# Uses #title_first_choice when it yields a value; otherwise falls back to
# the stringified child of the element's second child node.
#
# @param elem [#children] search-result element
# @return [String]
def parse_title(elem)
  preferred = title_first_choice(elem)
  return preferred if preferred

  elem.children[1].child.to_s
end
processable_elements() click to toggle source
# File lib/iml/imdb.rb, line 48
# Memoized list of result elements worth processing.
#
# Takes every node of @doc matching the CSS selector ".result_text" and
# keeps those accepted by #parsable_element and not flagged by #game?.
# The list is cached in @processable_elements after the first call.
#
# @return [Array] the filtered elements
def processable_elements
  @processable_elements ||= begin
    candidates = @doc.css('.result_text')
    candidates.select do |node|
      next false unless parsable_element(node)

      !game?(node)
    end
  end
end
title_first_choice(elem) click to toggle source
# File lib/iml/imdb.rb, line 24
# Preferred title source: the content of the first <i> node inside +elem+,
# with double-quote characters removed.
#
# The CSS query is evaluated once and the node reused, rather than running
# the same lookup twice.
#
# @param elem [#css] search-result element
# @return [String, nil] the unquoted text, or nil when there is no <i> node
def title_first_choice(elem)
  italic = elem.css('i').first
  italic && italic.child.to_s.delete('"')
end
tv?(elem) click to toggle source
# File lib/iml/imdb.rb, line 36
# Tell whether the type text of +elem+ reads like "(YYYY) (TV Series)".
#
# @param elem [Object] element accepted by #fetch_type
# @return [MatchData, false] the match (with a +year+ capture) or false
def tv?(elem)
  series_match = fetch_type(elem).match(/\((?<year>\d{4})\) \(TV Series\)/)
  if series_match
    series_match
  else
    false
  end
end
year(elem) click to toggle source
# File lib/iml/imdb.rb, line 56
# Extract the four-digit year found in parentheses in the type text of
# +elem+, e.g. "1994" from "(1994)".
#
# Returns nil when the text holds no "(YYYY)" group, rather than raising
# NoMethodError on a failed match.
#
# @param elem [Object] element accepted by #fetch_type
# @return [String, nil] the year digits, or nil when absent
def year(elem)
  # String#[] with a regexp and a capture name yields nil on no match.
  fetch_type(elem)[/\((?<year>\d{4})\)/, 'year']
end