class Crawler
Public Class Methods
new()
click to toggle source
# File lib/rfilma/crawler.rb, line 7 def initialize @a = Mechanize.new{|op| op.user_agent_alias = "Windows Mozilla" } end
Public Instance Methods
buscar_por_titulo(titulo)
click to toggle source
# File lib/rfilma/crawler.rb, line 60 def buscar_por_titulo(titulo) indices_pelis = [] p = @a.get("http://www.filmaffinity.com/es/search.php?stext=#{titulo.strip.gsub(" ","+")}&stype=title").body doc = Nokogiri::HTML(p) doc.xpath('//div[@class="movie-card movie-card-1"]').each{|mc| indices_pelis << mc["data-movie-id"].to_i } indices_pelis.map{|i| obtener_pelicula(i)} end
obtener_pelicula(id)
click to toggle source
# File lib/rfilma/crawler.rb, line 14 def obtener_pelicula(id) data = {} page = @a.get("http://www.filmaffinity.com/es/film#{id}.html").body doc = Nokogiri::HTML(page) data["id"] = id data["titulo"] = doc.xpath("//h1[@id='main-title']/a/span").text data["puntuacion"] = doc.xpath('//div[@id="movie-rat-avg"]').text.strip.gsub(",",".").to_f begin data["portada"] = doc.xpath('//div[@id="movie-main-image-container"]/a')[0]["href"] rescue data["portada"] = doc.xpath('//div[@id="movie-main-image-container"]/img')[0]["src"] end doc.xpath('//dl[@class="movie-info"]/dt').each{|m| dt = m.inner_html case when dt.include?("Título original") data["titulo_original"] = m.next_element.text when dt.include?("Año") data["año"] = m.next_element.text.to_i when dt.include?("Duración") data["duracion"] = m.next_element.text.match('(\d*)')[1].to_i when dt.include?("País") data["pais"] = m.next_element.at('img')['title'] when dt.include?("Director") data["director"] = m.next_element.search('a').map{|e| e.inner_html.strip} when dt.include?("Guión") data["guion"] = m.next_element.text.split(",").map{|e|e.strip} when dt.include?("Música") data["musica"] = m.next_element.text.split(",").map{|e|e.strip} when dt.include?("Fotografía") data["fotografia"] = m.next_element.text.split(",").map{|e|e.strip} when dt.include?("Reparto") data["reparto"] = m.next_element.text.split(",").map{|e|e.strip} when dt.include?("Productora") data["productora"] = m.next_element.text when dt.include?("Género") data["genero"] = m.next_element.search('a').map{|e| e.inner_html} when dt.include?("Web") data["web"] = m.next_element.text when dt.include?("Sinopsis") data["sinopsis"] = m.next_element.text end } data end