class CrawlerDB
Public Class Methods
new()
click to toggle source
Calls superclass method
Crawler::new
# File lib/rfilma/crawlerdb.rb, line 6
# Sets up the instance by delegating entirely to the superclass initializer;
# no additional state is created here.
def initialize
  super
end
Public Instance Methods
buscar_por_titulo(titulo,nlim=30)
click to toggle source
# File lib/rfilma/crawlerdb.rb, line 14
# Searches Pelicula records whose +titulo+ contains the given text,
# ignoring case.
#
# The text is escaped before being embedded in the regular expression, so it
# is always matched literally. Without escaping, input such as "[REC" raised
# RegexpError and "(500) Days" was interpreted as a capture group instead of
# literal parentheses.
#
# @param titulo [#to_s] text to look for inside the title
# @param nlim [Integer] maximum number of results (defaults to 30)
# @return the +as_json+ form of the limited Pelicula query
def buscar_por_titulo(titulo, nlim = 30)
  # to_s keeps non-String arguments working, as plain interpolation did.
  patron = /#{Regexp.escape(titulo.to_s)}/i
  Pelicula.where(titulo: patron).limit(nlim).as_json
end
guardar_pelicula(id)
click to toggle source
# File lib/rfilma/crawlerdb.rb, line 18
# Obtains the film with the given id through a freshly created Crawler and
# upserts it as a Pelicula record.
#
# @param id identifier of the film to fetch and store
# @return the result of the +upsert+ call on the new Pelicula
def guardar_pelicula(id)
  datos = Crawler.new.obtener_pelicula(id)
  Pelicula.new(datos).upsert
end
guardar_peliculas(ids,nthread=5)
click to toggle source
# File lib/rfilma/crawlerdb.rb, line 24
# Stores, using a thread pool, the films from +ids+ that differ from what
# Pelicula.find(ids) already returns.
#
# The pending ids are computed BEFORE the pool is created. Previously the
# pool was created first, so an exception raised by the lookup left its
# worker threads running because +shutdown+ was never reached.
#
# @param ids [Array] film identifiers to store
# @param nthread [Integer] number of worker threads in the pool (default 5)
# @return the result of +pool.shutdown+
def guardar_peliculas(ids, nthread = 5)
  existentes = Pelicula.find(ids).each.map { |doc| doc["id"] }
  # NOTE(review): the second term is presumably always empty, since the
  # lookup is restricted to +ids+; kept to preserve the original selection.
  pendientes = (ids - existentes) + (existentes - ids)

  pool = Thread.pool(nthread)
  pendientes.each do |id|
    pool.process { guardar_pelicula(id) }
  end
  pool.shutdown
end
obtener_pelicula(id)
click to toggle source
# File lib/rfilma/crawlerdb.rb, line 10
# Looks up Pelicula records by id.
#
# @param id identifier to match against the +id+ field
# @return the +as_json+ form of the Pelicula query for that id
def obtener_pelicula(id)
  coincidencias = Pelicula.where(id: id)
  coincidencias.as_json
end
procesar_paginas(letra)
click to toggle source
# File lib/rfilma/crawlerdb.rb, line 36
# Walks the paginated "all films" listing for one category and collects the
# movie ids found on each page.
#
# @param letra category token interpolated into the listing URL
# @return [Array<Integer>] the collected movie ids without duplicates, in
#   order of first appearance
def procesar_paginas(letra)
  ids = []
  pagina = 1

  # The first page is always fetched; after that we continue only while the
  # pager still shows a ">>" link.
  loop do
    html = @a.get("http://www.filmaffinity.com/es/allfilms_#{letra}_#{pagina}.html").body
    doc = Nokogiri::HTML(html)
    siguiente = doc.xpath('//div[@class="pager"]/a[contains(text(),">>")]').text

    doc.xpath('//div[@class="movie-card movie-card-1"]').each do |tarjeta|
      ids << tarjeta["data-movie-id"].to_i
    end

    pagina += 1
    break unless siguiente.include?(">>")
  end

  # Drop duplicate ids.
  ids.uniq
end
procesar_todo()
click to toggle source
# File lib/rfilma/crawlerdb.rb, line 54
# Processes every listing category ('A'..'Z', "*" and "0-9"): for each one,
# a pool task collects its movie ids and then stores those films.
#
# @return the result of +pool.shutdown+
def procesar_todo
  categorias = [*'A'..'Z', "*", "0-9"]
  pool = Thread.pool(5)

  categorias.each do |categoria|
    pool.process do
      guardar_peliculas(procesar_paginas(categoria))
    end
  end

  pool.shutdown
end