class CrawlerDB

Public Class Methods

new()
Calls superclass method Crawler::new
# File lib/rfilma/crawlerdb.rb, line 6
# Creates a CrawlerDB instance.
#
# Adds no state of its own: construction is delegated entirely to the
# superclass initializer, which is called without arguments.
def initialize
  super()
end

Public Instance Methods

buscar_por_titulo(titulo,nlim=30)
# File lib/rfilma/crawlerdb.rb, line 14
# Searches stored films whose title contains +titulo+, ignoring case.
#
# The search text is treated as a literal substring: regex metacharacters
# in it (such as "(", "?", "+", "[") are escaped before the pattern is
# built, so titles like "(500) Days" neither raise RegexpError nor match
# unintended records.
#
# @param titulo [#to_s] text to look for inside the +titulo+ field
# @param nlim [Integer] maximum number of records to return (default 30)
# @return [Object] the result of calling +as_json+ on the limited query
def buscar_por_titulo(titulo, nlim = 30)
  # Escape user-supplied text so it cannot inject regex syntax into the query.
  patron = /#{Regexp.escape(titulo.to_s)}/i
  Pelicula.where(titulo: patron).limit(nlim).as_json
end
guardar_pelicula(id)
# File lib/rfilma/crawlerdb.rb, line 18
# Fetches one film through a fresh Crawler and upserts it as a Pelicula.
#
# A new Crawler instance is used for the lookup rather than this object's
# own obtener_pelicula.
#
# @param id [Object] film identifier handed to Crawler#obtener_pelicula
# @return [Object] whatever Pelicula#upsert returns
def guardar_pelicula(id)
  datos = Crawler.new.obtener_pelicula(id)
  Pelicula.new(datos).upsert
end
guardar_peliculas(ids,nthread=5)
# File lib/rfilma/crawlerdb.rb, line 24
# Saves, on a thread pool, every film whose id is in exactly one of two
# lists: the requested +ids+ and the ids reported by Pelicula.find(ids).
#
# NOTE(review): presumably Pelicula.find returns only records that are
# already stored, so the effect is "save the ones not stored yet" - confirm
# against the model/ORM configuration.
#
# @param ids [Array] film identifiers to consider
# @param nthread [Integer] size of the worker pool (default 5)
# @return [Object] whatever the pool's +shutdown+ returns
def guardar_peliculas(ids, nthread = 5)
  pool = Thread.pool(nthread)
  encontrados = Pelicula.find(ids).each.map { |registro| registro["id"] }

  # Symmetric difference: requested-but-not-found first, then
  # found-but-not-requested.
  solo_pedidos = ids - encontrados
  solo_encontrados = encontrados - ids
  pendientes = solo_pedidos + solo_encontrados

  pendientes.each do |pendiente|
    pool.process do
      guardar_pelicula(pendiente)
    end
  end
  pool.shutdown
end
obtener_pelicula(id)
# File lib/rfilma/crawlerdb.rb, line 10
# Looks up stored films by id through the Pelicula model.
#
# @param id [Object] value matched against the +id+ field
# @return [Object] the result of calling +as_json+ on the query
def obtener_pelicula(id)
  coincidencias = Pelicula.where(id: id)
  coincidencias.as_json
end
procesar_paginas(letra)
# File lib/rfilma/crawlerdb.rb, line 36
# Walks every listing page of one FilmAffinity "all films" category and
# collects the film ids found on them.
#
# Pages are fetched with the @a agent starting at page 1; the walk
# continues for as long as the page's pager contains a ">>" link.
#
# @param letra [String] category token interpolated into the listing URL
# @return [Array<Integer>] film ids in first-seen order, without duplicates
def procesar_paginas(letra)
  ids = []
  numero = 1

  # The first page is always fetched: every category has more than one page.
  loop do
    url = "http://www.filmaffinity.com/es/allfilms_#{letra}_#{numero}.html"
    documento = Nokogiri::HTML(@a.get(url).body)
    siguiente = documento.xpath('//div[@class="pager"]/a[contains(text(),">>")]').text

    documento.xpath('//div[@class="movie-card movie-card-1"]').each do |tarjeta|
      ids << tarjeta["data-movie-id"].to_i
    end

    numero += 1
    break unless siguiente.include?(">>")
  end

  # Avoid duplicate ids.
  ids.uniq
end
procesar_todo()
# File lib/rfilma/crawlerdb.rb, line 54
# Crawls and stores every listing category: "A" through "Z", then "*" and
# "0-9".
#
# Each category is handled as one task on a 5-thread pool: its pages are
# scanned with procesar_paginas and the resulting ids are passed to
# guardar_peliculas.
#
# NOTE(review): procesar_paginas reads the shared @a agent from several
# pool threads at once - confirm that agent is safe for concurrent use.
#
# @return [Object] whatever the pool's +shutdown+ returns
def procesar_todo
  categorias = [*('A'..'Z'), "*", "0-9"]
  pool = Thread.pool(5)

  categorias.each do |categoria|
    pool.process do
      guardar_peliculas(procesar_paginas(categoria))
    end
  end

  pool.shutdown
end