class MangaCrawler::Crawler

Public Instance Methods

get_chapters(manga_website) click to toggle source
# File lib/manga-crawler/crawler.rb, line 36
# Collects the chapter entries for a manga.
#
# Chapters are currently gathered with the same logic as the manga index, so
# the call is forwarded to get_mangas unchanged.
#
# manga_website - handed as-is to get_mangas.
#
# Returns whatever get_mangas returns for manga_website.
def get_chapters(manga_website)
  # TODO: give chapters their own extraction logic instead of reusing get_mangas
  get_mangas(manga_website)
end
get_image_from_page(image_website) click to toggle source
# File lib/manga-crawler/crawler.rb, line 83
# Downloads a single reader page and extracts the image link from it.
#
# image_website - object whose params respond to current_url (the page to
#                 fetch), css_path (selector of the image node) and
#                 html_field (attribute read from that node).
#
# Returns the attribute value of the first node matching css_path, or nil when
# fetching or extracting raised a StandardError (the failing URL is printed).
# Exceptions outside StandardError (interrupts, SystemExit, ...) are no longer
# swallowed and propagate to the caller.
def get_image_from_page(image_website)
  params = image_website.params

  # NOTE(review): open(url) relies on open-uri patching Kernel#open, which is
  # no longer available from Ruby 3.0 on -- confirm the target Ruby version or
  # switch to URI.open.
  html_image = Nokogiri::HTML(open(params.current_url))

  html_image.at_css(params.css_path)[params.html_field]
rescue StandardError
  # Best effort: a page that cannot be read yields nil instead of aborting the
  # whole crawl.
  p "Error trying to access: #{params.current_url}"
  nil
end
get_mangas(index_website) click to toggle source
# File lib/manga-crawler/crawler.rb, line 8
# Builds the list of entries found on an index page.
#
# index_website - object whose params respond to current_url (the page to
#                 fetch), css_path (selector of the link nodes) and
#                 html_field (attribute read from every matched node).
#
# Prints a completion message and the elapsed time.
#
# Returns an Array of [node content, attribute value] pairs, one per node
# matched by css_path, in document order.
def get_mangas(index_website)
  started_at = Time.now

  document = Nokogiri::HTML(open(index_website.params.current_url))

  # each node matched by the selector becomes a [content, attribute] pair
  entries = document.css(index_website.params.css_path).map do |node|
    [node.content, node[index_website.params.html_field]]
  end

  # TODO: when a css_pagination is available, recurse into the next page
  # example: result += get_mangas next_link, css_path, css_pagination

  finished_at = Time.now

  puts "\nIndex completed!"
  puts "Elapsed time: #{finished_at-started_at} seconds."

  entries
end
get_pages(chapter_website, css_image_path) click to toggle source
# File lib/manga-crawler/crawler.rb, line 42
# Collects the image link of every page of a chapter.
#
# chapter_website - object handed to get_pages_links_from_chapter; its
#                   params.base_url is prefixed to every page link.
# css_image_path  - selector used to find the image on each page.
#
# Prints a completion message and the elapsed time.
#
# Returns an Array with one get_image_from_page result per page link, in the
# order the links were returned.
def get_pages(chapter_website, css_image_path)
  started_at = Time.now

  images = get_pages_links_from_chapter(chapter_website).map do |link|
    base_url = chapter_website.params.base_url
    # link[1] holds the page address, which is appended to the base url
    page_url = base_url + link[1]

    page_params = Website::Parameters.new(base_url, page_url, css_image_path, :src)

    get_image_from_page(Website::Page.new(page_params))
  end

  finished_at = Time.now

  puts "\nCollect pages completed!"
  puts "Elapsed time: #{finished_at-started_at} seconds."

  images
end