class Spieker::Crawler
Public Class Methods
new(url, verbose: false, lang: 'en')
click to toggle source
# File lib/spieker/crawler.rb, line 3

# Sets up a crawler that starts from +url+.
#
# url     - starting address; also used by #current_path and #crawl!.
# verbose - when truthy, #report prints its messages and #crawl! prints
#           a summary at the end.
# lang    - value handed to LinkScraper as its +lang:+ option.
#
# The list of tracked links starts out empty.
def initialize(url, verbose: false, lang: 'en')
  @lang = lang
  @verbose = verbose
  @tracked_links = []
  @url = url
end
Public Instance Methods
crawl!()
click to toggle source
# File lib/spieker/crawler.rb, line 10

# Entry point of a crawl: scrapes the start URL, records it as tracked,
# follows the links it yielded via #recursively_crawl and, in verbose
# mode, prints the summary produced by #print_results.
def crawl!
  report "Starting to crawl on #{@url}"
  root_scraper = LinkScraper.new(@url, lang: @lang)
  # The start URL is tracked before its links are followed.
  track_link(@url)
  recursively_crawl(root_scraper.result)
  return unless @verbose

  print_results
end
current_path()
click to toggle source
# File lib/spieker/crawler.rb, line 21

# Returns the path component of the crawler's start URL, as parsed by
# URI.parse.
def current_path
  parsed = URI.parse(@url)
  parsed.path
end
Private Instance Methods
print_results()
click to toggle source
# File lib/spieker/crawler.rb, line 59

# Reports a summary of the crawl: a banner, the number of distinct
# non-nil tracked links, and every non-nil tracked link on its own line.
# Note that the listing is not de-duplicated, only the count is.
def print_results
  report "\n\n:::RESULTS:::\n\n"
  links = @tracked_links.compact
  page_count = links.uniq.length
  report "Pages found #{page_count}\n\n"
  report "All links found:\n\n #{links.join("\n")}"
end
recursively_crawl(links)
click to toggle source
# File lib/spieker/crawler.rb, line 26

# Scrapes every page in +links+ one after another, marks each as tracked,
# then repeats the process on the links discovered along the way that
# have not been tracked yet. Stops when a round yields no such links.
#
# links - collection of page addresses to scrape in this round.
def recursively_crawl(links)
  # Scrape results keyed by the page they came from.
  scraped = links.each_with_object({}) do |page, found|
    report "Crawling page #{page}"
    page_scraper = LinkScraper.new(page, lang: @lang)
    found[page] = page_scraper.result
    track_link(page)
    report "Finished page #{page}, #{found[page].length} links found"
  end

  candidates = scraped.values.flatten.uniq
  pending = select_untracked_links(candidates)
  report "Recursively crawling #{pending.length} links ..."
  report "NEW LINKS FOUND: \n#{pending.join("\n")}"
  report "TRACKED LINKS: \n#{@tracked_links.join("\n")}"
  return unless pending.any?

  recursively_crawl(pending)
end
report(text)
click to toggle source
# File lib/spieker/crawler.rb, line 53

# Writes +text+ with +puts+, but only when the crawler is in verbose
# mode; otherwise does nothing.
def report(text)
  puts text if @verbose
end
select_untracked_links(links)
click to toggle source
# File lib/spieker/crawler.rb, line 45

# Returns the entries of +links+ that are not yet in the tracked list,
# keeping their original order (duplicates within +links+ are kept).
def select_untracked_links(links)
  links.reject { |candidate| @tracked_links.include?(candidate) }
end
track_link(link)
click to toggle source
# File lib/spieker/crawler.rb, line 49

# Appends +link+ to the list of tracked links and returns that list.
# No de-duplication happens here; the same link can be recorded twice.
def track_link(link)
  # Appending mutates the array in place, so no reassignment is needed.
  @tracked_links << link
end