class RubyWebCrawler
Attributes
root_url[RW]
time_limit[RW]
url_limit[RW]
urls[RW]
Public Class Methods
new(url, url_limit = 50, time_limit = 60)
  # File lib/ruby-web-crawler.rb, line 8
  def initialize url, url_limit = 50, time_limit = 60
    self.urls = []
    self.root_url = url           # Root URL from where the crawling starts
    self.url_limit = url_limit    # Default URL limit for the crawler
    self.time_limit = time_limit  # Timeout limit in seconds
  end
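A minimal usage sketch; the require path is taken from the file header above, while the URL and limits are placeholder values:

  require 'ruby-web-crawler'

  # Crawl at most 25 URLs, giving up after 30 seconds (both arguments are optional)
  crawler = RubyWebCrawler.new('https://example.com', 25, 30)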
Public Instance Methods
get_page_content(url)
Get the HTML content of the page to be parsed
  # File lib/ruby-web-crawler.rb, line 50
  def get_page_content url
    uri = URI(url)
    request = Net::HTTP::Get.new(uri)
    http = Net::HTTP.new(uri.host, uri.port)

    # Need to enable use of SSL if the URL protocol is HTTPS
    http.use_ssl = (uri.scheme == "https")
    response = http.request(request)

    # Check if URL needs to be forwarded because of a redirect
    case response
    when Net::HTTPSuccess
      return response.body
    when Net::HTTPMovedPermanently, Net::HTTPRedirection
      self.get_page_content response['location']
    end
  end
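A hedged example of calling get_page_content directly; the URL is a placeholder, and the return value can be nil when the response is neither a success nor a redirect:

  require 'ruby-web-crawler'

  crawler = RubyWebCrawler.new('https://example.com')
  html = crawler.get_page_content('https://example.com')

  # html is nil for responses that are neither a success nor a redirect
  puts html unless html.nil?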
get_urls_for_page(url)
Get all URLs on a page
  # File lib/ruby-web-crawler.rb, line 27
  def get_urls_for_page url
    page_content = self.get_page_content url

    # Regex to get all "links" in the page
    urls = page_content.scan(/\<a href\=(\"(http|https)\:.*?\")/)

    urls.each { |u|
      sanitized_url = u.first.gsub(/\"/, '').strip

      unless self.urls.include? sanitized_url
        self.urls.push(sanitized_url)

        # If an unexpected error happens when trying to fetch URLs, move on to the next URL
        begin
          break if self.urls.count >= self.url_limit
          self.get_urls_for_page(sanitized_url)
        rescue Exception => e
          next
        end
      end
    }

    return self.urls
  end
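A short sketch of calling get_urls_for_page on its own; the starting URL and the limit of 10 are placeholder values, and the returned list is the same array kept in urls:

  require 'ruby-web-crawler'

  crawler = RubyWebCrawler.new('https://example.com', 10)
  links = crawler.get_urls_for_page('https://example.com')

  # The same list is also available via crawler.urls
  links.each { |link| puts link }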
start_crawl()
  # File lib/ruby-web-crawler.rb, line 15
  def start_crawl
    begin
      is_running = Timeout::timeout(self.time_limit) {
        self.get_urls_for_page self.root_url
      }
    rescue Exception => e
      # Do nothing, just don't let it error out
    end

    return self.urls
  end
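A sketch of a full crawl; the root URL and limits are placeholder values, and the returned array contains whatever was collected before the URL or time limit was hit:

  require 'ruby-web-crawler'

  crawler = RubyWebCrawler.new('https://example.com', 50, 60)
  visited = crawler.start_crawl

  puts "Crawled #{visited.count} URLs"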