class RubyWebCrawler

Attributes

root_url[RW]
time_limit[RW]
url_limit[RW]
urls[RW]

Public Class Methods

new(url, url_limit = 50, time_limit = 60)

Create a new crawler rooted at url, with optional limits on the number of URLs to collect and the crawl time in seconds

# File lib/ruby-web-crawler.rb, line 8
def initialize(url, url_limit = 50, time_limit = 60)
        self.urls = []
        self.root_url = url              # Root URL where the crawl starts
        self.url_limit = url_limit       # Maximum number of URLs to collect
        self.time_limit = time_limit     # Timeout limit in seconds
end
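
A minimal usage sketch, assuming the gem is loaded with require 'ruby-web-crawler' (matching the file path shown in the # File comments); example.com stands in for any real starting URL:

require 'ruby-web-crawler'

# Crawl from example.com, stopping after 100 URLs or 30 seconds,
# whichever comes first
crawler = RubyWebCrawler.new('http://example.com', 100, 30)
crawler.urls    # => [] until a crawl has been run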

Public Instance Methods

get_page_content(url)

Fetch the HTML content of a page to be parsed, following redirects when necessary

# File lib/ruby-web-crawler.rb, line 50
def get_page_content(url)
        uri = URI(url)
        request = Net::HTTP::Get.new(uri)

        http = Net::HTTP.new(uri.host, uri.port)

        # Need to enable SSL when the URL scheme is HTTPS
        http.use_ssl = (uri.scheme == "https")

        response = http.request(request)

        # Follow the Location header if the response is a redirect
        case response
        when Net::HTTPSuccess
                return response.body
        when Net::HTTPMovedPermanently, Net::HTTPRedirection
                self.get_page_content(response['location'])
        end
end
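
Because the method builds on Net::HTTP and URI from the standard library (assumed to be required by lib/ruby-web-crawler.rb), it can also be called on its own; a sketch, with the URL again purely illustrative:

require 'ruby-web-crawler'

crawler = RubyWebCrawler.new('https://example.com')

# Returns the body for a 2xx response; for a redirect the method calls
# itself again with the response's Location header
html = crawler.get_page_content('http://example.com')
puts html[0, 200] if html
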
get_urls_for_page(url)

Collect all URLs on a page, recursively crawling each newly found URL until url_limit is reached

# File lib/ruby-web-crawler.rb, line 27
def get_urls_for_page(url)
        page_content = self.get_page_content(url)

        # Regex matching absolute http/https links inside double-quoted href attributes
        urls = page_content.scan(/<a href=("(http|https):.*?")/)
        urls.each { |u|
                sanitized_url = u.first.gsub(/"/, '').strip
                unless self.urls.include? sanitized_url
                        self.urls.push(sanitized_url)

                        # If an unexpected error happens while fetching a URL, move on to the next one
                        begin
                                break if self.urls.count >= self.url_limit
                                self.get_urls_for_page(sanitized_url)
                        rescue StandardError
                                next
                        end
                end
        }
        return self.urls
end
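
The scan above only picks up absolute, double-quoted http/https links; a small sketch with hypothetical HTML showing which hrefs match:

html = '<a href="https://example.com/a">A</a> <a href="/relative">B</a>'

# Only the absolute link matches; the relative href is ignored
links = html.scan(/<a href=("(http|https):.*?")/)
links.map { |u| u.first.gsub(/"/, '').strip }
# => ["https://example.com/a"]
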
start_crawl()

Crawl from root_url, collecting URLs until url_limit is reached or time_limit seconds have elapsed

# File lib/ruby-web-crawler.rb, line 15
def start_crawl
        begin
                Timeout.timeout(self.time_limit) {
                        self.get_urls_for_page(self.root_url)
                }
        rescue StandardError
                # Do nothing; return whatever was collected instead of erroring out
        end
        return self.urls
end
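
Putting it together, a sketch of the typical entry point (URL and limits are illustrative); start_crawl relies on the timeout standard library, which the gem file is assumed to require:

require 'ruby-web-crawler'

crawler = RubyWebCrawler.new('https://example.com', 50, 15)

# Returns whatever URLs were collected before the URL limit (50)
# or the time limit (15 seconds) was reached
urls = crawler.start_crawl
puts "Collected #{urls.count} URLs"
puts urls.first(5)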