class Spider

Public Class Methods

new() click to toggle source
# File lib/spider.rb, line 14
# Sets up an empty registry of urls that have already been crawled.
def initialize
        @already_visited = {}
end

Public Instance Methods

crawl_domain(url, page_limit = 100) click to toggle source

Recursively crawls pages that belong to the same domain as url, visiting at most page_limit pages.

# File lib/spider.rb, line 52
# Recursively crawls every page reachable from +url+ that lives on the
# same domain, visiting at most +page_limit+ distinct pages.
#
# url        - starting page url (String)
# page_limit - maximum number of pages to visit (Integer, default 100)
#
# Returns nil.
def crawl_domain(url, page_limit = 100)
        # [recursion termination] stop once the page budget is spent
        # (>= is safer than == in case the count ever overshoots)
        return if @already_visited.size >= page_limit

        url_object = open_url(url)
        return if url_object.nil?

        parsed_doc = parse_url(url_object)
        return if parsed_doc.nil?

        @already_visited[url] = true if @already_visited[url].nil?
        page_urls = find_urls_on_page(parsed_doc, url)
        page_urls.each do |page_url|
                # follow only links on the same domain that we have not visited yet
                if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
                        # BUG FIX: propagate page_limit — the original call
                        # crawl_domain(page_url) silently reset the limit to
                        # the default 100 on every recursive descent
                        crawl_domain(page_url, page_limit)
                end
        end

end
crawl_web(urls, depth = 2, page_limit = 100) click to toggle source

Crawls the web breadth-first from the given urls, following links for depth levels and visiting at most page_limit pages.

# File lib/spider.rb, line 19
# Crawls the web breadth-first starting from +urls+, following links
# for +depth+ levels and visiting at most +page_limit+ pages in total.
#
# urls       - Array of starting page urls
# depth      - how many link levels to follow (Integer, default 2)
# page_limit - maximum number of pages to visit (Integer, default 100)
#
# Returns nil.
def crawl_web(urls, depth = 2, page_limit = 100)
        depth.times do # one pass per link level
                next_urls = []

                # visit every url collected for this level
                urls.each do |url|
                        url_object = open_url(url)
                        next if url_object.nil?

                        # follow redirects to the effective url
                        url = update_url_if_redirected(url_object)

                        # parse the fetched page into a DOM tree
                        parsed_doc = parse_url(url_object)
                        next if parsed_doc.nil?

                        # mark this url as visited
                        @already_visited[url] = true if @already_visited[url].nil?
                        # [loop termination] stop once the page budget is spent
                        # (>= is safer than == in case the count overshoots)
                        return if @already_visited.size >= page_limit

                        # queue new urls found on this page, minus those already visited
                        next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
                        # drop duplicates collected within this level
                        next_urls.uniq!
                end
                # descend one level: the urls found here feed the next pass
                urls = next_urls
        end

end

Private Instance Methods

find_urls_on_page(parsed_doc, current_url) click to toggle source
# File lib/spider.rb, line 97
# Collects the href target of every anchor tag in +parsed_doc+,
# dropping fragment identifiers and absolutizing relative links
# against +current_url+.
#
# Returns an Array of url Strings.
def find_urls_on_page(parsed_doc, current_url)
        parsed_doc.search('a[@href]').each_with_object([]) do |anchor, collected|
                target = anchor['href'].split('#')[0]
                # skip anchors whose href is empty or fragment-only
                next unless target
                target = make_absolute(current_url, target) if relative?(target)
                collected.push(target)
        end
end
open_url(url) click to toggle source

Opens the url with exception handling; returns nil (after printing a message) when the url cannot be opened.

# File lib/spider.rb, line 77
# Opens +url+ and returns the resulting IO-like object, or nil when the
# url cannot be opened (the failure is reported on stdout).
#
# SECURITY FIX: Kernel#open spawns a subprocess when its argument
# starts with "|"; URI.open (open-uri) only ever opens files and urls,
# so an attacker-controlled url cannot execute commands.
def open_url(url)
        require 'open-uri' # deliberate lazy-load; provides URI.open
        URI.open(url)
rescue StandardError
        puts "Unable to open url: " + url
end
parse_url(url_object) click to toggle source

Parses the fetched content into a DOM tree; returns nil (after printing a message) when parsing fails.

# File lib/spider.rb, line 88
# Parses the content of +url_object+ into a DOM document via Nokogiri.
#
# Returns the parsed document, or nil (after printing a message) when
# the content cannot be parsed.
def parse_url(url_object)
        parsed = Nokogiri(url_object)
        puts "Crawling url #{url_object.base_uri}"
        parsed
rescue
        puts "Could not parse url: #{url_object.base_uri}"
end
update_url_if_redirected(url_object) click to toggle source
# File lib/spider.rb, line 83
# Returns the effective url (as a String) after any redirects, taken
# from the base_uri of the fetched +url_object+.
def update_url_if_redirected(url_object)
        final_uri = url_object.base_uri
        final_uri.to_s
end