class Spider
Public Class Methods
new()
# File lib/spider.rb, line 14
def initialize
  # Tracks URLs that have already been crawled
  @already_visited = {}
end
Public Instance Methods
crawl_domain(url, page_limit = 100)
Recursively crawls pages within the same domain as url, visiting at most page_limit pages.
# File lib/spider.rb, line 52
def crawl_domain(url, page_limit = 100)
  # Termination condition for the recursion: stop once page_limit pages have been visited
  return if @already_visited.size == page_limit

  url_object = open_url(url)
  return if url_object.nil?

  parsed_doc = parse_url(url_object)
  return if parsed_doc.nil?

  @already_visited[url] = true if @already_visited[url].nil?
  page_urls = find_urls_on_page(parsed_doc, url)
  page_urls.each do |page_url|
    # Follow only URLs that are on the same domain and have not been visited yet
    if urls_on_same_domain?(url, page_url) && @already_visited[page_url].nil?
      # Recurse into the linked page, keeping the same page limit
      crawl_domain(page_url, page_limit)
    end
  end
end
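crawl_domain relies on a private urls_on_same_domain? helper that is not listed on this page. A minimal sketch, assuming it simply compares the hosts of the two URLs, might look like this (hypothetical, not the library's actual implementation):

require 'uri'

# Hypothetical sketch of the helper used above.
# Assumes two URLs belong to the same domain when their hosts match.
def urls_on_same_domain?(current_url, linked_url)
  URI.parse(current_url).host == URI.parse(linked_url).host
rescue URI::InvalidURIError
  false
end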
crawl_web(urls, depth = 2, page_limit = 100)
Crawls the web starting from urls, following links level by level up to depth levels and visiting at most page_limit pages.
# File lib/spider.rb, line 19
def crawl_web(urls, depth = 2, page_limit = 100)
  depth.times do
    # URLs discovered at this level, to be crawled on the next pass
    next_urls = []
    urls.each do |url|
      url_object = open_url(url)
      next if url_object.nil?

      # Use the final URL in case the request was redirected
      url = update_url_if_redirected(url_object)

      parsed_doc = parse_url(url_object)
      next if parsed_doc.nil?

      # Mark this URL as visited
      @already_visited[url] = true if @already_visited[url].nil?

      # Termination condition for the loop: stop once page_limit pages have been visited
      return if @already_visited.size == page_limit

      # Collect new URLs found on this page, excluding those already visited
      next_urls += (find_urls_on_page(parsed_doc, url) - @already_visited.keys)
      # Drop duplicates
      next_urls.uniq!
    end
    urls = next_urls
  end
end
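A minimal usage sketch of the two public crawl methods; the seed URL and limits are illustrative only:

require_relative 'lib/spider'

spider = Spider.new

# Breadth-first crawl: follow links two levels deep, visiting at most 30 pages
spider.crawl_web(['https://example.com'], 2, 30)

# Or stay within a single domain, visiting at most 50 pages
spider.crawl_domain('https://example.com', 50)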
Private Instance Methods
find_urls_on_page(parsed_doc, current_url)
# File lib/spider.rb, line 97
def find_urls_on_page(parsed_doc, current_url)
  parsed_doc.search('a[@href]').each_with_object([]) do |x, urls_list|
    # Drop any fragment (#...) from the link
    new_url = x['href'].split('#')[0]
    if new_url
      # Convert relative links into absolute URLs
      new_url = make_absolute(current_url, new_url) if relative?(new_url)
      urls_list.push(new_url)
    end
  end
end
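The relative? and make_absolute helpers called above are not listed on this page. A hedged sketch of what they might look like, built on Ruby's standard URI library (hypothetical implementations, not the library's own):

require 'uri'

# Hypothetical: treat a URL with no host component as relative
def relative?(url)
  URI.parse(url).host.nil?
rescue URI::InvalidURIError
  false
end

# Hypothetical: resolve a relative link against the page it was found on
def make_absolute(current_url, relative_url)
  URI.join(current_url, relative_url).to_s
end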
open_url(url)
Wraps open(url) with exception handling so an unreachable URL does not abort the crawl.
# File lib/spider.rb, line 77
def open_url(url)
  open(url)
rescue
  puts "Unable to open url: " + url
end
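open(url) here comes from the open-uri standard library, so lib/spider.rb needs require 'open-uri' somewhere (an assumption; that line is not shown on this page). On Ruby 2.7 and later, calling Kernel#open with a URL is deprecated, and an equivalent sketch would use URI.open instead:

require 'open-uri'

# Sketch of the same behaviour with the non-deprecated open-uri API
def open_url(url)
  URI.open(url)
rescue StandardError
  puts "Unable to open url: " + url
end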
parse_url(url_object)
Parses the fetched page into a DOM tree with Nokogiri.
# File lib/spider.rb, line 88
def parse_url(url_object)
  # doc = Hpricot(url_object)   # previously Hpricot, now Nokogiri
  doc = Nokogiri(url_object)
  puts 'Crawling url ' + url_object.base_uri.to_s
  doc
rescue
  puts 'Could not parse url: ' + url_object.base_uri.to_s
end
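Nokogiri() is the gem's top-level convenience parser and auto-detects HTML versus XML. When the content is known to be HTML, an explicit parse reads a little clearer; a sketch, assuming the nokogiri gem is installed and using a placeholder URL:

require 'nokogiri'
require 'open-uri'

url_object = URI.open('https://example.com')   # illustrative URL
doc = Nokogiri::HTML(url_object)
doc.css('a[href]').each { |link| puts link['href'] }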
update_url_if_redirected(url_object)
# File lib/spider.rb, line 83
def update_url_if_redirected(url_object)
  # OpenURI sets base_uri to the final URL after any redirects have been followed
  url_object.base_uri.to_s
end
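OpenURI follows redirects automatically and records the final location in base_uri, which is why the crawler re-reads the URL from the response object. A small illustrative sketch (the URL is a placeholder):

require 'open-uri'

url_object = URI.open('http://example.com')   # may redirect, e.g. to https
puts url_object.base_uri.to_s                 # prints the post-redirect URL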