class NewsCrawler::LinkSelector::SameDomainSelector
Selects all links from the same domain. The domain is retrieved from the database.
Public Class Methods
exclude?(url)
click to toggle source
Tests whether a URL is excluded. @param [ String ] url @return [ Boolean ] true if url is excluded, false otherwise
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 111
# Test whether a URL is excluded by the configured exclusion rules
# for its domain (config group :same_domain_selector / exclude).
#
# @param  [ String  ] url
# @return [ Boolean ] true if url is excluded, false otherwise
def self.exclude?(url)
  config = SimpleConfig.for :same_domain_selector
  url_domain = get_url_path(url)[:domain]

  begin
    exclude_group = config.exclude
  rescue NoMethodError
    # No exclusion rules configured at all.
    return false
  end

  # Pick the rule list of the first configured group whose key is a
  # suffix of this URL's domain.
  exclude_list = []
  unless exclude_group.nil?
    exclude_group.to_hash.keys.each do |url_e|
      if url_domain.to_s.end_with? url_e.to_s
        exclude_list = config.exclude.get(url_e)
        break
      end
    end
  end

  # Compile each rule: "/.../" is treated as a regexp literal (slashes
  # stripped); anything else matches as a full path segment of the URL.
  exclude_list = exclude_list.map do |elt|
    if %r{^/.*/$} =~ elt
      Regexp.new(elt[1..-2]) # already a regexp, strip the surrounding slashes
    else
      Regexp.new("^(.*/)?#{elt}(/.*)?$")
    end
  end

  return false if exclude_list.empty?

  exclude_list.each do |exclude_rule|
    return true if exclude_rule =~ url
  end
  false
end
new(max_depth = -1, start_on_create = true)
click to toggle source
Creates a new selector with a queue. Selected URLs are put back into the queue. @param [ Fixnum ] max_depth maximum depth to crawl @param [ Boolean ] start_on_create whether to start the selector immediately
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 46
# Create a new selector backed by the shared URL queue; selected URLs
# are put back into the queue.
#
# @param [ Fixnum  ] max_depth       maximum depth to crawl (-1 = unlimited)
# @param [ Boolean ] start_on_create whether to start the selector immediately
def initialize(max_depth = -1, start_on_create = true)
  @max_depth = max_depth
  @wait_time = 1        # initial backoff delay (seconds) for wait_for_url
  @stoping   = false    # set true by graceful_terminate to stop the loop
  @status    = :running
  run if start_on_create
end
Public Instance Methods
extract_url(url)
click to toggle source
Extracts URLs from a page.
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 55
# Extract same-domain URLs from the stored page body of +url+ and add
# them to the URL queue (duplicates are silently skipped).
#
# @param  [ String ] url URL whose raw page body is read from RawData
# @return [ Array<Array(String, String)> ] pairs of [found_url, source_url]
#   that were newly added to the queue
def extract_url(url)
  doc = RawData.find_by_url(url)
  html_doc = Nokogiri::HTML(doc)
  results = []

  # Collect every href, resolving root-relative links against +url+.
  inner_url = html_doc.xpath('//a').collect do |a_el|
    temp_url = (a_el.attribute 'href').to_s
    if temp_url[0] == '/'
      temp_url = URI.join(url, temp_url).to_s
    end
    temp_url
  end

  # Drop empty / anchor-only / javascript pseudo-links.
  # (Renamed block param: the original shadowed the +url+ parameter.)
  inner_url.delete_if do |i_url|
    i_url.nil? || i_url.size == 0 || i_url == '#' || i_url == 'javascript:;'
  end

  # Queue URLs from the same domain that are not excluded.
  # (Original used Enumerable#select purely for side effects.)
  inner_url.each do |o_url|
    next unless same_domain?(o_url, url)
    if SameDomainSelector.exclude?(o_url)
      # TODO: log excluded URL here
      next
    end
    begin
      URLQueue.add(o_url, url)
      results << [o_url, url]
    rescue URLQueue::DuplicateURLError
      # Already queued — best-effort, skip silently.
    end
  end
  results
end
graceful_terminate()
click to toggle source
Gracefully terminates this selector.
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 157
# Gracefully terminate this selector: signal the worker loop to stop,
# then block until it is no longer running.
def graceful_terminate
  @stoping = true
  sleep(1) while @status == :running
end
run()
click to toggle source
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 89
# Worker loop: repeatedly take the next unprocessed URL (waiting with
# backoff when the queue is empty), extract its links, and mark it
# processed, until graceful_terminate sets @stoping.
def run
  @status = :running
  return if @stoping
  if @max_depth == 0
    @status = :stopped
    return
  end
  while !@stoping
    url = next_unprocessed(@max_depth - 1)
    while url.nil?
      wait_for_url
      url = next_unprocessed(@max_depth - 1)
    end
    NCLogger.get_logger.info "Processing #{url}"
    extract_url(url)
    mark_processed(url)
  end
  # BUGFIX: mark ourselves stopped on loop exit; otherwise
  # graceful_terminate would wait on @status == :running forever.
  @status = :stopped
end
Private Instance Methods
wait_for_url()
click to toggle source
Waits for new URLs to be added to the queue, using an exponential backoff algorithm.
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 166
# Wait for new URLs to be added to the queue, using exponential
# backoff: the delay doubles on each call, capped at 30 seconds.
# NOTE(review): @wait_time is never reset to 1 once a URL is found —
# verify whether the caller should reset it.
def wait_for_url
  @status = :waiting
  sleep @wait_time
  # BUGFIX: original assigned @wait_times (typo), so the backoff
  # delay never actually grew.
  @wait_time *= 2 if @wait_time < 30
end