class NewsCrawler::LinkSelector::SameDomainSelector

Selects all links from the same domain. The domain is taken from the database.

Public Class Methods

exclude?(url)

Test whether url is excluded.
@param [ String ] url
@return [ Boolean ] true if url is excluded, false otherwise

# File lib/news_crawler/link_selector/same_domain_selector.rb, line 111
def self.exclude?(url)
  config       = SimpleConfig.for :same_domain_selector
  exclude_list = []
  url_domain   = get_url_path(url)[:domain]
  begin
    exclude_group = config.exclude
  rescue NoMethodError
    # no exclude rules configured
    return false
  end

  unless exclude_group.nil?
    exclude_group.to_hash.keys.each do | url_e |
      if url_domain.to_s.end_with? url_e.to_s
        exclude_list = config.exclude.get(url_e)
        break
      end
    end
  end

  exclude_list = exclude_list.map do | elt |
    if /^\/.*\/$/ =~ elt
      Regexp.new(elt[1..-2])                        # already a Regexp literal, strip the slashes
    else
      new_elt = "^(.*/)?#{elt}(/.*)?$"              # match elt as a whole path segment
      Regexp.new(new_elt)
    end
  end

  return false if exclude_list.empty?

  exclude_list.each do | exclude_rule |
    return true if exclude_rule =~ url
  end
  return false
end
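
A minimal sketch of how exclusion rules might be configured; the domain key and patterns here are hypothetical. Plain entries match whole path segments, while entries wrapped in slashes are treated as regular expressions:

SimpleConfig.for :same_domain_selector do
  group :exclude do
    set :'example.com', ['tag', '/\/comments\//']
  end
end

NewsCrawler::LinkSelector::SameDomainSelector.exclude?('http://example.com/tag/ruby')
# => true ('tag' matches as a whole path segment)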

new(max_depth = -1, start_on_create = true)

Create a new selector with a queue; selected URLs are put back into the queue.
@param [ Fixnum ] max_depth maximum depth to crawl
@param [ Boolean ] start_on_create whether to start the selector immediately

# File lib/news_crawler/link_selector/same_domain_selector.rb, line 46
def initialize(max_depth = -1, start_on_create = true)
  @max_depth = max_depth
  @wait_time = 1
  @status = :running
  @stopping = false
  run if start_on_create
end
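
A usage sketch (the depth value is arbitrary):

# crawl up to 3 levels deep, starting immediately
selector = NewsCrawler::LinkSelector::SameDomainSelector.new(3)

# or create without starting, then run manually
selector = NewsCrawler::LinkSelector::SameDomainSelector.new(-1, false)
selector.run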

Public Instance Methods

extract_url(url)

Extract URLs from the fetched page, queue those on the same domain, and return them as [extracted_url, source_url] pairs.

# File lib/news_crawler/link_selector/same_domain_selector.rb, line 55
def extract_url(url)
  doc      = RawData.find_by_url(url)
  html_doc = Nokogiri::HTML(doc)
  results  = []

  # collect href targets, resolving root-relative paths against the page url
  inner_url = html_doc.xpath('//a').collect { | a_el |
    temp_url = a_el.attribute('href').to_s
    if temp_url[0] == '/'
      temp_url = URI.join(url, temp_url).to_s
    end
    temp_url
  }

  inner_url.delete_if { | link |
    link.nil? || link.size == 0 || link == '#' ||
      link == 'javascript:;'
  }

  # queue urls from the same domain, skipping excluded ones
  inner_url.each do | o_url |
    next unless same_domain?(o_url, url)
    if SameDomainSelector.exclude?(o_url)
      # TODO Log here
      next
    end
    begin
      URLQueue.add(o_url, url)
      results << [o_url, url]
    rescue URLQueue::DuplicateURLError
      # already queued, skip
    end
  end
  results
end
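
Assuming the page at the given url has already been fetched into RawData, a call might look like this (URLs hypothetical):

selector.extract_url('http://example.com/news')
# => [['http://example.com/news/article-1', 'http://example.com/news'], ...]

Each pair is [extracted_url, source_url]; each extracted URL is also pushed onto URLQueue.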

graceful_terminate()

Gracefully terminate this selector, blocking until the current URL finishes processing.

# File lib/news_crawler/link_selector/same_domain_selector.rb, line 157
def graceful_terminate
  @stopping = true
  while @status == :running
    sleep(1)
  end
end
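
A shutdown sketch, assuming the selector's run loop is executing in a background thread:

worker = Thread.new { selector.run }
# ... later, request a stop and wait for the loop to finish ...
selector.graceful_terminate
worker.join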

run()
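
Run the selector's main loop: take the next unprocessed URL (waiting with backoff while the queue is empty), extract and queue its links, and mark it processed, repeating until termination is requested.
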
# File lib/news_crawler/link_selector/same_domain_selector.rb, line 89
def run
  @status = :running
  return if @stopping
  if @max_depth == 0
    @status = :stopped
    return
  end
  while !@stopping
    url = next_unprocessed(@max_depth - 1)
    while url.nil?
      wait_for_url
      url = next_unprocessed(@max_depth - 1)
    end
    @status = :running            # back from waiting
    NCLogger.get_logger.info "Processing #{url}"
    extract_url(url)
    mark_processed(url)
  end
  @status = :stopped              # lets graceful_terminate return
end

Private Instance Methods

wait_for_url()

Wait for new URLs to be added to the queue, using an exponential backoff algorithm.

# File lib/news_crawler/link_selector/same_domain_selector.rb, line 166
def wait_for_url
  @status = :waiting
  sleep @wait_time
  # exponential backoff: keep doubling the delay until it reaches the cap
  if @wait_time < 30
    @wait_time = @wait_time * 2
  end
end
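
The doubling rule produces delays of 1, 2, 4, 8, 16, 32 seconds and then stays at 32, since doubling stops once the delay reaches 30:

delay = 1
7.times { print delay, ' '; delay *= 2 if delay < 30 }
# => 1 2 4 8 16 32 32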