module NewsCrawler::Storage::URLQueue
Stores and manipulates the URL queue
Constants
- ACTION_LIST
- PROCESSED
- PROCESSING
- UNPROCESSED
Public Class Methods
Add URL to queue @param [ String ] url @param [ String ] ref_url reference url
# File lib/news_crawler/storage/url_queue.rb, line 119
# Queue a URL, optionally together with the URL that referred to it.
# The URL is always run through normalize_url; the reference URL is
# normalized only when it is not the empty string.
#
# @param [ String ] url
# @param [ String ] ref_url reference url
# @return whatever the engine's #add returns
def add(url, ref_url = '')
  target = normalize_url(url)
  referrer = ref_url == '' ? ref_url : normalize_url(ref_url)
  @engine.add(target, referrer)
end
Get all URLs with their status @return [ Array ] URL list
# File lib/news_crawler/storage/url_queue.rb, line 135
# Fetch every queued URL by delegating to the configured engine.
#
# @return [ Array ] URL list as produced by the engine
def all
  @engine.all
end
Clear URLQueue
@return [ Fixnum ] number of urls removed
# File lib/news_crawler/storage/url_queue.rb, line 129
# Empty the queue by delegating to the configured engine.
#
# @return [ Fixnum ] number of urls removed, as reported by the engine
def clear
  @engine.clear
end
Find all visited URLs with the given state for a module @param [ String ] module_name @param [ String ] state @param [ Fixnum ] max_depth maximum URL depth to return (inclusive) @return [ Array ] URL list
# File lib/news_crawler/storage/url_queue.rb, line 88
# Look up every visited URL whose state for the given module matches.
# The arguments are forwarded unchanged to the configured engine.
#
# @param [ String ] module_name
# @param [ String ] state
# @param [ Fixnum ] max_depth maximum URL depth to return (inclusive); defaults to -1
# @return [ Array ] URL list
def find_all(module_name, state, max_depth = -1)
  @engine.find_all(module_name, state, max_depth)
end
Find one visited URL with the given processing state for a module @param [ String ] module_name @param [ String ] state one of unprocessed, processing, processed @param [ Fixnum ] max_depth maximum URL depth to return (inclusive) @return [ String, nil ] URL
# File lib/news_crawler/storage/url_queue.rb, line 97
# Look up a single visited URL whose state for the given module matches.
# The arguments are forwarded unchanged to the configured engine.
#
# @param [ String ] module_name
# @param [ String ] state one of unprocessed, processing, processed
# @param [ Fixnum ] max_depth maximum URL depth to return (inclusive); defaults to -1
# @return [ String, nil ] URL
def find_one(module_name, state, max_depth = -1)
  @engine.find_one(module_name, state, max_depth)
end
Get the list of unvisited URLs @param [ Fixnum ] max_depth maximum depth of the URLs to return @return [ Array ] unvisited URLs, optionally limited to the maximum depth
# File lib/news_crawler/storage/url_queue.rb, line 112
# List URLs that have not been visited yet, delegating to the configured engine.
#
# @param [ Fixnum ] max_depth maximum depth of the URLs to return; defaults to -1
# @return [ Array ] unvisited URLs
def find_unvisited(max_depth = -1)
  @engine.find_unvisited(max_depth)
end
Set processing state of url in given module @param [ String ] module_name @param [ String ] url @param [ String ] state one of unprocessed, processing, processed
# File lib/news_crawler/storage/url_queue.rb, line 70
# Record the processing state of a URL for the given module.
# The URL is run through normalize_url before it reaches the engine.
#
# @param [ String ] module_name
# @param [ String ] url
# @param [ String ] state one of unprocessed, processing, processed
# @return whatever the engine's #mark returns
def mark(module_name, url, state)
  @engine.mark(module_name, normalize_url(url), state)
end
Mark all URLs with the given state @param [ String ] module_name @param [ String ] new_state new state @param [ String ] orig_state original state
# File lib/news_crawler/storage/url_queue.rb, line 79
# Move URLs to a new state for the given module in one call.
# The arguments are forwarded unchanged to the configured engine.
#
# @param [ String ] module_name
# @param [ String ] new_state new state
# @param [ String ] orig_state original state; defaults to nil
# @return whatever the engine's #mark_all returns
def mark_all(module_name, new_state, orig_state = nil)
  @engine.mark_all(module_name, new_state, orig_state)
end
Mark all URLs as unvisited
# File lib/news_crawler/storage/url_queue.rb, line 62
# Reset every URL to unvisited by delegating to the configured engine.
#
# @return whatever the engine's #mark_all_unvisited returns
def mark_all_unvisited
  @engine.mark_all_unvisited
end
Mark a URL as visited @param [ String ] url
# File lib/news_crawler/storage/url_queue.rb, line 56
# Flag a URL as visited. The URL is run through normalize_url before it
# reaches the engine.
#
# @param [ String ] url
# @return whatever the engine's #mark_visited returns
def mark_visited(url)
  @engine.mark_visited(normalize_url(url))
end
Atomically get the next unprocessed URL and mark it as processing @param [ String ] module_name @param [ Fixnum ] max_depth maximum URL depth to return (inclusive) @return [ String, nil ] URL, or nil if no such URL exists
# File lib/news_crawler/storage/url_queue.rb, line 105
# Ask the configured engine for the next URL the given module has not
# processed yet. The arguments are forwarded unchanged; marking the URL
# as processing is left to the engine.
#
# @param [ String ] module_name
# @param [ Fixnum ] max_depth maximum URL depth to return (inclusive); defaults to -1
# @return [ String, nil ] URL, or nil when the engine has none to offer
def next_unprocessed(module_name, max_depth = -1)
  @engine.next_unprocessed(module_name, max_depth)
end
# File lib/news_crawler/storage/url_queue.rb, line 139
# Make sure a URL carries an explicit HTTP(S) scheme.
#
# A URL that already begins with "http://" or "https://" (in any letter
# case, since URI schemes are case-insensitive) is returned untouched.
# Anything else is treated as scheme-less and gets "http://" prepended.
# The scheme separator is part of the check so that hosts which merely
# start with the letters "http" (e.g. "httpbin.org") are still prefixed.
#
# @param [ String ] url
# @return [ String ] the URL with an HTTP(S) scheme
def normalize_url(url)
  return url if url =~ %r{\Ahttps?://}i

  "http://#{url}"
end
Set the URLQueue storage engine
@param [ Symbol, Object ] engine specifies the database engine; pass an object to use a custom engine
@param [ Hash ] opts options passed to the engine
The engine can be:
* `:mongo`, `:mongodb` for the MongoDB backend
# File lib/news_crawler/storage/url_queue.rb, line 42
# Choose the storage engine backing the queue.
#
# Anything that responds to #intern (e.g. a String or Symbol) is interned
# and looked up in URLQueueEngine.get_engines. When a class is registered
# under that key it is instantiated with the given options; otherwise the
# (possibly interned) argument itself is stored as the engine, which is
# how a custom engine object is installed.
#
# @param [ Symbol, Object ] engine engine name, or an object to use as a custom engine
# @param [ Array ] opts options passed on to the engine class constructor
# @return the engine that was stored
def set_engine(engine, *opts)
  key = engine.respond_to?(:intern) ? engine.intern : engine
  backend = URLQueueEngine.get_engines[key]
  @engine = backend ? backend.new(*opts) : key
end