module CrawlerProcess
Public Class Methods
new(crawler, base_url, options={:timeout=>300, :user_agent=>nil})
click to toggle source
Initialize the crawler process
# File lib/content_crawler/crawler_process.rb, line 11 def initialize(crawler, base_url, options={:timeout=>300, :user_agent=>nil}) @base_url = base_url case crawler when "selenium_webdriver_with_headless" @headless = Headless.new @headless.start watir_web_browser(options[:timeout]) when "selenium_webdriver_without_headless" watir_web_browser(options[:timeout]) when "mechanize_parser" mechanize_parser(options[:user_agent]) else puts "Please select any one of the parser(selenium_webdriver_with_headless, selenium_webdriver_without_headless, mechanize_parser) to crawl content" end end
Public Instance Methods
audio_video_collection(audio_video_detail, options={})
click to toggle source
To get audio video details
# File lib/content_crawler/crawler_process.rb, line 112 def audio_video_collection(audio_video_detail, options={}) auvid_collection = [] audio_video_detail.each do |auvid| hash = {} hash[:src] = auvid.attributes["src"].value.strip hash[:type] = auvid.attributes["type"].value.strip auvid_collection << hash end collection_attr(auvid_collection, options) end
check_local_dir(image_store_dir)
click to toggle source
To save images in dir
# File lib/content_crawler/crawler_process.rb, line 83 def check_local_dir(image_store_dir) image_store_dir = "#{Dir.home}/crawled_images" if image_store_dir.nil? if not Dir.exist?("#{image_store_dir}") Dir.mkdir("#{image_store_dir}") end image_store_dir end
close_browser()
click to toggle source
close browser
# File lib/content_crawler/crawler_process.rb, line 164 def close_browser @browser.close if not @browser.nil? @headless.destroy if not @headless.nil? end
collection_attr(collection, options)
click to toggle source
To get particular attribute
# File lib/content_crawler/crawler_process.rb, line 144 def collection_attr(collection, options) collection = [collection].flatten.compact.uniq case options[:format] when "srcs_types", "texts_values", "texts_srcs", "texts_hrefs" collection when "only_srcs" collection.map{|collobjt| collobjt[:src]}.compact when "only_types" collection.map{|collobjt| collobjt[:type]}.compact when "only_values" collection.map{|collobjt| collobjt[:value]}.compact when "only_texts" collection.map{|collobjt| collobjt[:text]}.compact when "only_hrefs" collection.map{|collobjt| collobjt[:href]}.compact else collection end end
collection_links(parser_links, options={})
click to toggle source
To get the anchor tag details
# File lib/content_crawler/crawler_process.rb, line 45 def collection_links(parser_links, options={}) links = Array.new parser_links = [parser_links].flatten.uniq parser_links.each do |link| data = {} data[:href] = link.attributes["href"].nil? ? " " : link.attributes["href"].value.strip data[:text] = link.text.nil? ? " " : link.text.strip links << data end collection_attr(links, options) end
datalist_collection(datalist_detail, options={})
click to toggle source
to get datalists
# File lib/content_crawler/crawler_process.rb, line 134 def datalist_collection(datalist_detail, options={}) datalists = [] datalist_detail.each do |datalist| hash = {} hash[:value] = datalist.attributes["value"].value.strip datalists << hash end collection_attr(datalists, options) end
iframe_embed_collection(ifrm_embd_detail, options={})
click to toggle source
To get iframe links
# File lib/content_crawler/crawler_process.rb, line 102 def iframe_embed_collection(ifrm_embd_detail, options={}) ifrm_embds = [] ifrm_embd_detail.each do |ifrmembd| hash = {} hash[:src] = ifrmembd.value.strip ifrm_embds << hash end collection_attr(ifrm_embds, options) end
mechanize_parser(user_agent=nil)
click to toggle source
Mechanize parser
# File lib/content_crawler/crawler_process.rb, line 35 def mechanize_parser(user_agent=nil) if user_agent.nil? @agent = Mechanize.new{|a| a.ssl_version, a.verify_mode = 'SSLv3', OpenSSL::SSL::VERIFY_NONE} else @agent = Mechanize.new{|agent| agent.user_agent_alias = user_agent} end #@page = @agent.get(@base_url).parser @agent end
object_collection(object_detail, options={})
click to toggle source
to Get object details
# File lib/content_crawler/crawler_process.rb, line 123 def object_collection(object_detail, options={}) objects = [] object_detail.each do |object| hash = {} hash[:text] = object.text.strip hash[:value] = object.value.strip objects << hash end collection_attr(objects, options) end
select_collection(select_detail, options={})
click to toggle source
To get select tag details
# File lib/content_crawler/crawler_process.rb, line 91 def select_collection(select_detail, options={}) selects = [] select_detail.each do |select| hash = {} hash[:text] = select.text.strip hash[:value] = select.attributes["value"].text.strip selects << hash end collection_attr(selects, options) end
store_remote_image(image_detail, image_store_dir)
click to toggle source
To get image
# File lib/content_crawler/crawler_process.rb, line 57 def store_remote_image(image_detail, image_store_dir) image_store_dir = check_local_dir(image_store_dir) remote_image_urls = iframe_embed_collection(image_detail, {:format => "only_srcs"}) local_images = [] remote_image_urls.each do |image_url| image_url = "#{@base_url}#{image_url}" if not image_url.include?("http") url = URI.parse(image_url) response = Net::HTTP.get_response(url) if response.is_a?(Net::HTTPSuccess) http = Net::HTTP.new(url.host, url.port) http.use_ssl = true if url.scheme == "https" http.verify_mode = OpenSSL::SSL::VERIFY_NONE http.start do http.request_get(url.path) do |res| File.open("#{image_store_dir}/#{File.basename(url.path)}",'wb') do |file| file.write(res.body) end end end local_image = "#{image_store_dir}/#{File.basename(url.path)}" local_images << local_image end end local_images end
watir_web_browser(timeout)
click to toggle source
Web driver watir browser, which will be opening a browser
# File lib/content_crawler/crawler_process.rb, line 27 def watir_web_browser(timeout) client = Selenium::WebDriver::Remote::Http::Default.new client.timeout = timeout @browser = Watir::Browser.new :firefox, :http_client => client @browser.goto(@base_url) @browser end