module CrawlerHelper
Public Instance Methods
bad_link?(each_link)
click to toggle source
def bad_link?(each_link)
  # Returns true when the crawler should skip this URL: links with a
  # query string ("?"), a fragment ("#"), a static-asset extension
  # (.png, .css, js, jpg, pdf) or the "/video/pop" path segment.
  # The comparison is case-insensitive (URL is upcased first).
  # NOTE(review): "JS", "JPG" and "PDF" are bare substring checks, so
  # e.g. "JSON" in a URL also matches — preserved from the original.
  upcased = each_link.upcase
  markers = ["?", "#", ".PNG", ".CSS", "JS", "JPG", "PDF", "/VIDEO/POP"]
  markers.any? { |marker| upcased.include?(marker) }
rescue StandardError => e
  # Was `rescue Exception`, which also swallows SignalException and
  # SystemExit; StandardError is the correct default. On error this
  # falls through returning nil (falsy), matching the original.
  puts e.message
  puts e.backtrace.inspect
end
# File lib/crawler_lib.rb, line 52
def do_not_ignore?(each_link, scraped)
  # A link is worth scraping only when it has not already been
  # visited (i.e. is absent from +scraped+) and is not a bad_link?.
  return false if scraped.include?(each_link)
  return false if bad_link?(each_link)
  true
end
fix_scheme(url)
click to toggle source
# File lib/crawler_lib.rb, line 10
def fix_scheme(url)
  # Determine the real scheme (http/https) for a bare host: load the
  # URL over plain http in a headless-ish Firefox session and read the
  # scheme of wherever the site redirected us.
  # Returns the original url with the discovered scheme prepended.
  puts "- No scheme provided for #{url}, trying to fix it."
  driver = Selenium::WebDriver.for :firefox
  begin
    driver.get("http://" + url) # assumes a redirect to https is set up if it exists
    scheme = URI.parse(driver.current_url).scheme
  ensure
    # Fix: the original only called quit on the happy path, leaking a
    # browser process whenever get/parse raised.
    driver.quit
  end
  puts "scheme is: #{scheme}"
  scheme + "://" + url
end
sanitize(link)
click to toggle source
# File lib/crawler_lib.rb, line 22
def sanitize(link)
  # Turn a URL into a filesystem-safe name by removing every ":",
  # "/", "%", "\" and "." character. The original chained five
  # separate gsub calls, allocating an intermediate string at each
  # step; one pass over a character class is equivalent and cheaper.
  link.gsub(%r{[:/%\\.]}, "")
end
test()
click to toggle source
# File lib/crawler_lib.rb, line 6
def test
  # Simple smoke-test hook: prints a fixed message and returns nil.
  message = "Test call"
  puts message
end