module ArticleCrux
Constants
- VERSION
Public Class Methods
fetch(url, user_agent="ArticleCrux(https://github.com/amitsaxena/article_crux)")
click to toggle source
# File lib/article_crux.rb, line 9
# Downloads +url+ and extracts a representative image, a title and a tag list.
#
# Steps, in order:
# 1. +url+ is prefixed with "http://" unless it already starts with http:// or https://.
# 2. A HEAD request is sent; when the content type is image/*, the URL itself is
#    returned as the image (no title, no tags).
# 3. The page is fetched with HTTParty; on any error or non-200 status it is
#    fetched again through open-uri.
# 4. When the page declares an og:url that differs from +url+, that page replaces
#    the document if it answers with status 200; otherwise the original is kept.
# 5. Image: the best og:image according to get_best_image. When there is none, or
#    it cannot be measured, or it is smaller than 100px in both dimensions, the
#    best <img> of the page is used instead, subject to the same size check.
# 6. Title: og:title when present, otherwise the text of <title>.
# 7. Tags: the comma-separated "content" of every <meta> whose name or property
#    contains "tag" or "keyword"; stripped, de-duplicated, empty entries dropped.
#
# Errors raised by the HEAD probe or by the open-uri fallback are not rescued here.
#
# @param url [String] page address, with or without scheme
# @param user_agent [String] value sent as the User-Agent header of page requests
# @return [Hash] {:image => String or nil, :title => String or nil, :tags => Array of String}
def self.fetch(url, user_agent="ArticleCrux(https://github.com/amitsaxena/article_crux)")
  # \A (not ^) so that only the very start of the string can satisfy the check
  absolute_url = %r{\A(http|https)://}i
  url = "http://#{url}" unless url =~ absolute_url

  probe = HTTParty.head(url, headers: {"User-Agent" => user_agent})
  if probe.content_type && probe.content_type.split('/')[0] == "image"
    return {:image => url, :title => nil, :tags => []}
  end

  begin
    res = HTTParty.get(url, headers: {"User-Agent" => user_agent})
    raise "Unable to crawl URL" if res.code != 200
    doc = Nokogiri::HTML(res)
  rescue
    # URI#open (open-uri) instead of Kernel#open: the string is parsed as a URI
    # first, so it can never be interpreted as a file name or a "|command" pipe,
    # and it keeps working on Ruby 3 where Kernel#open no longer accepts URLs.
    doc = Nokogiri::HTML(URI.parse(url).open("User-Agent" => user_agent))
  end

  og_url = doc.search("//meta[@property='og:url' or @name='og:url']")
  canonical = og_url.empty? ? nil : og_url[0]["content"]
  if !str_blank?(canonical) && canonical =~ absolute_url && url != canonical
    begin
      res = HTTParty.get(canonical, headers: {"User-Agent" => user_agent})
      raise "Unable to crawl URL" if res.code != 200
      doc = Nokogiri::HTML(res)
    rescue
      # If the og:url is faulty or doesn't return a 200 response, use the original url for meta data scraping
    end
  end

  # Collect og:image candidates, turning protocol-relative and root-relative
  # references into absolute URLs based on the requested url.
  og_images = []
  doc.search("//meta[@property='og:image' or @name='og:image']").each do |meta|
    candidate = meta["content"]
    next if str_blank?(candidate)
    if candidate =~ %r{\A//}
      candidate = "#{URI.parse(url).scheme}:#{candidate}"
    elsif candidate =~ %r{\A/}
      uri = URI.parse(url)
      candidate = File.join("#{uri.scheme}://#{uri.host}", candidate)
    end
    og_images << candidate
  end

  # An image is rejected when it is missing, cannot be measured, or is smaller
  # than 100px in both dimensions.
  too_small = lambda do |candidate|
    size = FastImage.size(Addressable::URI.escape(candidate)) unless str_blank?(candidate)
    size.nil? || (size[0] < 100 && size[1] < 100)
  end

  # Try to get the best image based on heuristics
  image = get_best_image(og_images)

  # If og:image is an invalid image (or extremely small), fall back to <img> tags
  if too_small.call(image)
    # <base href> does not change between images, so look it up once
    base = doc.search("//base")[0]
    base_href = base["href"] if !base.nil? && !str_blank?(base["href"])

    image_paths = []
    doc.search("//img").each do |img|
      src = img["src"]
      # inline data: URIs are not fetchable addresses
      next if str_blank?(src) || src =~ /\Adata:/i
      if src =~ %r{\A//}
        # protocol-relative, same treatment as for og:image
        src = "#{URI.parse(url).scheme}:#{src}"
      elsif !(src =~ absolute_url)
        root = base_href
        if str_blank?(root)
          uri = URI.parse(url)
          root = "#{uri.scheme}://#{uri.host}"
          # document-relative paths resolve against the directory of the page
          root += uri.path[%r{\A.*/}].to_s unless src =~ %r{\A/}
        end
        src = File.join(root, src)
      end
      image_paths << src
    end

    image = get_best_image(image_paths)
    image = nil if too_small.call(image) # never report an unusable image
  end

  og_title = doc.search("//meta[@property='og:title' or @name='og:title']")
  if !og_title.empty? && !str_blank?(og_title[0]["content"])
    clip_title = og_title[0]["content"]
  else
    page_title = doc.search("//title")[0]
    clip_title = page_title.text if !page_title.nil?
  end

  tag_nodes = doc.xpath('//meta[contains(@name, "tag") or contains(@name, "keyword") or contains(@property, "tag") or contains(@property, "keyword")]')
  tags = tag_nodes.map { |node| node["content"] }
                  .reject { |content| str_blank?(content) }
                  .flat_map { |content| content.split(',') }
                  .map(&:strip)
                  .reject(&:empty?) # "a,,b" or "a, " must not yield an empty tag
                  .uniq

  {:image => image, :title => clip_title, :tags => tags}
end
get_best_image(images)
click to toggle source
# File lib/article_crux.rb, line 96
# Chooses one image URL out of +images+.
#
# * No candidates: nil. A single candidate: returned without being inspected.
# * Candidates whose URL contains "logo" or "fallback" (case-insensitive) are
#   set aside; if exactly one remains it is returned, if none remain all
#   candidates are considered again.
# * The remaining candidates are measured over the network. The first GIF larger
#   than 299x199 wins immediately; otherwise the widest measurable image wins.
#   Images that cannot be measured are ignored.
#
# @param images [Array of String] candidate image URLs
# @return [String, nil] the selected URL, or nil when nothing could be selected
def self.get_best_image(images)
  return nil if images.empty?
  return images[0] if images.size == 1

  # reject logo or similar images
  refined_images = images.reject { |i| i =~ /logo|fallback/i }
  return refined_images[0] if refined_images.size == 1
  refined_images = images if refined_images.empty?

  measured = []
  refined_images.each do |i|
    # One FastImage instance exposes both size and type, so each candidate is
    # probed once instead of once per attribute.
    info = FastImage.new(Addressable::URI.escape(i))
    size = info.size
    next if size.nil?
    return i if info.type == :gif && size[0] > 299 && size[1] > 199
    measured << {:x => size[0], :y => size[1], :image => i}
  end

  widest = measured.max_by { |d| d[:x] }
  widest && widest[:image]
end
str_blank?(str)
click to toggle source
# File lib/article_crux.rb, line 114
# Tells whether +str+ carries no usable text.
#
# @param str [String, nil] value to test
# @return [Boolean] true for nil, the empty string and strings made only of
#   whitespace (for example an attribute written as content=" ")
def self.str_blank?(str)
  str.nil? || str.strip.empty?
end