class Raev::Url
Constants
- AUTHOR_CSS_SELECTORS
- REGEX_ENTRY_DATE
- REGEX_PAGE_TITLE
- REGEX_URL_DATE
- REGEX_UTM
Attributes
body[R]
doc[R]
url[R]
Public Class Methods
base(url)
click to toggle source
# File lib/raev/url.rb, line 48
# Extracts the host portion of a URL, without a leading "www.".
#
# The host is taken as the third "/"-separated segment of +url+, so the URL
# is expected to look like "scheme://host/...".
#
# url - String URL, or nil.
#
# Returns the host String with a leading "www." removed, or nil when +url+
# is nil or has no third segment.
def self.base(url)
  return nil if url.nil?

  host = url.split('/'.freeze)[2]
  # Strip only a leading "www." -- a "www." appearing later in the host
  # (e.g. "awww.example.com") is part of the name and must be kept.
  host && host.sub(/\Awww\./, ''.freeze)
end
new(url)
click to toggle source
# File lib/raev/url.rb, line 40
# Builds a Url by fetching +url+ and recording the resulting address.
#
# fetch stores the fetched address in @url and the response body in @body;
# the stored address is then passed through Url.remove_utm.
# The parsed document and linked data start out empty and are filled in
# lazily by #document and #linked_data.
#
# url - String URL to fetch.
def initialize(url)
  # Lazy caches, populated on first use.
  @doc = nil
  @linked_data = nil

  # Default body in case the fetch does not yield one.
  @body = ""
  fetch(url)

  @url = Url.remove_utm(@url)
end
remove_utm(url)
click to toggle source
# File lib/raev/url.rb, line 54
# Truncates +url+ at the first position where REGEX_UTM matches.
#
# url - String URL, or nil.
#
# Returns the part of +url+ before the first REGEX_UTM match, +url+ itself
# when there is no match, or nil when +url+ is nil.
def self.remove_utm(url)
  return url if url.nil?

  cut_at = url.index(REGEX_UTM)
  cut_at.nil? ? url : url.slice(0, cut_at)
end
Public Instance Methods
bestRating()
click to toggle source
# File lib/raev/url.rb, line 204
# Reads the value of the first element marked itemprop="bestRating".
#
# The value is taken from the element's "content" attribute when present,
# otherwise from the element's text content -- the same lookup order that
# #ratingValue uses.
#
# Returns the value converted with to_f, or nil when no such element exists.
def bestRating
  node = document.search('*[itemprop="bestRating"]'.freeze).first
  return nil unless node

  content_attr = node.attribute("content".freeze)
  # Fall back to the element text when there is no content attribute.
  raw = content_attr ? content_attr.value : node.content
  raw ? raw.to_f : nil
end
document()
click to toggle source
# File lib/raev/url.rb, line 220
# Returns the fetched body parsed with Nokogiri::HTML.
#
# The body is parsed on the first call and the result is cached in @doc
# for subsequent calls.
def document
  @doc ||= Nokogiri::HTML(@body)
end
feed()
click to toggle source
# File lib/raev/url.rb, line 80
# Looks up the feed URL advertised by the page.
#
# The first <link type="application/rss+xml" rel="alternate"> element is
# preferred. When there is none, the first anchor selected by the custom
# match_href("http://feeds.") pseudo-class is used instead (the pseudo-class
# is supplied by Raev::Parser).
#
# An href that starts with "/" is resolved against the page URL, so a
# root-relative path lands on the site root and a protocol-relative
# "//host/path" keeps its own host.
#
# Returns the feed URL String, or nil when no feed link is found or the
# link has no href.
def feed
  link = document.css('link[type="application/rss+xml"][rel="alternate"]'.freeze).first
  link ||= document.css('a:match_href("http://feeds.")'.freeze, Raev::Parser.new).first
  return nil unless link

  feed_url = link["href"]
  return feed_url unless feed_url && feed_url[0, 1] == "/".freeze

  begin
    # Proper reference resolution: appending the path to the full page URL
    # would be wrong whenever @url itself has a path or query.
    URI.join(@url, feed_url).to_s
  rescue URI::Error
    # Unparsable URL: keep the previous plain-concatenation behaviour.
    @url + feed_url
  end
end
headline()
click to toggle source
# File lib/raev/url.rb, line 102
# Determines the page headline, trying several sources in order:
#
# 1. The "headline" entry of the page's linked data, passed through
#    Sanitize.clean and returned immediately.
# 2. The data-text attribute of the first ".twitter-share-button" element.
# 3. The content of an og:title / twitter:title <meta> in <head>
#    (when several match, the last one wins).
# 4. The text of the first "#article h1", a[rel="bookmark"] or
#    h2[itemprop="name"] element.
#
# For sources 2-4, every REGEX_PAGE_TITLE match in the title is replaced
# with a single space before it is returned.
#
# Returns the headline String, or nil when no source yields one.
def headline
  structured = linked_data
  if structured && structured["headline"]
    return Sanitize.clean(structured["headline"])
  end

  share_button = document.css(".twitter-share-button".freeze).first
  title = share_button ? share_button['data-text'] : nil

  if title.nil?
    document.css("head meta".freeze).each do |meta|
      property = meta['property']
      next unless property == 'og:title'.freeze || property == 'twitter:title'.freeze

      title = meta['content']
    end
  end

  if title.nil?
    heading = document.css("#article h1, a[rel=\"bookmark\"], h2[itemprop=\"name\"]".freeze).first
    title = heading.content if heading
  end

  title.gsub!(REGEX_PAGE_TITLE, ' '.freeze) unless title.nil?
  title
end
pubdate()
click to toggle source
# File lib/raev/url.rb, line 140
# Determines the publication date of the page, trying in order:
#
# 1. The "datePublished" entry of the page's linked data (Date.parse).
# 2. A date embedded in the URL: when the REGEX_URL_DATE match splits into
#    exactly three "/"-separated parts they are used as year, month, day.
# 3. The "content" attribute of the first
#    meta[itemprop='datePublished'] / meta[name='pub_date'] element.
# 4. The text of the first ".entryDate" / ".entrydate" element, with
#    REGEX_ENTRY_DATE matches removed, parsed by Chronic.
#
# Returns a Date for sources 1-3, whatever Chronic.parse returns for
# source 4, or nil when no source yields a date.
# Date.parse / Date.new raise on values that are not valid dates.
def pubdate
  if linked_data && linked_data["datePublished"]
    return Date.parse(linked_data["datePublished"])
  end

  date_elements = @url.match(REGEX_URL_DATE).to_s.split("/".freeze)
  if date_elements.size == 3
    return Date.new(date_elements[0].to_i, date_elements[1].to_i, date_elements[2].to_i)
  end

  meta = document.search("meta[itemprop='datePublished'], meta[name='pub_date']".freeze).first
  if meta
    content_attr = meta.attribute("content".freeze)
    # Date.parse needs a String, so hand it the attribute's value rather
    # than the attribute node. A meta tag without a content attribute is
    # skipped and the next source is tried.
    return Date.parse(content_attr.value) if content_attr
  end

  entry = document.search(".entryDate, .entrydate".freeze).first
  if entry
    return Chronic.parse(entry.content.gsub(REGEX_ENTRY_DATE, "".freeze).strip)
  end

  nil
end
ratingValue()
click to toggle source
# File lib/raev/url.rb, line 186
# Reads the value of the first element marked itemprop="ratingValue".
#
# The element's "content" attribute is used when present; otherwise the
# element's text content is used.
#
# Returns the value converted with to_f, or nil when no such element exists.
def ratingValue
  rating_node = document.search('*[itemprop="ratingValue"]'.freeze).first
  return nil unless rating_node

  raw =
    if rating_node.attribute("content".freeze)
      rating_node.attribute("content".freeze).value
    else
      rating_node.content
    end

  raw ? raw.to_f : nil
end
twitter()
click to toggle source
# File lib/raev/url.rb, line 69
# Extracts the last path segment of the first Twitter link on the page.
#
# The link is selected with the custom match_href("twitter.com")
# pseudo-class, which is supplied by Raev::Parser.
#
# Returns the text after the final "/" of the link's href, or nil when the
# page has no matching link.
def twitter
  link = document.css('a:match_href("twitter.com")'.freeze, Raev::Parser.new).first
  return nil unless link

  link["href"].split('/'.freeze).last
end
without_http()
click to toggle source
# File lib/raev/url.rb, line 65
# Returns the page URL with its leading "http://" or "https://" removed.
#
# Only a scheme at the very start of the URL is stripped; an "http://"
# that appears later (for example inside a query parameter) is left alone.
# The scheme is matched case-insensitively.
def without_http
  @url.sub(%r{\Ahttps?://}i, "".freeze)
end
Private Instance Methods
fetch(uri_str, limit = 10)
click to toggle source
# File lib/raev/url.rb, line 242
# Fetches +uri_str+ over HTTP, following redirects.
#
# Each address requested is recorded in @url, so after a chain of redirects
# @url holds the final address. On a successful response the body is stored
# in @body. Any other response (e.g. Not Found) leaves @body untouched.
#
# uri_str - String URL to request.
# limit   - Integer number of requests still allowed (default: 10).
#
# Raises ArgumentError when the redirect limit is exhausted.
def fetch(uri_str, limit = 10)
  raise ArgumentError, 'too many HTTP redirects' if limit == 0

  @url = uri_str unless uri_str.nil?
  response = Net::HTTP.get_response(URI(uri_str))

  case response
  when Net::HTTPSuccess
    @body = response.body
  when Net::HTTPRedirection
    location = response['location']
    # Some redirection responses carry no Location header; there is nothing
    # to follow in that case.
    if location
      # Location may be relative ("/new-path"); resolve it against the
      # address just requested before following it.
      fetch(URI.join(uri_str, location).to_s, limit - 1)
    end
  else
    # TODO handle Not Found
  end
end
linked_data()
click to toggle source
# File lib/raev/url.rb, line 230
# Returns the page's linked data: the parsed JSON content of the first
# <script type="application/ld+json"> element.
#
# The result is cached in @linked_data once found. Callers index the result
# with String keys, so only a Hash is ever returned: when the script holds a
# JSON array, its first Hash element is used.
#
# Returns a Hash, or nil when the page has no such script, the script is
# not valid JSON, or it contains no JSON object.
def linked_data
  return @linked_data unless @linked_data.nil?

  script = document.css("script[type=\"application/ld+json\"]".freeze).first
  return nil unless script

  begin
    parsed = JSON.parse(script.content)
  rescue JSON::ParserError
    # Malformed JSON in the page: treat as "no linked data" so callers can
    # fall back to their other sources.
    return nil
  end

  parsed = parsed.find { |entry| entry.is_a?(Hash) } if parsed.is_a?(Array)
  @linked_data = parsed if parsed.is_a?(Hash)
  @linked_data
end