module Textract
Constants
- TAG_WHITELIST
attr_accessor :client
- VERSION
Public Class Methods
build_site(url, html)
click to toggle source
# File lib/textract.rb, line 91
# Builds the site metadata (display name and Twitter handle) for a page.
#
# url  - address of the page; only used to derive a name when the markup
#        carries no usable og:site_name meta tag.
# html - markup handed to Nokogiri::HTML.
#
# Returns a Hash with two keys:
#   :name    - the og:site_name +content+ when present; otherwise the URL's
#              host with a leading "www." removed and capitalized
#              ("http://www.example.com/x" => "Example.com"); nil when no
#              host can be extracted from +url+.
#   :twitter - the twitter:site tag's +content+ (or, failing that, +value+)
#              String; nil when the tag has neither attribute; the empty
#              search result when the page has no twitter:site tag at all
#              (kept as-is for existing callers).
def self.build_site(url, html)
  # Parse once and reuse the document for both lookups.
  doc = Nokogiri::HTML(html)
  site_twitter = doc.search('meta[name="twitter:site"]')
  og_name = doc.search('meta[property="og:site_name"]')

  # A tag without a content attribute is treated like a missing tag.
  name_attr = og_name.attribute('content') unless og_name.empty?
  site_name =
    if name_attr
      name_attr.value
    else
      # Accept any host depth, optional userinfo/port, and URLs without a
      # trailing slash; the capture is nil when the URL does not match.
      host = url.to_s[%r{\A(?:http|ftp)s?://(?:[^/?#@]*@)?([^/?#:@]+)}, 1]
      # Non-bang capitalize: capitalize! returns nil when nothing changes.
      host && host.sub(/^www\./, '').capitalize
    end

  unless site_twitter.empty?
    handle = site_twitter.attribute('content') || site_twitter.attribute('value')
    site_twitter = handle && handle.value
  end

  { name: site_name, twitter: site_twitter }
end
generate_hash(text)
click to toggle source
# File lib/textract.rb, line 125
# Returns the MD5 digest of +text+ as a lowercase hexadecimal String.
def self.generate_hash(text)
  md5 = Digest::MD5.new
  md5.update(text)
  md5.hexdigest
end
get_page_title(html)
click to toggle source
# File lib/textract.rb, line 65
# Returns the text of the <title> element(s) found under <head> in +html+.
#
# html - markup handed to Nokogiri::HTML.
def self.get_page_title(html)
  document = Nokogiri::HTML(html)
  head_nodes = document.search('head')
  title_nodes = head_nodes.search('title')
  title_nodes.text
end
get_text(url, selectors=nil, format="markdown")
click to toggle source
# File lib/textract.rb, line 15
# Builds a Client for the given page, remembers it in @client on the
# receiver, and returns it.
#
# url       - passed through to Client.new.
# selectors - passed through to Client.new (defaults to nil).
# format    - passed through to Client.new (defaults to "markdown").
def self.get_text(url, selectors = nil, format = "markdown")
  new_client = Client.new(url, selectors, format)
  @client = new_client
end
get_twitter(html)
click to toggle source
# File lib/textract.rb, line 77
# Looks up the Twitter handle declared in the page's twitter:creator meta tag.
#
# html - markup handed to Nokogiri::HTML.
#
# Returns the String held in the tag's +content+ attribute, or in its
# +value+ attribute when +content+ is absent; nil when the tag carries
# neither attribute; the empty search result when the page has no
# twitter:creator tag at all (kept as-is for existing callers).
def self.get_twitter(html)
  twitter_meta = Nokogiri::HTML(html).search('meta[name="twitter:creator"]')
  return twitter_meta if twitter_meta.empty?

  # Prefer content, fall back to value; nil (not a NameError) when both are missing.
  handle = twitter_meta.attribute('content') || twitter_meta.attribute('value')
  handle && handle.value
end
smart_extract(html, description, selectors)
click to toggle source
# File lib/textract.rb, line 27
# Picks the part of the page that holds the article and wraps it in a
# Readability::Document.
#
# html        - markup handed to Nokogiri::HTML.
# description - text used to locate the article when the selector search
#               finds nothing; may be nil.
# selectors   - selector passed to the document search; nil means 'article'.
#
# Selection rules:
# * exactly one selector match: that node is used;
# * no selector match and a description is given: search for nodes whose
#   text contains a growing prefix of the description (starting with its
#   first two words) until fewer than two nodes match or the words run out;
#   a single match yields that node's parent;
# * anything else: the whole document is used.
def self.smart_extract(html, description, selectors)
  doc = Nokogiri::HTML(html)
  article = doc.search(selectors.nil? ? 'article' : selectors)

  article_el =
    if article.count == 1
      article[0]
    elsif !description.nil? && article.count == 0
      words = description.split(" ")
      els = [1, 2, 3] # placeholder with more than one entry so the loop starts
      i = 1
      while els.count >= 2 && i <= words.length
        search_text = words[0..i].join(" ")
        # Quote with whichever character the text does not use as an apostrophe.
        quote = search_text.index("'") ? '"' : "'"
        els = doc.search("[text()*=#{quote}#{search_text}#{quote}]")
        i += 1
      end
      # Multiple or no matches fall back to the whole document.
      els.count == 1 ? els[0].parent : doc
    else
      doc
    end

  Readability::Document.new(
    article_el.to_s,
    tags: TAG_WHITELIST,
    attributes: %w[src href],
    remove_empty_nodes: false
  )
end