module Textract

Constants

TAG_WHITELIST

attr_accessor :client

VERSION

Public Class Methods

build_author(article, html) click to toggle source
# File lib/textract.rb, line 118
# Assembles the author details for an extracted article.
#
# article - object responding to #author; its value is used as the name
#           when truthy.
# html    - page markup, consulted via get_author only when article.author
#           is nil/false, and always via get_twitter.
#
# Returns a Hash with the keys :name and :twitter.
def self.build_author(article, html)
  author_name = article.author
  author_name ||= get_author(html)
  twitter_handle = get_twitter(html)

  { name: author_name, twitter: twitter_handle }
end
build_site(url, html) click to toggle source
# File lib/textract.rb, line 91
# Describes the site a page belongs to.
#
# The name comes from the `content` attribute of the og:site_name meta tag.
# When that tag (or its `content` attribute) is missing, the name is derived
# from the host part of +url+: a leading "www." is dropped and the result is
# capitalized (e.g. "https://www.example.com" => "Example.com"). If no host
# can be recognised in +url+ the name is nil.
#
# The twitter handle comes from the twitter:site meta tag, reading `content`
# first and `value` second; it is nil when the tag carries neither.
#
# url  - page URL (http, https, ftp or ftps).
# html - page markup handed to Nokogiri::HTML.
#
# Returns a Hash with the keys :name and :twitter.
def self.build_site(url, html)
  # Parse once and reuse the document for both lookups.
  doc = Nokogiri::HTML(html)
  twitter_nodes = doc.search('meta[name="twitter:site"]')
  name_nodes = doc.search('meta[property="og:site_name"]')

  name_attr = name_nodes.empty? ? nil : name_nodes.attribute('content')
  if name_attr
    site_name = name_attr.value
  else
    # Host = two or more dot-separated labels, terminated by a path, port,
    # query, fragment or the end of the string (a trailing slash is optional).
    domain_regex = %r{^(?:http|ftp)s?://([\w\-]+(?:\.[\w\-]+)+)(?=[/:?#]|\z)}
    host = url.to_s[domain_regex, 1]
    # Non-bang capitalize: capitalize! returns nil when nothing changes.
    site_name = host && host.sub(/^www\./, '').capitalize
  end

  if twitter_nodes.empty?
    # Kept for backward compatibility: with no twitter:site tag the (empty)
    # search result itself is returned, exactly as before.
    site_twitter = twitter_nodes
  else
    handle = twitter_nodes.attribute('content') || twitter_nodes.attribute('value')
    site_twitter = handle && handle.value
  end

  { name: site_name, twitter: site_twitter }
end
generate_hash(text) click to toggle source
# File lib/textract.rb, line 125
# Returns the MD5 digest of +text+ as a lowercase hexadecimal String.
def self.generate_hash(text)
  digest = Digest::MD5.new
  digest << text
  digest.hexdigest
end
get_author(html) click to toggle source
# File lib/textract.rb, line 69
# Looks up the page author in the document's meta tags.
#
# Searches for meta[name="author"] first and meta[property="author"] second,
# then reads the `content` attribute of whichever search matched.
#
# html - page markup handed to Nokogiri::HTML.
#
# Returns the attribute value, or nil when no author meta tag is found or
# the tag has no `content` attribute.
def self.get_author(html)
  # Parse once; both selectors run against the same document.
  doc = Nokogiri::HTML(html)
  name_meta = doc.search('meta[name="author"]')
  name_meta = doc.search('meta[property="author"]') if name_meta.empty?
  return if name_meta.empty?

  # A meta tag without `content` yields nil here instead of raising.
  content = name_meta.attribute('content')
  content && content.value
end
get_og_tags(html, url) click to toggle source
# File lib/textract.rb, line 19
# Builds an OpenGraph object from +html+; if that raises a StandardError,
# builds one from +url+ instead.
#
# Returns whatever OpenGraph.new returns for the argument that succeeded.
def self.get_og_tags(html, url)
  OpenGraph.new(html)
rescue
  OpenGraph.new(url)
end
get_page_title(html) click to toggle source
# File lib/textract.rb, line 65
# Returns the text of the <title> element(s) found inside the document's
# <head>, as reported by the search result's #text.
#
# html - page markup handed to Nokogiri::HTML.
def self.get_page_title(html)
  doc = Nokogiri::HTML(html)
  head = doc.search('head')
  title = head.search('title')
  title.text
end
get_text(url, selectors=nil, format="markdown") click to toggle source
# File lib/textract.rb, line 15
# Creates a Client for +url+, stores it in @client and returns it.
#
# url       - passed through to Client.new unchanged.
# selectors - passed through to Client.new; defaults to nil.
# format    - passed through to Client.new; defaults to "markdown".
def self.get_text(url, selectors = nil, format = "markdown")
  extractor = Client.new(url, selectors, format)
  @client = extractor
end
get_twitter(html) click to toggle source
# File lib/textract.rb, line 77
# Looks up the author's Twitter handle in the twitter:creator meta tag.
#
# Reads the tag's `content` attribute first and its `value` attribute second.
#
# html - page markup handed to Nokogiri::HTML.
#
# Returns the attribute value, or nil when the tag carries neither attribute.
# When the page has no twitter:creator tag the (empty) search result itself
# is returned, exactly as before, so existing `empty?` checks keep working.
def self.get_twitter(html)
  twitter_meta = Nokogiri::HTML(html).search('meta[name="twitter:creator"]')
  return twitter_meta if twitter_meta.empty?

  handle = twitter_meta.attribute('content') || twitter_meta.attribute('value')
  handle && handle.value
end
smart_extract(html, description, selectors) click to toggle source
# File lib/textract.rb, line 27
# Picks the part of the page most likely to hold the article and wraps it
# in a Readability::Document.
#
# Selection order:
#   1. If searching for +selectors+ (or 'article' when +selectors+ is nil)
#      matches exactly one element, that element is used.
#   2. If it matches nothing and +description+ is given, progressively
#      longer prefixes of the description's words are searched for as
#      element text until fewer than two elements match; a single match
#      selects that element's parent.
#   3. In every other case the whole document is used.
#
# html        - page markup handed to Nokogiri::HTML.
# description - text used for the fallback search in step 2, or nil.
# selectors   - selector(s) passed to the document's #search, or nil.
#
# Returns a Readability::Document built from the chosen markup with
# TAG_WHITELIST tags and the src/href attributes.
def self.smart_extract(html, description, selectors)
  doc = Nokogiri::HTML(html)
  candidates = doc.search(selectors.nil? ? 'article' : selectors)

  root =
    if candidates.count == 1
      candidates[0]
    elsif !description.nil? && candidates.count == 0
      words = description.split(" ")
      matches = []
      # words[0..last] with last starting at 1 means the first probe already
      # uses up to two words; each round adds one more.
      (1..words.length).each do |last|
        phrase = words[0..last].join(" ")
        # Quote the phrase with whichever quote character it does not
        # contain an apostrophe clash with.
        selector = phrase.index("'") ? "[text()*=\"#{phrase}\"]" : "[text()*='#{phrase}']"
        matches = doc.search(selector)
        break if matches.count < 2
      end
      # Zero or several matches: fall back to the whole document.
      matches.count == 1 ? matches[0].parent : doc
    else
      doc
    end

  Readability::Document.new(
    root.to_s,
    tags: TAG_WHITELIST,
    attributes: %w[src href],
    remove_empty_nodes: false
  )
end