class Textract::Client

Attributes

author[R]
html[R]
md5[R]
site[R]
tags[R]
text[R]
title[R]
url[R]

Public Class Methods

new(url, selectors, format)
# File lib/textract.rb, line 139
# Fetches +url+ with Mechanize (using the 'Mac Safari' user-agent alias) and
# fills in the instance state: raw HTML, tags, extracted text, MD5, author,
# site and title.
#
# @param url [String] address of the page to fetch
# @param selectors [Object] passed through unchanged to Textract.smart_extract
# @param format [String] 'markdown' converts the extracted content with
#   ReverseMarkdown; any other value keeps the content exactly as extracted
def initialize(url, selectors, format)
  @url = url
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content
  @tags = Textract.get_og_tags(@html, url)

  # Prefer the URL reported by the tags, but only when it is absolute
  # (http, https, ftp or ftps). \A anchors at the start of the whole string
  # rather than at the start of any line, and to_s keeps a missing tag URL
  # from raising NoMethodError.
  tag_url = @tags.url
  @url = tag_url if tag_url.to_s.match(%r{\A(http|ftp)s?://})

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  content = @article.content
  @text =
    if content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert content, unknown_tags: :bypass
    else
      content
    end

  @md5 = Textract.generate_hash @text
  @author = Textract.build_author @article, @html
  @site = Textract.build_site @url, @html
  @title = @tags.title || Textract.get_page_title(@html)

  # A robots.txt page whose title is merely its entire text gets the URL as
  # its title instead. This must be a comparison (==): an assignment here
  # would always be truthy and would overwrite @title with @text for every
  # robots.txt URL.
  if @url.match(%r{/robots\.txt\z}) && @title == @text
    @title = @url
  end
end

Public Instance Methods

as_json()
# File lib/textract.rb, line 168
# Serializes the hash built by #to_h by calling to_json on it.
#
# @return whatever Hash#to_json returns for that hash
def as_json
  attributes = to_h
  attributes.to_json
end
to_h()
# File lib/textract.rb, line 172
# Builds a Hash snapshot of the instance's @url, @text, @md5, @author,
# @title and @site values, keyed by symbol in that order. Other instance
# variables are not included.
#
# @return [Hash{Symbol => Object}]
def to_h
  fields = [:url, :text, :md5, :author, :title, :site]
  fields.each_with_object({}) do |field, snapshot|
    snapshot[field] = instance_variable_get("@#{field}")
  end
end