class Textract::Client
Attributes
html[R]
md5[R]
site[R]
text[R]
title[R]
url[R]
Public Class Methods
new(url, selectors, format)
click to toggle source
# File lib/textract.rb, line 139
#
# Fetches +url+, extracts the article from the response body and fills in the
# url, html, text, md5, author, site and title instance variables.
#
# @param url [String] address of the page to fetch
# @param selectors [Object] passed unchanged to Textract.smart_extract
#   (NOTE(review): presumably CSS selectors -- confirm against that helper)
# @param format [String] 'markdown' runs the extracted content through
#   ReverseMarkdown; any other value stores the extracted content as-is
def initialize(url, selectors, format)
  @url = url

  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content

  @tags = Textract.get_og_tags(@html, url)
  # Prefer the URL from the extracted tags, but only when it is an absolute
  # http(s)/ftp(s) address.
  @url = @tags.url if @tags.url.match(/^(http|ftp)s?:\/\//)

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  @text =
    if @article.content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert @article.content, unknown_tags: :bypass
    else
      @article.content
    end

  @md5 = Textract.generate_hash @text
  @author = Textract.build_author @article, @html
  @site = Textract.build_site @url, @html
  @title = @tags.title || Textract.get_page_title(@html)

  # For a robots.txt resource whose title is just the body text, use the URL
  # as the title instead. This must be a comparison (==): a plain assignment
  # here would overwrite @title with @text and always take the branch.
  @title = @url if @url.match(/\/robots\.txt$/) && @title == @text
end
Public Instance Methods
as_json()
click to toggle source
# File lib/textract.rb, line 168
#
# Returns the result of calling +to_json+ on the hash produced by #to_h.
def as_json
  payload = to_h
  payload.to_json
end
to_h()
click to toggle source
# File lib/textract.rb, line 172
#
# Builds a Hash of this object's url, text, md5, author, title and site
# instance variables, keyed by symbol in that order.
def to_h
  fields = [:url, :text, :md5, :author, :title, :site]
  fields.each_with_object({}) do |field, result|
    result[field] = instance_variable_get("@#{field}")
  end
end