module CxExtractor

nodoc

configure

Constants

DEFAULTS
TITLE_REGEXP
VERSION

Attributes

options[W]

Public Class Methods

article(html) click to toggle source
# File lib/cx_extractor.rb, line 15
def article(html)
  ctext = get_clean_text(html)
  lines = ctext.split("\n").map(&:strip)
  block_distribution = line_block_distribute(lines)
  content = get_content(lines, block_distribution)
  content = get_content_by_tag(html, content) if explore_parent
  # content.gsub("\n",'') if remove_newline
  content.squeeze.strip
end
configure() { |self| ... } click to toggle source
# File lib/cx_extractor/config.rb, line 23
def configure
  yield self
end
get_contect_block(block_distribution, to_line) click to toggle source
# File lib/cx_extractor.rb, line 48
def get_contect_block(block_distribution, to_line)
  from_line = find_surge(block_distribution, to_line, threshold)
  to_line = find_dive(block_distribution, from_line)
  [from_line, to_line]
end
get_content(lines, block_distribution) click to toggle source
# File lib/cx_extractor.rb, line 30
def get_content(lines, block_distribution)
  from_line = to_line = 0
  content = chart_points = []
  loop do
    from_line, to_line = get_contect_block(block_distribution, to_line)
    content += lines[from_line..to_line]
    break if from_line < 0

    chart_points += [from_line, to_line]
  end
  if chart_distribution && !chart_points.empty?
    chart(block_distribution, chart_points)
  else
    warn 'there is no content for the web page, cannot chart'
  end
  content.join("\n")
end
get_content_by_tag(html, block_content) click to toggle source
# File lib/cx_extractor.rb, line 54
def get_content_by_tag(html, block_content)
  doc =  Nokogiri::HTML(html)
  p_doms = doc.css('p')
  ptext = []
  p_doms.each do |p_dom|
    ptext << p_dom.parent if block_content.include?(p_dom.text)
  end
  max_p = ptext.max_by { |i| ptext.count(i) }
  get_clean_text(max_p.to_s).split("\n").map(&:strip).join(
    "\n"
  ).squeeze
end
get_title(html) click to toggle source
# File lib/cx_extractor.rb, line 25
def get_title(html)
  matcher = TITLE_REGEXP.match(html) || []
  matcher[1]
end
options() click to toggle source
# File lib/cx_extractor/config.rb, line 17
def options
  @options ||= DEFAULTS.dup
end