class Sitetap::Parser

Public Class Methods

new(root_dir) click to toggle source
# File lib/sitetap/parser.rb, line 9
# Creates a parser for one scraped-site directory.
#
# root_dir - stored unchanged in @root. It is not validated or expanded here.
def initialize(root_dir)
  @root = root_dir
end
parse!(root_dir, selector = nil) click to toggle source
# File lib/sitetap/parser.rb, line 13
# Convenience entry point: builds a Sitetap::Parser for +root_dir+ and runs it
# immediately.
#
# root_dir - passed straight to Sitetap::Parser.new.
# selector - optional selector forwarded to the instance-level parse!.
#
# Returns whatever the instance-level parse! returns.
def self.parse!(root_dir, selector = nil)
  Sitetap::Parser.new(root_dir).parse!(selector)
end

Public Instance Methods

parse!(selector = nil) click to toggle source
# File lib/sitetap/parser.rb, line 18
# Runs the whole conversion: ensures the output directories exist, then
# processes every file.
#
# selector - optional selector to remember in @selector. Only nil leaves the
#            current value alone; any other value (including false) is stored.
#
# Returns self so calls can be chained.
def parse!(selector = nil)
  unless selector.nil?
    @selector = selector
  end

  verify_directories
  do_the_loop
  self
end

Private Instance Methods

clean_html(file) click to toggle source

———————————— Parsing Actions

# File lib/sitetap/parser.rb, line 132
# Reads +file+ and returns a single-line, normalized copy of its contents.
#
# The text is re-encoded to UTF-8 (invalid/undefined sequences replaced), NUL
# characters are dropped, every run of whitespace is collapsed to one space
# (leading/trailing whitespace removed), all backslashes are removed, and a
# handful of typographic characters are mapped to ASCII.
#
# Whitespace is collapsed with a plain split/join. The previous approach of
# rendering the token array with Array#to_s and stripping the array
# punctuation back out turned an empty file into the literal "[]" and deleted
# every `"]` / `["` sequence that legitimately appeared in the page.
#
# file - path of the file to read.
#
# Returns the cleaned String ("" for an empty or whitespace-only file).
def clean_html(file)
  text = File.read(file).encode('UTF-8', invalid: :replace, undef: :replace)

  collapsed = text.delete("\u0000").split(' ').join(' ')

  # NOTE(review): backslash removal is kept so existing output stays the same
  # for ordinary pages; it may only have existed to undo inspect-escaping.
  collapsed
    .gsub(/\\/, '')
    .gsub(/[”“]/, '"')
    .gsub(/[’]/, "'")
    .gsub(/[é]/, 'e')
    .gsub(/[–]/, '-')
end
do_the_loop() click to toggle source

———————————— The Loop

# File lib/sitetap/parser.rb, line 78
# Processes every file returned by +files+. For each one it writes three
# outputs: a cleaned copy under tmp_dir, a markdown version under md_dir
# (".md" appended) and a plain-text version under txt_dir (".txt" appended),
# each at the same path the file has relative to html_dir.
#
# Returns the array returned by +files+ (the result of each).
def do_the_loop
  # Match html_dir only as a literal leading prefix. Interpolating it raw into
  # an unanchored pattern treated characters such as "+" or "." in the root as
  # regex syntax and could also strip a repeated occurrence deeper in the path.
  html_prefix = /\A#{Regexp.escape(html_dir)}\//

  files.each do |file|
    # path of the file relative to the html (scraped) directory
    file_path = file.sub(html_prefix, '')

    # clean the contents of the html file so we can work with it
    contents = clean_html(file)

    # where the new files will live
    tmp_file_path       = "#{tmp_dir}/#{file_path}"
    markdown_file_path  = "#{md_dir}/#{file_path}.md"
    text_file_path      = "#{txt_dir}/#{file_path}.txt"

    # find or create the directories that will contain the files
    verify_file_directories([
      tmp_file_path,
      markdown_file_path,
      text_file_path
    ])

    # write a temporary html file with the cleaned-up contents
    write_file(tmp_file_path, contents)

    # home in on the selected part of the html and drop the rest
    adj_contents = filter_html(tmp_file_path)

    # convert the adjusted html to markdown and write it to file
    write_file(markdown_file_path, html2markdown(adj_contents))

    # last, remove all the tags and write the plain text file
    write_file(text_file_path, strip_tags(adj_contents))
  end
end
files() click to toggle source
# File lib/sitetap/parser.rb, line 53
# Every ".html" file found anywhere beneath html_dir (recursive glob).
# The list is computed on first use and cached in @files.
def files
  @files ||= Dir["#{html_dir}/**/*.html"]
end
filter_html(file_path) click to toggle source
# File lib/sitetap/parser.rb, line 149
# Reads the file at +file_path+ as UTF-8, parses it with Nokogiri and returns
# the nodes matching +selector+ serialized back to a String.
def filter_html(file_path)
  markup = File.read(file_path, encoding: 'UTF-8')
  Nokogiri::HTML(markup).css(selector).to_s
end
html2markdown(html) click to toggle source
# File lib/sitetap/parser.rb, line 160
# Converts +html+ to markdown with ReverseMarkdown (unknown tags bypassed,
# GitHub-flavored output), then tidies the result: spaces directly after a
# newline are removed and runs of three or more newlines become two.
def html2markdown(html)
  markdown = ReverseMarkdown.convert(html, unknown_tags: :bypass, github_flavored: true)
  unindented = markdown.gsub(/\n(\ )+/, "\n")
  unindented.gsub(/\n\n\n+/, "\n\n")
end
html_dir() click to toggle source
# File lib/sitetap/parser.rb, line 33
# Path of the "html" directory directly under root. Built once and cached in
# @html_dir.
def html_dir
  return @html_dir if @html_dir

  @html_dir = "#{root}/html"
end
md_dir() click to toggle source
# File lib/sitetap/parser.rb, line 41
# Path of the "markdown" directory directly under root. Built once and cached
# in @md_dir.
def md_dir
  return @md_dir if @md_dir

  @md_dir = "#{root}/markdown"
end
mkdir_p(dir) click to toggle source

———————————— Directories

# File lib/sitetap/parser.rb, line 59
# Creates +dir+ (and any missing parent directories) unless it already exists.
#
# Uses Dir.exist?: the Dir.exists? alias was deprecated and then removed in
# Ruby 3.2, where calling it raises NoMethodError.
#
# Returns nil when the directory was already there, otherwise the result of
# FileUtils.mkdir_p.
def mkdir_p(dir)
  FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
end
root() click to toggle source

———————————— References

# File lib/sitetap/parser.rb, line 29
# Reader for @root, the value handed to the constructor.
def root
  @root
end
selector() click to toggle source
# File lib/sitetap/parser.rb, line 49
# The selector used to pick the content out of each page. Falls back to
# "body" (and stores that in @selector) when none has been set.
def selector
  @selector = "body" unless @selector
  @selector
end
strip_tags(html) click to toggle source
# File lib/sitetap/parser.rb, line 155
# Runs +html+ through Sanitize.fragment and then normalizes the spacing of the
# result: spaces directly after a newline are removed, any run of two or more
# spaces becomes a blank line, and runs of three or more newlines become two.
def strip_tags(html)
  plain = Sanitize.fragment(html)
  unindented = plain.gsub(/\n(\ )+/, "\n")
  paragraphs = unindented.gsub(/\ \ +/, "\n\n")
  paragraphs.gsub(/\n\n\n+/, "\n\n")
end
tmp_dir() click to toggle source
# File lib/sitetap/parser.rb, line 37
# Path of the "tmp" directory directly under root. Built once and cached in
# @tmp_dir.
def tmp_dir
  return @tmp_dir if @tmp_dir

  @tmp_dir = "#{root}/tmp"
end
txt_dir() click to toggle source
# File lib/sitetap/parser.rb, line 45
# Path of the "text" directory directly under root. Built once and cached in
# @txt_dir.
def txt_dir
  return @txt_dir if @txt_dir

  @txt_dir = "#{root}/text"
end
verify_directories() click to toggle source
# File lib/sitetap/parser.rb, line 65
# Makes sure the three output directories (tmp, markdown, text) exist by
# passing each one to mkdir_p.
#
# Returns the array of the three directory paths.
def verify_directories
  output_dirs = [tmp_dir, md_dir, txt_dir]
  output_dirs.each do |path|
    mkdir_p(path)
  end
end
verify_file_directories(files) click to toggle source
# File lib/sitetap/parser.rb, line 69
# Ensures the parent directory of every path in +files+ exists by passing it
# to mkdir_p.
#
# File.dirname is used instead of splitting on "/" by hand: the manual split
# produced an empty string for a bare filename ("page.html") or a root-level
# path ("/page.html"), whereas File.dirname yields "." and "/" respectively.
#
# files - array of file paths.
#
# Returns +files+.
def verify_file_directories(files)
  files.each { |file| mkdir_p(File.dirname(file)) }
end
write_file(file_path, content) click to toggle source

———————————— Writing Files

# File lib/sitetap/parser.rb, line 170
# Writes +content+ to +file_path+, creating the file or truncating an existing
# one.
#
# Returns the number of bytes written.
def write_file(file_path, content)
  File.write(file_path, content)
end