class Sitetap::Parser
Public Class Methods
new(root_dir)
click to toggle source
# File lib/sitetap/parser.rb, line 9
# Builds a parser bound to a single root directory.
#
# root_dir - the directory stored as @root; the html, tmp, markdown and
#            text directory paths are all built from it.
def initialize(root_dir)
  @root = root_dir
end
parse!(root_dir, selector = nil)
click to toggle source
# File lib/sitetap/parser.rb, line 13
# Convenience entry point: creates a parser for +root_dir+ and immediately
# runs it.
#
# root_dir - directory handed to Sitetap::Parser.new.
# selector - optional CSS selector forwarded to the instance-level parse!.
#
# Returns whatever the instance-level parse! returns.
def self.parse!(root_dir, selector = nil)
  Sitetap::Parser.new(root_dir).parse!(selector)
end
Public Instance Methods
parse!(selector = nil)
click to toggle source
# File lib/sitetap/parser.rb, line 18
# Runs the whole conversion: remembers the selector (when one is given),
# makes sure the output directories exist, then processes every html file.
#
# selector - optional CSS selector; a nil value leaves any previously stored
#            selector untouched.
#
# Returns the parser itself.
def parse!(selector = nil)
  unless selector.nil?
    @selector = selector
  end

  verify_directories
  do_the_loop

  self
end
Private Instance Methods
clean_html(file)
click to toggle source
———————————— Parsing Actions
# File lib/sitetap/parser.rb, line 132
# Reads an HTML file and returns its contents as one line of valid UTF-8.
#
# file - path of the HTML file to read.
#
# Returns a String in which every run of whitespace has become a single
# space, NUL characters are dropped, and curly quotes, the curly apostrophe,
# "é" and the en dash are replaced by ASCII equivalents.
#
# Whitespace used to be collapsed by round-tripping through Array#to_s and
# then removing the inspect artefacts. That also deleted every backslash and
# every `["` / `"]` sequence that belonged to the document, and produced
# "[]" for an empty file. Splitting and re-joining directly avoids all three.
def clean_html(file)
  # encode transcodes content that was read in a non-UTF-8 external encoding.
  # It leaves a string that is already tagged UTF-8 untouched, so scrub is
  # what actually replaces invalid byte sequences in that case.
  utf8 = File.read(file)
             .encode('UTF-8', :invalid => :replace, :undef => :replace)
             .scrub

  single_line = utf8.split(' ').join(' ').delete("\u0000")

  single_line
    .tr('”“', '"')
    .tr('’', "'")
    .tr('é', 'e')
    .tr('–', '-')
end
do_the_loop()
click to toggle source
———————————— The Loop
# File lib/sitetap/parser.rb, line 78
# Converts every scraped html file into three artefacts: a cleaned-up html
# copy under tmp_dir, a markdown file under md_dir and a plain-text file
# under txt_dir. Each keeps the source file's path relative to html_dir.
#
# Returns the array of files that was iterated.
def do_the_loop
  # Matches only the leading "<html_dir>/" of a path. The directory name is
  # escaped so characters such as "." or "+" in it are taken literally, and
  # the pattern is anchored so a later repeat of the same text in the path is
  # left alone.
  html_prefix = /\A#{Regexp.escape(html_dir)}\//

  files.each do |file|
    # path of the file relative to the html (scraped) directory
    file_path = file.sub(html_prefix, '')

    # clean the contents of the html file so we can work with it
    contents = clean_html(file)

    # where the new files will live
    tmp_file_path = "#{tmp_dir}/#{file_path}"
    markdown_file_path = "#{md_dir}/#{file_path}.md"
    text_file_path = "#{txt_dir}/#{file_path}.txt"

    # find or create the directories that will contain those files
    verify_file_directories([
      tmp_file_path,
      markdown_file_path,
      text_file_path
    ])

    # write a temporary html file with the cleaned-up contents
    write_file(tmp_file_path, contents)

    # narrow the html down to the part matched by the selector
    adj_contents = filter_html(tmp_file_path)

    # convert the narrowed html to markdown and write it to file
    write_file(markdown_file_path, html2markdown(adj_contents))

    # last, remove all the tags and write the plain text file
    # NOTE(review): strip_tags is not defined in the part of the file shown
    # here - confirm where it comes from.
    write_file(text_file_path, strip_tags(adj_contents))
  end
end
files()
click to toggle source
# File lib/sitetap/parser.rb, line 53
# Every .html file found at any depth below html_dir. The glob runs once and
# the result is cached in @files.
def files
  @files = Dir.glob("#{html_dir}/**/*.html") unless @files
  @files
end
filter_html(file_path)
click to toggle source
# File lib/sitetap/parser.rb, line 149
# Parses the html file at +file_path+ and keeps only the nodes matched by
# the configured CSS selector.
#
# file_path - path of the html file, read as UTF-8.
#
# Returns the matched nodes serialised to a String.
def filter_html(file_path)
  document = Nokogiri::HTML(File.read(file_path, :encoding => 'UTF-8'))
  document.css(selector).to_s
end
html2markdown(html)
click to toggle source
# File lib/sitetap/parser.rb, line 160
# Converts an html fragment to markdown with ReverseMarkdown, then tidies
# the result.
#
# html - the html String to convert.
#
# Returns the markdown String with spaces directly after a newline removed
# and any run of three or more newlines reduced to two.
def html2markdown(html)
  markdown = ReverseMarkdown.convert(
    html,
    :unknown_tags => :bypass,
    :github_flavored => true
  )

  unindented = markdown.gsub(/\n(\ )+/, "\n")
  unindented.gsub(/\n\n\n+/, "\n\n")
end
html_dir()
click to toggle source
# File lib/sitetap/parser.rb, line 33
# Path of the "html" directory under root; this is the directory that
# `files` globs for .html sources. Built once and cached in @html_dir.
def html_dir
  return @html_dir if @html_dir

  @html_dir = "#{root}/html"
end
md_dir()
click to toggle source
# File lib/sitetap/parser.rb, line 41
# Path of the "markdown" output directory under root. Built once and cached
# in @md_dir.
def md_dir
  return @md_dir if @md_dir

  @md_dir = "#{root}/markdown"
end
mkdir_p(dir)
click to toggle source
———————————— Directories
# File lib/sitetap/parser.rb, line 59
# Makes sure +dir+ exists, creating it and any missing parent directories.
#
# dir - path of the directory to create.
#
# FileUtils.mkdir_p already accepts a directory that exists, so no existence
# check is needed first. The old check used Dir.exists?, which is no longer
# available from Ruby 3.2 on.
def mkdir_p(dir)
  FileUtils.mkdir_p(dir)
end
root()
click to toggle source
———————————— References
# File lib/sitetap/parser.rb, line 29
# The root directory this parser was constructed with (the value held in
# @root).
def root
  @root
end
selector()
click to toggle source
# File lib/sitetap/parser.rb, line 49
# The CSS selector used when filtering html. Falls back to "body" (and
# stores that fallback in @selector) when none has been set.
def selector
  @selector = "body" unless @selector
  @selector
end
tmp_dir()
click to toggle source
# File lib/sitetap/parser.rb, line 37
# Path of the "tmp" directory under root. Built once and cached in @tmp_dir.
def tmp_dir
  return @tmp_dir if @tmp_dir

  @tmp_dir = "#{root}/tmp"
end
txt_dir()
click to toggle source
# File lib/sitetap/parser.rb, line 45
# Path of the "text" output directory under root. Built once and cached in
# @txt_dir.
def txt_dir
  return @txt_dir if @txt_dir

  @txt_dir = "#{root}/text"
end
verify_directories()
click to toggle source
# File lib/sitetap/parser.rb, line 65
# Makes sure the three top-level output directories (tmp, markdown, text)
# exist.
#
# Returns the array of those three directory paths.
def verify_directories
  output_dirs = [tmp_dir, md_dir, txt_dir]
  output_dirs.each do |output_dir|
    mkdir_p(output_dir)
  end
end
verify_file_directories(files)
click to toggle source
# File lib/sitetap/parser.rb, line 69
# Makes sure the parent directory of every given file path exists.
#
# files - array of file paths about to be written.
#
# Returns the +files+ array.
#
# File.dirname is used instead of splitting on "/" by hand: the manual
# version produced an empty string for a bare filename or for a file
# directly under "/", whereas File.dirname yields "." and "/".
def verify_file_directories(files)
  files.each do |file|
    mkdir_p(File.dirname(file))
  end
end
write_file(file_path, content)
click to toggle source
———————————— Writing Files
# File lib/sitetap/parser.rb, line 170
# Writes +content+ to +file_path+, replacing whatever the file held before.
#
# file_path - destination path.
# content   - the data to write.
#
# Returns the number of bytes written.
def write_file(file_path, content)
  File.write(file_path, content)
end