class ArticleTextCleaner
Public Class Methods
convert(page_text)
click to toggle source
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 4
# Class-level shortcut: builds a cleaner around +page_text+ and immediately
# returns the result of calling #convert on it.
def self.convert(page_text)
  new(page_text).convert
end
new(page_text)
click to toggle source
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 8
# Stores the text to be cleaned in @page_text; no conversion happens here.
def initialize(page_text)
  @page_text = page_text
end
Public Instance Methods
convert()
click to toggle source
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 12
# Converts @page_text from MediaWiki markup to plain text via PandocRuby,
# then post-processes the result in place (reference markers removed,
# single linebreaks replaced with spaces) and returns it.
def convert
  @output = PandocRuby.new(@page_text, from: :mediawiki, to: :plain).convert
  remove_refs
  replace_single_linebreaks
  # Return @output explicitly: the gsub!-based helpers return nil when they
  # change nothing, so their return values cannot be used as the result.
  @output
end
remove_refs()
click to toggle source
Refs end up in plaintext as: [12]
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 20
# Deletes every bracketed run of digits (e.g. "[12]") from @output, mutating
# the string in place. Brackets containing anything other than digits are
# left untouched. Read @output for the result: String#gsub! returns nil when
# nothing matched.
def remove_refs
  @output.gsub!(/\[\d+\]/, '')
end
replace_single_linebreaks()
click to toggle source
Linebreaks just for line wrapping appear where spaces should be. Double line breaks happen between paragraphs; leave those in place.
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 26
# Replaces each isolated newline in @output with a single space, mutating the
# string in place. The lookbehind/lookahead pair skips any newline that is
# adjacent to another newline, so runs of two or more are kept as they are.
# Read @output for the result: String#gsub! returns nil when nothing matched.
def replace_single_linebreaks
  @output.gsub!(/(?<!\n)\n(?!\n)/, ' ')
end