class ArticleTextCleaner

Public Class Methods

convert(page_text) click to toggle source
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 4
# Convenience entry point: builds a cleaner for +page_text+ and returns
# the result of its instance-level #convert.
#
# @param page_text the text handed to the new cleaner instance
# @return whatever the instance-level #convert returns
def self.convert(page_text)
  cleaner = new(page_text)
  cleaner.convert
end
new(page_text) click to toggle source
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 8
# Stores the text to be cleaned; no conversion happens until #convert
# is called.
#
# @param page_text the source text, kept as-is in @page_text
def initialize(page_text)
  @page_text = page_text
end

Public Instance Methods

convert() click to toggle source
# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 12
# Runs the stored page text through PandocRuby (mediawiki -> plain), then
# post-processes the result in place: reference markers are stripped first,
# then single line breaks are turned into spaces.
#
# @return the cleaned text held in @output
def convert
  converter = PandocRuby.new(@page_text, from: :mediawiki, to: :plain)
  @output = converter.convert
  remove_refs
  replace_single_linebreaks
  @output
end
remove_refs() click to toggle source

Refs end up in plaintext as: [12]

# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 20
# Strips bracketed numeric markers such as "[12]" from @output, mutating
# the string in place.
#
# @return [String, nil] @output if anything was removed, nil otherwise
#   (this is how gsub! reports "no change")
def remove_refs
  ref_marker = /\[\d+\]/
  @output.gsub!(ref_marker, '')
end
replace_single_linebreaks() click to toggle source

Linebreaks just for line wrapping appear where spaces should be. Double line breaks happen between paragraphs; leave those in place.

# File lib/wikipedia_twitterbot/article_text_cleaner.rb, line 26
# Replaces every isolated newline in @output with a single space, mutating
# the string in place. A newline that is directly preceded or followed by
# another newline is left untouched, so blank-line paragraph gaps survive.
#
# @return [String, nil] @output if anything was replaced, nil otherwise
def replace_single_linebreaks
  lone_newline = /(?<!\n)\n(?!\n)/
  @output.gsub!(lone_newline, ' ')
end