class DocxGfmConverter

Attributes

content[RW]
options[RW]

Public Class Methods

new(options) click to toggle source
# File lib/docx2gfm/docx_gfm_converter.rb, line 4
# Builds a converter around the given options.
#
# options - kept as-is in @options. The process_* methods of this class read
#           the :file, :ref_style_links and :jekyll entries from it.
def initialize(options)
  @options = options
end

Public Instance Methods

add_frontmatter() click to toggle source
# File lib/docx2gfm/docx_gfm_converter.rb, line 87
# Prepends the front-matter template to the document.
#
# The template is read from assets/front-matter.md, located relative to the
# directory of this source file, and is joined to the current @content with a
# single newline.
#
# File.read is used so the file handle is closed as soon as the template has
# been read (the previous Kernel#open call never closed it). A nil @content is
# treated as an empty document instead of raising a TypeError.
#
# Returns the new content (front matter + "\n" + previous content).
def add_frontmatter
  asset_file = File.join(File.dirname(__FILE__), 'assets', 'front-matter.md')
  front_matter = File.read(asset_file)
  @content = "#{front_matter}\n#{@content}"
end
cleanup_content_gfm() click to toggle source

this removes all sorts of strange stuff that pandoc generates when converting a .docx exported from Google Docs into GFM

# File lib/docx2gfm/docx_gfm_converter.rb, line 43
# Tidies up the GFM text held in @content (see the per-rule notes below).
#
# Every [pattern, replacement] pair is applied to the whole text with gsub,
# in the order listed; the order matters because the later list rules work on
# the output of the earlier ones.
#
# Returns the cleaned content, which is also stored back into @content.
def cleanup_content_gfm
  rewrites = [
    # Drop the backslash in front of exclamation marks.
    [/\\!/, '!'],

    # Link texts do not need explicit underlining; the markdown renderer
    # styles anchors itself.
    # [<span class="underline">In mattis lectus</span>](https://spier.hu)
    #   => [In mattis lectus](https://spier.hu)
    [/\[<span class="underline">(.*?)<\/span>\]/m, '[\1]'],

    # Underlined regular text (not anchors): the span is stripped and only
    # the inner text is kept. No warning is printed here.
    # <span class="underline">Cras ac lectus quis</span> => Cras ac lectus quis
    [/<span class="underline">(.*?)<\/span>/m, '\1'],

    # Unordered lists: remove the "> " that follows the bullet ...
    [/^(\s*)- > /, '\1- '],
    # ... and replace a "> " at the start of a line by two spaces.
    [/^(\s*)> /, '\1  '],

    # Ordered lists: remove the "> " that follows the number.
    [/^(\d+\.)  > /, '\1  '],

    # Remove the `<!-- end list -->` markers.
    # See http://pandoc.org/MANUAL.html => "Ending a list"
    [/<!-- end list -->/, '']
  ]

  @content = rewrites.inject(@content) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end
cleanup_content_markdown() click to toggle source
# File lib/docx2gfm/docx_gfm_converter.rb, line 68
# Tidies up the markdown text held in @content.
#
# Steps, in order: strip underline spans from link texts, strip underline
# spans from any other text (printing a warning to STDERR for each one),
# collapse the spacing after list markers to a single space, and remove the
# leading spaces in front of reference-link definitions.
#
# Returns the cleaned content, which is also stored back into @content.
def cleanup_content_markdown
  text = @content

  # Links: keep the link text, drop the underline span around it.
  text = text.gsub(/\[<span class="underline">(.*?)<\/span>\]/m, '[\1]')

  # Any remaining underline span: keep the inner text and warn about it.
  text = text.gsub(/<span class="underline">(.*?)<\/span>/m) do
    inner = Regexp.last_match(1)
    STDERR.puts "Underline is not supported in markdown. Removing underlining from '#{inner}'."
    inner
  end

  # Lists: remove unnecessary spacing between the marker and the item text.
  #   "1.  Numbered lists are great" => "1. Numbered lists are great"
  #   "-   And even more bullets"    => "- And even more bullets"
  text = text.gsub(/^(\s*)(-|\d+\.)\s+(\S)/, '\1\2 \3')

  # Reference links: remove the spaces in front of "[label]:".
  text = text.gsub(/^ +(\[.+?\]:)/, '\1')

  @content = text
end
docx_2_gfm(file) click to toggle source

convert docx to initial markdown

# File lib/docx2gfm/docx_gfm_converter.rb, line 29
# Converts a .docx file to GFM by running pandoc and stores pandoc's standard
# output in @content.
#
# pandoc is started with an argument list rather than an interpolated shell
# command, so a path containing spaces or shell metacharacters is handed to
# pandoc as one literal argument and is never interpreted by a shell.
#
# file - path of the .docx file; converted with #to_s, as the former string
#        interpolation did.
#
# Returns the text pandoc wrote to standard output.
def docx_2_gfm(file)
  # TODO before reading the file, I could check if the file exists
  # TODO check out pandoc options that might be useful e.g. --extract-media='/images/own/'
  command = ['pandoc', file.to_s, '-f', 'docx', '-t', 'gfm', '--wrap=none']
  @content = IO.popen(command, &:read)
end
docx_2_markdown(file) click to toggle source
# File lib/docx2gfm/docx_gfm_converter.rb, line 35
# Converts a .docx file to pandoc markdown (with the bracketed_spans,
# link_attributes, smart and simple_tables extensions switched off) by running
# pandoc, and stores pandoc's standard output in @content.
#
# pandoc is started with an argument list rather than an interpolated shell
# command, so a path containing spaces or shell metacharacters is handed to
# pandoc as one literal argument and is never interpreted by a shell.
#
# file - path of the .docx file; converted with #to_s, as the former string
#        interpolation did.
#
# Returns the text pandoc wrote to standard output.
def docx_2_markdown(file)
  # TODO before reading the file, I could check if the file exists
  # TODO check out pandoc options that might be useful e.g. --extract-media='/images/own/'
  command = [
    'pandoc', file.to_s,
    '--wrap=none', '--atx-headers',
    '-f', 'docx',
    '-t', 'markdown-bracketed_spans-link_attributes-smart-simple_tables',
    '-s'
  ]
  @content = IO.popen(command, &:read)
end
process_gfm() click to toggle source

perform all conversion and cleanup steps

# File lib/docx2gfm/docx_gfm_converter.rb, line 9
# Runs the complete GFM pipeline for the file named in @options[:file]:
# convert with pandoc, clean up the result, then apply the optional steps
# selected by the :ref_style_links and :jekyll options.
def process_gfm
  source_file = @options[:file]
  docx_2_gfm(source_file)
  cleanup_content_gfm

  if @options[:ref_style_links]
    create_ref_style_links
  end

  # Last expression: the front-matter result when :jekyll is set, nil otherwise.
  if @options[:jekyll]
    add_frontmatter
  end
end
process_markdown() click to toggle source
# File lib/docx2gfm/docx_gfm_converter.rb, line 16
# Runs the complete markdown pipeline for the file named in @options[:file]:
# convert with pandoc, clean up the result, then apply the optional steps
# selected by the :ref_style_links and :jekyll options.
def process_markdown
  source_file = @options[:file]
  docx_2_markdown(source_file)
  cleanup_content_markdown

  if @options[:ref_style_links]
    create_ref_style_links
  end

  # Last expression: the front-matter result when :jekyll is set, nil otherwise.
  if @options[:jekyll]
    add_frontmatter
  end
end
to_s() click to toggle source

output this document (i.e. the markdown content)

# File lib/docx2gfm/docx_gfm_converter.rb, line 24
# Outputs this document, i.e. the markdown held in @content.
#
# Ruby expects #to_s to return a String. @content is nil until one of the
# conversion methods has run, so it is coerced here; an unprocessed converter
# renders as "" instead of returning nil.
#
# Returns the content as a String.
def to_s
  @content.to_s
end