class Docx::Document
The Document
class wraps around a docx file and provides methods to interface with it.
# Get a Docx::Document for a docx file in the local directory.
doc = Docx::Document.open("test.docx")

# Print the text of the document.
puts doc.text

# The same thing, using the block form of open.
Docx::Document.open("test.docx") do |document|
  puts document.text
end
Attributes
Public Class Methods
# File lib/docx/document.rb, line 23
#
# Opens the docx archive, reads the first entry matching
# 'word/document*.xml', parses it into @doc and then calls load_styles.
# Yields self when a block is given. The zip handle is closed in the
# ensure clause whether or not a block was passed.
#
# path_or_io:: a String that contains no null byte is handed to
#              Zip::File.open; anything else goes to Zip::File.open_buffer
#              (presumably raw docx data or an IO -- confirm with callers).
# options::    accepted for compatibility; not read by this method.
#
# Raises Errno::ENOENT when the archive has no 'word/document*.xml' entry.
def initialize(path_or_io, options = {})
  @replace = {}

  # Only a String without a null byte is treated as a path.
  @zip = if path_or_io.instance_of?(String) && !/\u0000/.match?(path_or_io)
           Zip::File.open(path_or_io)
         else
           Zip::File.open_buffer(path_or_io)
         end

  document = @zip.glob('word/document*.xml').first
  raise Errno::ENOENT if document.nil?

  @document_xml = document.get_input_stream.read
  @doc = Nokogiri::XML(@document_xml)
  load_styles
  yield(self) if block_given?
ensure
  # @zip is still nil when opening the archive itself raised. Calling
  # close on nil here would raise NoMethodError and hide the real error.
  @zip&.close
end
With no associated block, Docx::Document.open
is a synonym for Docx::Document.new
. If the optional code block is given, it will be passed the opened docx
file as an argument and the Docx::Document
object will automatically be closed when the block terminates. Note that, as implemented, Docx::Document.open delegates to Docx::Document.new and therefore returns the Docx::Document instance itself, not the value of the block
.
# File lib/docx/document.rb, line 56
#
# Convenience constructor: forwards +path+ and the optional block to +new+.
# The return value is whatever +new+ returns.
def self.open(path, &block)
  new(path, &block)
end
Public Instance Methods
# File lib/docx/document.rb, line 64
#
# Returns a Hash mapping bookmark name to the object built by
# parse_bookmark_from for every w:bookmarkStart node in the document.
# When two bookmarks share a name, the later one wins.
def bookmarks
  @doc.xpath('//w:bookmarkStart')
      .map { |node| parse_bookmark_from(node) }
      # '_GoBack' is auto-generated by Office 2010, so it is dropped.
      .reject { |bookmark| bookmark.name == '_GoBack' }
      .each_with_object({}) { |bookmark, by_name| by_name[bookmark.name] = bookmark }
end
Returns the current global document properties; for now these are the default font size and the hyperlink targets
# File lib/docx/document.rb, line 45
#
# Builds a Hash with two keys: :font_size (result of font_size) and
# :hyperlinks (result of hyperlinks). Both are recomputed on every call.
def document_properties
  properties = {}
  properties[:font_size] = font_size
  properties[:hyperlinks] = hyperlinks
  properties
end
Deprecated
Iterates over paragraphs within document
# File lib/docx/document.rb, line 103
#
# Yields each element of +paragraphs+ to the given block, in order.
def each_paragraph
  paragraphs.each do |paragraph|
    yield(paragraph)
  end
end
Some documents have this set, others don't. The value is stored in half-points, so it is divided by 2 to return the size in points.
# File lib/docx/document.rb, line 79
#
# Returns the document-default font size in points, or nil when it cannot
# be determined: no styles loaded, no w:sz element under w:docDefaults, or
# a w:sz element without a 'val' attribute.
#
# The 'val' attribute is converted with to_i and halved using integer
# division, so an odd value is rounded down.
def font_size
  return nil unless @styles

  size_tag = @styles.xpath('//w:docDefaults//w:rPrDefault//w:rPr//w:sz').first
  # Guard the attribute lookup as well: a w:sz node without 'val' would
  # otherwise raise NoMethodError on nil.
  val_attr = size_tag && size_tag.attributes['val']
  val_attr ? val_attr.value.to_i / 2 : nil
end
# File lib/docx/document.rb, line 93
#
# Returns the Relationship nodes of the parsed rels document whose Type
# attribute contains 'hyperlink'.
#
# Returns an empty Array when no rels document has been loaded. @rels is
# only assigned by load_rels, which does not run if loading failed earlier.
# Without this guard, building document_properties would raise
# NoMethodError on nil.
def hyperlink_relationships
  return [] unless @rels

  @rels.xpath("//xmlns:Relationship[contains(@Type,'hyperlink')]")
end
Hyperlink targets are extracted from the document.xml.rels file
# File lib/docx/document.rb, line 87
#
# Returns a Hash mapping each hyperlink relationship's 'Id' attribute value
# to its 'Target' attribute value. A later relationship with the same Id
# overwrites an earlier one.
def hyperlinks
  id_target_pairs = hyperlink_relationships.map do |relationship|
    attrs = relationship.attributes
    [attrs['Id'].value, attrs['Target'].value]
  end
  id_target_pairs.to_h
end
# File lib/docx/document.rb, line 60
#
# Returns an Array with one parse_paragraph_from result for every w:p node
# that is a direct child of w:body. Paragraphs nested deeper are not
# matched by this XPath.
def paragraphs
  paragraph_nodes = @doc.xpath('//w:document//w:body/w:p')
  paragraph_nodes.map { |node| parse_paragraph_from(node) }
end
# File lib/docx/document.rb, line 162
#
# Registers +file_contents+ as the replacement for the archive entry named
# +entry_path+. save and stream write the registered contents instead of the
# entry's original data. Returns +file_contents+.
def replace_entry(entry_path, file_contents)
  @replace.store(entry_path, file_contents)
end
Save document to provided path
# File lib/docx/document.rb, line 121
#
# Writes the document to +path+ as a new zip archive.
#
# Calls update first, so the serialized document is registered as a
# replacement entry. Then every file entry of the source archive is copied,
# using the registered replacement where one exists. Non-file entries are
# skipped. Closes the source zip afterwards.
def save(path)
  update
  Zip::OutputStream.open(path) do |out|
    zip.each do |entry|
      if entry.file?
        out.put_next_entry(entry.name)
        # Prefer replaced contents; fall back to the original entry data.
        out.write(@replace[entry.name] || zip.read(entry.name))
      end
    end
  end
  zip.close
end
Output entire document as a StringIO object
# File lib/docx/document.rb, line 140
#
# Builds the document as an in-memory zip archive and returns the buffer
# produced by Zip::OutputStream.write_buffer, rewound to the start.
#
# Calls update first. File entries are then copied from the source archive,
# with registered replacements taking precedence over original entry data.
def stream
  update
  buffer = Zip::OutputStream.write_buffer do |out|
    zip.each do |entry|
      if entry.file?
        out.put_next_entry(entry.name)
        out.write(@replace[entry.name] || zip.read(entry.name))
      end
    end
  end
  buffer.rewind
  buffer
end
# File lib/docx/document.rb, line 73
#
# Returns an Array with one parse_table_from result for every w:tbl node
# found anywhere below w:body.
def tables
  table_nodes = @doc.xpath('//w:document//w:body//w:tbl')
  table_nodes.map { |node| parse_table_from(node) }
end
Output entire document as a String HTML fragment
# File lib/docx/document.rb, line 114
#
# Returns the to_html output of every paragraph, joined with newlines.
def to_html
  html_fragments = paragraphs.map(&:to_html)
  html_fragments.join("\n")
end
# File lib/docx/document.rb, line 109
#
# Returns the to_s output of every paragraph, joined with newlines.
def to_s
  paragraph_texts = paragraphs.map(&:to_s)
  paragraph_texts.join("\n")
end
Private Instance Methods
# File lib/docx/document.rb, line 177
#
# Reads the first archive entry matching 'word/_rels/document*.xml.rels',
# keeps its raw text in @rels_xml and the parsed document in @rels.
#
# Raises Errno::ENOENT when no such entry exists.
def load_rels
  rels_entry = @zip.glob('word/_rels/document*.xml.rels').first || raise(Errno::ENOENT)
  @rels_xml = rels_entry.get_input_stream.read
  @rels = Nokogiri::XML(@rels_xml)
end
# File lib/docx/document.rb, line 168
#
# Loads 'word/styles.xml' into @styles_xml / @styles and then loads the
# relationships via load_rels.
#
# A missing styles entry is reported with warn and leaves @styles unset. It
# no longer prevents load_rels from running, so hyperlink relationships are
# still available for documents without styles. An Errno::ENOENT raised by
# load_rels is also reported with warn, and nil is returned.
def load_styles
  begin
    @styles_xml = @zip.read('word/styles.xml')
    @styles = Nokogiri::XML(@styles_xml)
  rescue Errno::ENOENT => e
    # Styles are optional; keep going so the rels are still loaded.
    warn e.message
  end

  load_rels
rescue Errno::ENOENT => e
  warn e.message
  nil
end
generate Elements::Bookmark
from bookmark XML node
# File lib/docx/document.rb, line 200
#
# Wraps the bookmark XML node +b_node+ in a new Elements::Bookmark.
def parse_bookmark_from(b_node)
  bookmark_class = Elements::Bookmark
  bookmark_class.new(b_node)
end
generate Elements::Containers::Paragraph
from paragraph XML node
# File lib/docx/document.rb, line 195
#
# Wraps the paragraph XML node +p_node+ in a new
# Elements::Containers::Paragraph, passing along the current
# document_properties. The properties are recomputed on every call.
def parse_paragraph_from(p_node)
  properties = document_properties
  Elements::Containers::Paragraph.new(p_node, properties)
end
# File lib/docx/document.rb, line 204
#
# Wraps the table XML node +t_node+ in a new Elements::Containers::Table.
def parse_table_from(t_node)
  table_class = Elements::Containers::Table
  table_class.new(t_node)
end
# File lib/docx/document.rb, line 190
#
# Serializes the in-memory document and registers the result as the
# replacement for the 'word/document.xml' entry.
#
# NOTE(review): the entry name is fixed, while initialize accepts any entry
# matching 'word/document*.xml' -- confirm the two always agree.
def update
  # save_with: 0 presumably turns off the serializer's default save
  # options -- TODO confirm against the Nokogiri documentation.
  serialized_document = doc.serialize(save_with: 0)
  replace_entry('word/document.xml', serialized_document)
end