class SimplePoParser::Parser
Fast parser built directly on Ruby's powerful StringScanner (strscan).
Important notes about StringScanner#scan:
- scan returns nil if there is no match. Using the regex * (zero or more) quantifier lets scan return an empty string when there is "no match", because the empty string qualifies as a match of the regex (zero times). We make use of this "trick".
- The start-of-line anchor ^ is unnecessary, because scan only matches at the current scan position.
- In Ruby's regexes `.` does not match a newline by default, so a pattern such as `.*` only matches up to the next newline (unless multi-line mode, the /m flag, is explicitly enabled).
Public Instance Methods
parse a single message of the PO format.
@param message a single PO message in String format without leading or trailing whitespace
@return [Hash] parsed PO message information in Hash format
# File lib/simple_po_parser/parser.rb, line 21
# Parses one PO message and returns everything collected from it.
#
# @param message [String] a single PO message; surrounding whitespace is stripped
# @return [Hash] the entries gathered in @result while scanning
# @raise [ParserError] re-raised with the partial result appended to the message
#   and a backtrace reduced to frames from lib/simple_po_parser
def parse(message)
  @result = {}
  @scanner = StringScanner.new(message.strip)
  begin
    lines
  rescue ParserError => pe
    report = [
      "SimplePoParser::ParserError",
      pe.message,
      "\nParseing result before error: '#{@result}'",
      "\nSimplePoParser filtered backtrace: SimplePoParser::ParserError"
    ].join
    # keep only the frames that belong to this library
    own_frames = pe.backtrace.select { |frame| frame =~ /lib\/simple_po_parser/ }
    raise ParserError, report, own_frames.join("\n\tfrom ")
  end
  @result
end
Private Instance Methods
Adds text to the given key in the results. Creates an array if the given key already has a result.
# File lib/simple_po_parser/parser.rb, line 364
# Stores text under key in @result.
#
# The first value for a key is stored as-is; a second value turns the entry
# into a two-element array, and further values are appended to that array.
#
# @param key the result key
# @param text the value to store
def add_result(key, text)
  current = @result[key]
  case current
  when nil, false
    @result[key] = text
  when Array
    current.push(text)
  else
    @result[key] = [current, text]
  end
end
Matches a comment line. Called on lines starting with '#'. Calls `lines` again once the comment line has been parsed.
# File lib/simple_po_parser/parser.rb, line 61
# Dispatches on the character that follows an already-consumed '#'.
#
# Every recognised comment type stores its text via add_result and then hands
# control back to `lines`; an obsolete marker ('#~') switches to `obsoletes`.
#
# @raise [PoSyntaxError] on an unknown comment type, or when an obsolete entry
#   follows previous-comment entries that were not marked obsolete
def comment
  # Byte offset of the '#' consumed by the caller. Rewinding to a remembered
  # offset (instead of a fixed "pos - 2") keeps the error excerpt correct when
  # the following character is multibyte or when the input ends after '#'.
  marker_pos = [@scanner.pos - 1, 0].max
  case @scanner.getch
  when ' '
    skip_whitespace
    add_result(:translator_comment, comment_text)
    lines
  when '.'
    skip_whitespace
    add_result(:extracted_comment, comment_text)
    lines
  when ':'
    skip_whitespace
    add_result(:reference, comment_text)
    lines
  when ','
    skip_whitespace
    add_result(:flag, comment_text)
    lines
  when '|'
    skip_whitespace
    previous_comments
    lines
  when "\n"
    add_result(:translator_comment, "") # empty comment line
    lines
  when '~'
    if @result[:previous_msgctxt] || @result[:previous_msgid] || @result[:previous_msgid_plural]
      raise PoSyntaxError, "Previous comment entries need to be marked obsolete too in obsolete message entries. But already got: #{@result}"
    end
    skip_whitespace
    add_result(:obsolete, comment_text)
    obsoletes
  else
    @scanner.pos = marker_pos
    raise PoSyntaxError, "Unknown comment type #{@scanner.peek(10).inspect}"
  end
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in comment\n" + pe.message, pe.backtrace
end
returns the text of a comment
@return [String] text
# File lib/simple_po_parser/parser.rb, line 330
# Reads the rest of the current line as comment text.
#
# @return [String] the text with trailing whitespace removed
# @raise [PoSyntaxError] if the scanner is neither at a new line nor at eos afterwards
def comment_text
  line = @scanner.scan(/.*/) # everything up to the newline; never nil because of '*'
  line.rstrip! # strip in place
  return line if end_of_line

  raise PoSyntaxError, "Comment text should advance to next line or stop at eos"
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in commtent_text\n" + pe.message, pe.backtrace
end
returns true if the scanner is at beginning of next line or end of string
@return [Boolean] true if scanner at beginning of line or eos
# File lib/simple_po_parser/parser.rb, line 357
# Consumes one newline if present and reports whether the scanner now sits
# at the end of the input or at the beginning of a line.
#
# @return [Boolean] true at eos or beginning of line, false otherwise
def end_of_line
  @scanner.skip(/\n/)
  return true if @scanner.eos?

  @scanner.beginning_of_line?
end
An arbitrary line of a PO message. Can be a comment or a message line. Message parsing always starts by checking for msgctxt, as content is expected in msgctxt -> msgid -> msgid_plural -> msgstr order.
# File lib/simple_po_parser/parser.rb, line 46
# Entry point for one line: a leading '#' goes to `comment`, anything else
# starts message parsing at `msgctxt`.
#
# @raise [ParserError] wraps any PoSyntaxError, which breaks the recursion
def lines
  @scanner.scan(/#/) ? comment : msgctxt
rescue PoSyntaxError => pe
  # throw a normal ParserError to break the recursion
  raise ParserError, "Syntax error in lines\n" + pe.message, pe.backtrace
end
Identifies a message line and returns its text, or raises an error.
@return [String] message_text
# File lib/simple_po_parser/parser.rb, line 285
# Parses one double-quoted message line and returns the text between the quotes.
#
# After the closing quote only blanks may follow before the end of the line.
#
# @return [String] the raw text between the quotes
# @raise [PoSyntaxError] if the opening or closing quote is missing, or if
#   anything other than blanks follows the closing quote
def message_line
  # Remember where this line starts so the error excerpt can be taken from the
  # offending character itself. A fixed "pos - 1" rewind lands inside a
  # multibyte character, and at eos it shows text that was already consumed.
  line_start = @scanner.pos
  if @scanner.getch == '"'
    text = message_text
    unless @scanner.getch == '"'
      err_msg = "The message text '#{text}' must be finished with the double quote character '\"'."
      raise PoSyntaxError, err_msg
    end
    skip_whitespace
    unless end_of_line
      err_msg = "There should be only whitespace until the end of line"
      err_msg += " after the double quote character of a message text."
      raise PoSyntaxError, err_msg
    end
    text
  else
    @scanner.pos = line_start
    err_msg = "A message text needs to start with the double quote character '\"',"
    err_msg += " but this was found: #{@scanner.peek(10).inspect}"
    raise PoSyntaxError, err_msg
  end
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in message_line\n" + pe.message, pe.backtrace
end
parses a multiline message
multiline messages are indicated by an empty content as first line and the next line starting with the double quote character
# File lib/simple_po_parser/parser.rb, line 270
# Collects continuation lines of a multiline message under key.
#
# After skipping blanks, a line that starts with a double quote is parsed
# with message_line, added to the result, and the method recurses for the
# next line. Any other content ends the collection.
#
# @param key the result key the continuation lines belong to
# @raise [PoSyntaxError] if a continuation line is malformed
def message_multiline(key)
  skip_whitespace
  return unless @scanner.check(/"/)

  add_result(key, message_line)
  message_multiline(key)
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in message_multiline with key '#{key}'\n" + pe.message, pe.backtrace
end
returns the text of a message line
@return [String] text
# File lib/simple_po_parser/parser.rb, line 344
# Consumes the message text up to, but not including, the next unescaped
# double quote.
#
# @return [String] the consumed text (an empty string when a quote follows immediately)
def message_text
  # either an escaped backslash / escaped quote, or any character but a quote
  up_to_unescaped_quote = /(\\(\\|")|[^"])*/
  @scanner.scan_until(up_to_unescaped_quote)
end
matches the msgctxt line and will continue to check for msgid afterwards
msgctxt is optional
# File lib/simple_po_parser/parser.rb, line 106
# Parses the optional msgctxt entry and then always continues with msgid.
#
# An empty first line means the context may continue on following lines,
# which are collected by message_multiline.
#
# @raise [PoSyntaxError] if the msgctxt entry or anything after it is malformed
def msgctxt
  if @scanner.scan(/msgctxt/)
    skip_whitespace
    context = message_line
    add_result(:msgctxt, context)
    message_multiline(:msgctxt) if context.empty?
  end
  msgid
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgctxt\n" + pe.message, pe.backtrace
end
Matches the msgid line. Will check for an optional msgid_plural. Will advance to msgstr or msgstr_plural based on msgid_plural.
msgid is required.
# File lib/simple_po_parser/parser.rb, line 124
# Parses the required msgid entry, then continues with msgstr_plural when a
# msgid_plural follows and with msgstr otherwise.
#
# @raise [PoSyntaxError] if the line does not start with msgid or a later
#   part of the message is malformed
def msgid
  unless @scanner.scan(/msgid/)
    complaint = "Message without msgid is not allowed."
    complaint += "The Line started unexpectedly with #{@scanner.peek(10).inspect}."
    raise PoSyntaxError, complaint
  end

  skip_whitespace
  id_text = message_line
  add_result(:msgid, id_text)
  message_multiline(:msgid) if id_text.empty?
  msgid_plural ? msgstr_plural : msgstr
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgid\n" + pe.message, pe.backtrace
end
Matches the msgid_plural line. msgid_plural is optional.
@return [Boolean] true if msgid_plural is present, false otherwise
# File lib/simple_po_parser/parser.rb, line 152
# Parses the optional msgid_plural entry.
#
# An empty first line means the text may continue on following lines,
# which are collected by message_multiline.
#
# @return [Boolean] true if a msgid_plural entry was parsed, false otherwise
# @raise [PoSyntaxError] if the msgid_plural entry is malformed
def msgid_plural
  return false unless @scanner.scan(/msgid_plural/)

  skip_whitespace
  text = message_line
  add_result(:msgid_plural, text)
  message_multiline(:msgid_plural) if text.empty?
  true
rescue PoSyntaxError => pe
  # Name this method in the error trail; it previously reported "msgid",
  # which pointed at the wrong entry.
  raise PoSyntaxError, "Syntax error in msgid_plural\n" + pe.message, pe.backtrace
end
parses the msgstr singular line
msgstr is required in singular translations
# File lib/simple_po_parser/parser.rb, line 171
# Parses the msgstr entry of a singular message; nothing but blanks may
# follow it.
#
# @raise [PoSyntaxError] if msgstr is missing, malformed, or followed by
#   further content
def msgstr
  unless @scanner.scan(/msgstr/)
    raise PoSyntaxError, "Singular message without msgstr is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
  end

  skip_whitespace
  translation = message_line
  add_result(:msgstr, translation)
  message_multiline(:msgstr) if translation.empty?
  skip_whitespace
  return if @scanner.eos?

  raise PoSyntaxError, "Unexpected content after expected message end #{@scanner.peek(10).inspect}"
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgstr\n" + pe.message, pe.backtrace
end
parses the msgstr plural lines
msgstr plural lines are used when there is a msgid_plural. They have the format msgstr[N], where N is an incremental number starting from zero that represents the plural number as specified in the header's “Plural-Forms” entry. Most languages, like English, only have two plural forms (singular and plural), but there are languages with more plurals.
# File lib/simple_po_parser/parser.rb, line 195
# Parses the msgstr[N] entries of a plural message, one per recursion step.
#
# Indices must appear in order starting from zero. Once no further
# msgstr[N] line matches, the scanner has to be at the end of the input.
#
# @param num [Integer] the index expected on the next msgstr[N] line
# @raise [PoSyntaxError] on a missing msgstr[0], an out-of-order index, a
#   malformed entry, or content after the last entry
def msgstr_plural(num = 0)
  plural_key = @scanner.scan(/msgstr\[\d\]/) # matches 'msgstr[0]' to 'msgstr[9]'
  unless plural_key
    if num == 0
      raise PoSyntaxError, "Plural message without msgstr[0] is not allowed. Line started unexpectedly with #{@scanner.peek(10).inspect}."
    end
    return if @scanner.eos?

    raise PoSyntaxError, "End of message was expected, but line started unexpectedly with #{@scanner.peek(10).inspect}"
  end

  # msgstr plurals must come in 0-based index in order
  found_index = plural_key[/\d/].to_i
  raise PoSyntaxError, "Bad 'msgstr[index]' index." unless found_index == num

  skip_whitespace
  translation = message_line
  add_result(plural_key, translation)
  message_multiline(plural_key) if translation.empty?
  msgstr_plural(num + 1)
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in msgstr_plural\n" + pe.message, pe.backtrace
end
parses all obsolete lines. An obsolete message may only contain obsolete lines
# File lib/simple_po_parser/parser.rb, line 313
# Parses the remaining lines of an obsolete message, all of which must start
# with '#~'. Each line's text is added under :obsolete.
#
# @raise [PoSyntaxError] if a non-obsolete line follows before the end of input
def obsoletes
  unless @scanner.scan(/#~/)
    return if @scanner.eos?

    raise PoSyntaxError, "All lines must be obsolete after the first obsolete line, but got #{@scanner.peek(10).inspect}."
  end

  skip_whitespace
  add_result(:obsolete, comment_text)
  obsoletes
end
parses previous comments, which provide additional information on fuzzy matching
Previous comments are:
- #| msgctxt
- #| msgid
- #| msgid_plural
# File lib/simple_po_parser/parser.rb, line 223
# Parses the body of a previous comment ('#|'), which must be a msgctxt,
# msgid or msgid_plural entry, and stores it under the matching :previous_* key.
#
# @raise [PoSyntaxError] if the entry type is unknown or the entry is malformed
def previous_comments
  # next part must be msgctxt, msgid or msgid_plural
  unless @scanner.scan(/msg/)
    raise PoSyntaxError, "Previous comments must start with '#| msg'. #{@scanner.peek(10).inspect} unknown."
  end

  key =
    if @scanner.scan(/id/)
      @scanner.scan(/_plural/) ? :previous_msgid_plural : :previous_msgid
    elsif @scanner.scan(/ctxt/)
      :previous_msgctxt
    else
      raise PoSyntaxError, "Previous comment type #{("msg" + @scanner.peek(10)).inspect} unknown."
    end

  skip_whitespace
  previous_text = message_line
  add_result(key, previous_text)
  previous_multiline(key) if previous_text.empty?
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in previous_comments\n" + pe.message, pe.backtrace
end
parses the multiline messages of the previous comment lines
# File lib/simple_po_parser/parser.rb, line 251
# Collects continuation lines of a multiline previous comment under key,
# recursing until a line no longer looks like '#| "...'.
#
# @param key the :previous_* result key the lines belong to
# @raise [PoSyntaxError] if a continuation line is malformed
def previous_multiline(key)
  # The pattern includes the opening double quote so that only a previous
  # multiline is matched and not some other line type.
  return unless @scanner.scan(/#\|\p{Blank}*"/)

  @scanner.pos -= 1 # step back onto the quote so message_line can be reused
  add_result(key, message_line)
  previous_multiline(key) # go on until no further multiline is hit
rescue PoSyntaxError => pe
  raise PoSyntaxError, "Syntax error in previous_multiline\n" + pe.message, pe.backtrace
end
advances the scanner until the next non whitespace position. Does not match newlines. See WHITESPACE_REGEX constant
# File lib/simple_po_parser/parser.rb, line 350
# Moves the scanner past any run of blank characters at the current position.
# Newlines are not blanks and are left in place.
#
# @return [Integer, nil] number of bytes skipped, or nil if nothing matched
def skip_whitespace
  blanks = /\p{Blank}+/
  @scanner.skip(blanks)
end