class TTML
Library to handle TTML
Files
Uses the translator available to do the necessary language operations as defined by the AllFather
Constants
- SUPPORTED_TRANSFORMATIONS
Public Class Methods
new(cc_file, opts=nil)
click to toggle source
# File lib/ttml.rb, line 21
#
# Builds a TTML handler for the given caption file.
#
# cc_file - path of the caption file; checked with is_valid?
# opts    - optional Hash; only the :force_detect key is read here
#
# Raises RuntimeError ("Invalid TTML file provided") when is_valid?
# returns a falsy value.
def initialize(cc_file, opts=nil)
  @cc_file = cc_file
  # Absent opts, or an absent/falsy :force_detect entry, both mean false.
  @force_detect = (opts && opts[:force_detect]) || false
  raise "Invalid TTML file provided" unless is_valid?
end
Public Instance Methods
callsign()
click to toggle source
# File lib/ttml.rb, line 27
#
# Identifier of this caption format: the TYPE_TTML constant.
def callsign
  TYPE_TTML
end
infer_languages()
click to toggle source
# File lib/ttml.rb, line 45
#
# Collects one language per /tt/body/div section of the caption file.
#
# A div's own lang attribute is used unless it is missing or
# @force_detect is set; in those cases the language is inferred by the
# translator from a 100-character text sample of the div.
#
# Any StandardError raised while reading or detecting (including the
# LangDetectionFailureException raised below) is reported with puts and
# not re-raised, so the languages gathered up to that point are returned.
#
# Returns the Array of languages, or nil when none were collected.
def infer_languages
  detected = []
  begin
    xml_file = File.open(@cc_file)
    xml_doc = Nokogiri::XML(xml_file)
    xml_doc.css("/tt/body/div").each_with_index do |div, position|
      # Language declared on the div itself, nil when the attribute is absent
      declared = div.attributes['lang'].value rescue nil
      current = declared
      if @force_detect || declared.nil?
        sample_text = get_text(div, 100)
        current = @translator.infer_language(sample_text) rescue nil
        if current.nil?
          err_msg = "Failed to detect lang for div block number #{position + 1}"
          err_msg += "; Detected languages before failure are #{detected}" unless detected.empty?
          raise AllFather::LangDetectionFailureException.new(err_msg)
        end
      end
      detected << current
    end
  rescue StandardError => e
    puts "Error while detecting the language due to #{e.message}"
  ensure
    xml_file.close rescue nil
  end
  detected.empty? ? nil : detected
end
is_valid?()
click to toggle source
# File lib/ttml.rb, line 31
#
# Tells whether @cc_file names a TTML file, judged only by its ".ttml"
# suffix (case-sensitive).
#
# The whole value is checked, not individual lines of it: a multi-line
# string whose first line merely ends in ".ttml", or a name with a
# trailing newline, is rejected.
#
# Returns true or false; a nil @cc_file yields false.
def is_valid?
  # Do any TTML specific validations here
  # TODO: Check if it's required to do a File read to see if this
  # a well-formed XML. Another is to see if lang is available in each div
  @cc_file.to_s.end_with?(".ttml")
end
set_translator(translator)
click to toggle source
# File lib/ttml.rb, line 41
#
# Stores the translator that the other methods of this class use for
# language inference and translation (they call infer_language and
# translate on it).
#
# Returns the translator that was passed in.
def set_translator(translator)
  @translator = translator
end
supported_transformations()
click to toggle source
# File lib/ttml.rb, line 129
#
# Returns the SUPPORTED_TRANSFORMATIONS constant.
def supported_transformations
  SUPPORTED_TRANSFORMATIONS
end
transform_to(types, src_lang, target_lang, output_dir)
click to toggle source
Calls superclass method
AllFather#transform_to
# File lib/ttml.rb, line 133
#
# Writes the cues of the selected div section(s) of the caption file to
# one output file per requested type, translating when required.
#
# types       - output types; one file is created per type and per div
# src_lang    - language of the div to read; nil/empty lets the method choose
# target_lang - language the output should be in; nil/empty disables
#               translation
# output_dir  - directory that receives the output files
#
# Div selection:
# * src_lang blank, target_lang given: the first div matching target_lang
#   is used as is; when none matches, the first div of the file is
#   translated to target_lang.
# * src_lang blank, target_lang blank: every div is written out.
# * src_lang given: the div matching src_lang is used and is translated
#   when target_lang is given and differs from src_lang.
#
# Raises InvalidInputException when the file has no div to fall back on,
# when src_lang is declared by more than one div, or when no div matches
# src_lang. Raises StandardError when an output file cannot be created.
def transform_to(types, src_lang, target_lang, output_dir)
  # Let's start off with some validations
  super(types, src_lang, target_lang, output_dir)

  # Suffix output dir with File seperator
  output_dir = "#{output_dir}#{File::Separator}" unless output_dir.end_with?(File::Separator)

  begin
    xml_file = File.open(@cc_file, 'r')
    xml_doc = Nokogiri::XML(xml_file)
    div_objects = xml_doc.css("/tt/body/div")
    # Languages as declared in the file (nil where a div has no lang attribute)
    langs = div_objects.map { |div| div.attributes['lang'].value rescue nil }
    translate = false
    matching_divs = []
    inferred_src_lang = nil

    if src_lang.nil? || src_lang.empty?
      if target_lang && !target_lang.empty?
        # Find if any of our div matches this. Else pick first and translate to target lang
        matched_div = div_objects.find { |div| matching_lang?(div, target_lang) }
        if matched_div
          matching_divs << matched_div
        else
          # Let's pick the first div for target translation
          selected_div = div_objects.first
          if selected_div.nil?
            # Without this guard an empty document fails with NoMethodError on nil
            raise InvalidInputException.new("Given Caption file #{@cc_file} doesn't contain any div section")
          end
          inferred_src_lang = selected_div.lang
          matching_divs << selected_div
          translate = true
        end
      else
        # Then we will have to create output files for each lang
        matching_divs = div_objects
      end
    else
      # Find the matching lang div and create the outputs
      available_divs = langs.select { |lang| lang.eql?(src_lang) }
      if available_divs.length > 1
        raise InvalidInputException.new("More than one section in Caption file specifies lang as #{src_lang}. This file is unsupported")
      end
      matched_div = div_objects.find { |div| matching_lang?(div, src_lang) }
      if matched_div.nil?
        raise InvalidInputException.new("Given Caption file #{@cc_file} doesn't contain #{src_lang} lang. Available langs are #{langs}")
      end
      matching_divs << matched_div
      if target_lang && !target_lang.empty? && !src_lang.eql?(target_lang)
        translate = true
      end
    end

    div_index = 1
    multiple_outputs = matching_divs.size > 1
    matching_divs.each do |div|
      div_lang = div.attributes['lang'].value rescue nil
      # Override div lang if translate is required
      div_lang = target_lang if translate
      file_map = {}

      # Prepare the output files for each type and for each lang in the file
      types.each do |type|
        output_file = File.basename(@cc_file, File.extname(@cc_file))
        # Suffix div index when multiple outputs are created
        output_file << "_#{div_index}" if multiple_outputs
        if target_lang.nil? && !src_lang.nil?
          output_file << "_#{src_lang}"
        end
        # Suffix lang to filename if provided
        if target_lang && !target_lang.empty?
          output_file << "_#{target_lang}"
        end
        output_file << extension_from_type(type)
        out_file = "#{output_dir}#{output_file}"
        if create_file(TYPE_TTML, type, out_file, div_lang)
          file_map[type] = out_file
        else
          raise StandardError.new("Failed to create output file for type #{type}")
        end
      end

      blocks = div.css("p")
      cue_index = 1
      total_blocks = blocks.size
      blocks.each_with_index do |block, index|
        start_time = block.attributes['begin'].value
        end_time = block.attributes['end'].value
        text = block.inner_html.strip.gsub(/(\s){2,}/, '')
        # Keep only the textual pieces of the cue; markup pieces are dropped
        message = ""
        text_blocks = get_block_text(text)
        text_blocks.each do |text_block|
          next if text_block.start_with?('<') || text_block.empty?
          message << text_block
        end

        cue_info = CueInfo.new(callsign)
        cue_info.index = cue_index
        cue_index += 1
        cue_info.message = translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
        cue_info.start = start_time
        cue_info.end = end_time
        cue_info.start_time_units = time_details(start_time, callsign)
        cue_info.end_time_units = time_details(end_time, callsign)
        # Last argument flags the final cue of this div
        write_cue(cue_info, file_map, index == (total_blocks - 1))
      end
      div_index += 1
    end
  ensure
    xml_file.close if xml_file
  end
end
translate(src_lang, dest_lang, out_file)
click to toggle source
Calls superclass method
AllFather#translate
# File lib/ttml.rb, line 84
#
# Translates the div section written in src_lang to dest_lang and writes
# the resulting document to out_file.
#
# The lang attribute of each div is ignored for matching: the language of
# every div is inferred from a text sample by the translator, and the
# first div whose inferred language equals src_lang is translated. Markup
# inside each <p> is kept as is; only the text pieces are translated.
#
# Returns out_file.
# Raises AllFather::InvalidInputException when no div is inferred to be
# in src_lang; out_file is removed first in that case.
def translate(src_lang, dest_lang, out_file)
  super(src_lang, dest_lang, out_file)
  # Block form reads the whole file and closes the handle right away,
  # so there is nothing left to close later on.
  xml_content = File.open(@cc_file, 'r:UTF-8', &:read)
  xml_doc = Nokogiri::XML(xml_content)
  div_objects = xml_doc.css("/tt/body/div")

  # Irrespective of what lang the div xml:lang says, infer the lang and then
  # check to see if it matches src_lang
  matched_div = div_objects.find do |div|
    sample_text = get_text(div, 100)
    inferred_lang = @translator.infer_language(sample_text) rescue nil
    !inferred_lang.nil? && inferred_lang.eql?(src_lang)
  end

  if matched_div.nil?
    # force = true: a missing out_file must not mask the error raised below
    FileUtils.remove_file(out_file, true)
    raise AllFather::InvalidInputException.new("Unable to find #{src_lang} language section in TTML")
  end

  # Update the Lang in the Div
  matched_div.lang = dest_lang

  matched_div.css("p").each do |block|
    # Multiple spaces being stripped off
    text = block.inner_html.strip.gsub(/(\s){2,}/, '')
    translated_text = ""
    get_block_text(text).each do |text_block|
      if text_block.start_with?('<') || text_block.empty?
        # Markup (and empty separators) are copied through untouched
        translated_text << text_block
        next
      end
      translated_text << @translator.translate(text_block, src_lang, dest_lang)
    end
    block.inner_html = translated_text
  end

  File.write(out_file, xml_doc)
  out_file
end
Private Instance Methods
get_block_text(text)
click to toggle source
Method to segregate the data from markups, as markups don't need translation. For example, if the cue block is of the form: This is a test caption with <span id="1">a test span </span> within a block, then this method returns
- “This is a test caption with ”, “<span id="1">”, “a test span ”, “</span>”, “ within a block”
-
as we can infer the markups can be retained as is to avoid translation
# File lib/ttml.rb, line 297
#
# Splits a cue's inner HTML into an ordered list of text pieces and
# markup pieces so that markup can be kept out of translation.
#
#   get_block_text('with <span id="1">a test span </span> within')
#   # => ["with ", "<span id=\"1\">", "a test span ", "</span>", " within"]
#
# A markup piece is everything from "<" up to the next ">". An empty
# string appears in the result wherever a tag is not preceded by text
# (for example at the very start, or between two adjacent tags).
# Text that contains a "<" without a closing ">", or a ">" without an
# opening "<", is returned as ordinary text in its original position.
#
# text - String to split
#
# Returns an Array of Strings; [] for an empty string.
def get_block_text(text)
  # The capture group makes split keep the tags as separate elements.
  text.split(/(<[^>]*>)/)
end
get_text(div, num_chars)
click to toggle source
Method to get a minimal amount of key text that excludes any tags or control information for the engine to meaningfully and correctly infer the language being referred to in this TTML
# File lib/ttml.rb, line 336
#
# Builds a short, tag-free text sample from the <p> elements of a div.
#
# For each paragraph, runs of two or more whitespace characters are
# removed and every tag is replaced by a single space. Paragraphs are
# appended one after another until the sample is longer than
# num_chars + 1 characters.
#
# div       - node answering css("p"); each result answers inner_html
# num_chars - maximum length of the returned sample
#
# Returns at most num_chars characters of the collected text.
def get_text(div, num_chars)
  sample = ""
  div.css("p").each do |paragraph|
    # Multiple spaces being stripped off
    squeezed = paragraph.inner_html.strip.gsub(/(\s){2,}/, '')
    # Strip off html tags (if any)
    sample << squeezed.gsub(/(<.*?>)/, ' ')
    break if sample.length > (num_chars + 1)
  end
  sample[0, num_chars]
end
matching_lang?(div, target_lang)
click to toggle source
# File lib/ttml.rb, line 265
#
# Tells whether a div section is in target_lang.
#
# The div's own lang attribute decides when it is present. Otherwise the
# language is inferred by the translator from a 100-character text sample
# and the inferred value is stored on the div (div.lang=) before comparing.
#
# div         - div node to inspect
# target_lang - language to compare against
#
# Returns true or false.
# Raises StandardError when inference is needed but no translator is set.
# Raises LangDetectionFailureException when the translator cannot infer
# a language.
def matching_lang?(div, target_lang)
  lang = div.attributes['lang'].value rescue nil
  return lang.eql?(target_lang) unless lang.nil?

  # Let's infer the lang
  if @translator.nil?
    raise StandardError.new("Cannot infer language as engine options are not provided")
  end
  reference_text = get_text(div, 100)
  inferred_lang = @translator.infer_language(reference_text) rescue nil
  if inferred_lang.nil?
    # The div's position is not known inside this method, so it is not reported
    raise LangDetectionFailureException.new("Failed to infer language for a div block of caption file")
  end
  # Store this lang in the div
  div.lang = inferred_lang
  inferred_lang.eql?(target_lang)
end
translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
click to toggle source
# File lib/ttml.rb, line 250
#
# Returns the cue message, translated when translation is requested.
#
# translate         - when falsy the message is returned untouched
# message           - cue text
# src_lang          - explicit source language; nil/empty means "use
#                     inferred_src_lang"
# inferred_src_lang - fallback source language
# target_lang       - language to translate to
#
# The message is also returned untouched when the effective source
# language equals target_lang.
#
# Raises LangDetectionFailureException when neither src_lang nor
# inferred_src_lang is available.
def translated_msg(translate, message, src_lang, inferred_src_lang, target_lang)
  return message unless translate

  source_given = !(src_lang.nil? || src_lang.empty?)
  if !source_given && inferred_src_lang.nil?
    raise LangDetectionFailureException.new("Unable to deduce source lang for translation")
  end

  effective_src = source_given ? src_lang : inferred_src_lang
  return message if effective_src.eql?(target_lang)

  @translator.translate(message, effective_src, target_lang)
end