class SRTParser

Public Class Methods

parse_file(dir)
# File lib/thecatishere_gem.rb, line 3
# Collects text statistics and timing information from a SubRip (.srt) file.
#
# Every line of the file is classified as one of:
#   * a timing line  ("HH:MM:SS,mmm --> HH:MM:SS,mmm"),
#   * a cue index    (a line made of digits only),
#   * a blank line   (empty or whitespace only, so CRLF files work),
#   * subtitle text  (anything else).
# Only subtitle text contributes to the word/symbol/line/sentence counts.
# Caveat: a subtitle whose whole text is a number (e.g. "1984") cannot be
# told apart from a cue index and is skipped.
#
# Sentences are counted heuristically: a sentence opens at an uppercase ASCII
# letter and closes at the next ".", "!" or "?". The open/closed state is
# carried across lines so multi-line sentences are counted once.
#
# dir - path of the subtitle file (a file path, despite the parameter name).
#
# Returns -1 when +dir+ is not an existing regular file with a ".srt"
# extension. Otherwise returns a Hash with these String keys:
#   "number_of_words", "number_of_symbols", "number_of_lines",
#   "average_symbols_per_line", "max_symbols_per_line",
#   "number_of_sentences", "average_symbols_per_sentence",
#   "duration"         - seconds from the first cue's start to the last cue's
#                        end, rounded to an Integer,
#   "average_duration" - that span divided by the number of timing lines.
# Averages are rounded to 2 decimals and are 0.0 when their denominator is 0.
def self.parse_file(dir)
    # File.file? (not File.exist?) so that a directory named "x.srt" yields the
    # usual -1 sentinel instead of an error from File.foreach.
    return -1 unless File.file?(dir) && File.extname(dir) == ".srt"

    word_count = 0
    symbol_count = 0
    line_count = 0
    max_line_symbol = 0
    sentence_count = 0
    cue_count = 0
    sentence_inprogress = false
    first_timing = nil
    last_timing = nil

    File.foreach(dir).each_with_index do |raw, index|
        # Subtitle files are frequently not valid UTF-8; invalid bytes are
        # dropped so they cannot make the regexp matches below fail.
        line = raw.scrub("")
        # A UTF-8 byte-order mark would otherwise hide the first cue index.
        line = line.sub(/\A\uFEFF/, "") if index.zero?
        line = line.strip

        if line =~ /\d\d:\d\d:\d\d,\d\d\d.+/
            first_timing ||= line
            # Also set for the very first timing line, so a file with a
            # single cue still has an end time.
            last_timing = line
            cue_count += 1
            next
        end

        # Skip blank separators and cue indexes. The index pattern is anchored
        # so that text which merely contains a digit is still counted as text.
        next if line.empty? || line =~ /\A\d+\z/

        word_count += line.split.length

        line_symbols = line.scan(/[~!@\#$%^&*()\-{}\[\]|”:><?\/]/).length
        symbol_count += line_symbols
        max_line_symbol = line_symbols if line_symbols > max_line_symbol

        line.each_char do |char|
            case char
            when /[A-Z]/
                sentence_inprogress = true
            when /[.!?]/
                if sentence_inprogress
                    sentence_count += 1
                    sentence_inprogress = false
                end
            end
        end

        line_count += 1
    end

    # Converts one "H:M:S,ms" stamp of a timing line to milliseconds; +pick+ is
    # :first (cue start) or :last (cue end).
    to_ms = lambda do |timing_line, pick|
        h, m, s, ms = timing_line.scan(/(\d+):(\d+):(\d+),(\d+)/).public_send(pick).map(&:to_i)
        ((h * 60 + m) * 60 + s) * 1000 + ms
    end

    duration = 0.0
    if first_timing
        duration = (to_ms.call(last_timing, :last) - to_ms.call(first_timing, :first)) / 1000.0
    end

    # Guarded average: avoids dividing by zero for empty or text-less files.
    ratio = lambda do |numerator, denominator|
        denominator.zero? ? 0.0 : (numerator / denominator.to_f).round(2)
    end

    {
        "number_of_words" => word_count,
        "number_of_symbols" => symbol_count,
        "number_of_lines" => line_count,
        "average_symbols_per_line" => ratio.call(symbol_count, line_count),
        "max_symbols_per_line" => max_line_symbol,
        "number_of_sentences" => sentence_count,
        "average_symbols_per_sentence" => ratio.call(symbol_count, sentence_count),
        "duration" => duration.round,
        "average_duration" => ratio.call(duration, cue_count)
    }
end