class YoutubeTranscript2020
Attributes
id[R]
title[R]
to_a[R]
Public Class Methods
new(id=nil, debug: false)
click to toggle source
# File lib/youtube_transcript2020.rb, line 15
#
# Creates a transcript object and, when an id is given, downloads the
# English timed-text captions and the video's oEmbed metadata.
#
# id    - a video id, or a string containing http:// or https://, in which
#         case the id is obtained through YoutubeID.from. When nil nothing
#         is fetched; a transcript can be loaded afterwards with #import.
# debug - when true, the other methods print diagnostic output.
def initialize(id=nil, debug: false)
  # Assigned before the early return so that an object created without an
  # id (and populated later through #import) still honours debug: true.
  @debug = debug
  return unless id

  @id = id[/https?:\/\//] ? YoutubeID.from(id) : id

  s = Net::HTTP.get(URI("http://video.google.com/timedtext?lang=en&v=#{@id}"))

  # An empty response body leaves @s unset.
  @s = parse(s) unless s.empty?

  fetch_info(@id)
end
Public Instance Methods
import(obj)
click to toggle source
reads a plain text transcript which has been modified to include headings
# File lib/youtube_transcript2020.rb, line 48
#
# Loads a plain text transcript.
#
# obj - whatever RXFHelper.read accepts; the first element of its result
#       is used as the transcript text.
#
# Two layouts are recognised:
#
# * header + body, separated by a run of dashes (the layout produced by
#   #to_s). The header is read with SimpleConfig and supplies the id,
#   author and title; the body is stored as-is in @s.
# * a raw transcript with no dashed separator. The timestamp/text pairs are
#   passed through join_sentences and the result is stored in @s.
#
# In both cases @a is set to an Array of [timestamp, text] pairs.
#
# NOTE(review): any line containing digits:digits is treated as a timestamp
# line, so a text line such as "see you at 10:30" is misfiled - confirm
# whether the pattern can be anchored to the start of the line.
def import(obj)
  s = RXFHelper.read(obj).first

  if s =~ /------+/
    header, body = s.split(/-----+/, 2)

    h = SimpleConfig.new(header).to_h
    @id, @author, @title = h[:id], h[:author], h[:title]
    @s = body
    raw_transcript = false
  else
    # Use the text that was read, not obj itself: the header branch above
    # already treats s as the content, and obj may be a locator rather than
    # the transcript.
    body = s
    raw_transcript = true
  end

  puts 'body: ' + body[0..400] if @debug

  # Blank lines must be dropped first. The saved layout separates every
  # timestamp and paragraph with empty lines, and they would otherwise be
  # counted as text lines and shift every pairing made by zip.
  lines = body.lines.map(&:chomp).reject { |x| x.strip.empty? }
  timestamps, text = lines.partition { |x| x =~ /\d+:\d+/ }

  @a = timestamps.zip(text)
  @s = join_sentences(@a) if raw_transcript
end
to_headings()
click to toggle source
Outputs plain text containing the headings, including timestamps. Note: this can be helpful for copying and pasting directly into a YouTube comment.
# File lib/youtube_transcript2020.rb, line 119
#
# Returns an Array of the timestamp strings that carry a heading, i.e.
# those containing a space (e.g. "0:01 Introduction").
#
# @to_a is only assigned by parse, so after #import it is nil; in that case
# the pairs held in @a are used instead. With no transcript loaded at all
# an empty Array is returned rather than raising NoMethodError.
def to_headings
  rows = @to_a || @a || []
  rows.select { |timestamp, _| timestamp =~ / / }.map(&:first)
end
to_html()
click to toggle source
Outputs HTML containing the embedded video and transcription
# File lib/youtube_transcript2020.rb, line 73
#
# Builds a standalone HTML page: the embedded video player and title on the
# left, and on the right a scrollable list with one link per transcript
# entry. Each link targets the player frame (named 'video') and starts
# playback at that entry's timestamp.
#
# Reads @id, @a ([timestamp, text] pairs), @title and @html_embed.
# Returns the page as a String.
def to_html
  # Loaded here because the file's require list is not part of this
  # method; only CGI.escapeHTML is used.
  require 'cgi'

  url = 'https://www.youtube.com/embed/' + @id

  links = @a.map do |timestamp, s|
    # "m:ss" / "h:mm:ss" -> number of seconds for the start= parameter
    seconds = Subunit.new({minutes: 60, hours: 60},
                          timestamp.split(':').map(&:to_i)).to_i

    # Caption text and headings come from outside this program, so they are
    # escaped before being placed in markup.
    format("<li><a href='%s?start=%s&autoplay=1' target='video'>%s</a><p>%s</p></li> ",
           url, seconds, CGI.escapeHTML(timestamp.to_s), CGI.escapeHTML(s.to_s))
  end

  puts '@html_embed: ' + @html_embed.inspect if @debug

  # Name the embed element so the links above can target it.
  doc = Rexle.new(@html_embed.to_s)
  puts 'before attributes' if @debug
  doc.root.attributes[:name] = 'video'
  embed = doc.xml(declaration: false)
  puts 'embed: ' + embed.inspect if @debug

  <<~HTML
    <!DOCTYPE html>
    <html lang="en">
      <head>
        <title></title>
        <meta charset="utf-8" />
      </head>
      <body>
        <div style="width: 1080px; background: white">
          <div style="float:left; width: 580px; background: white">
            #{embed}
            <h1>#{CGI.escapeHTML(@title.to_s)}</h1>
          </div>
          <div style="float:right; width: 500px; overflow-y: scroll; height: 400px">
            <ul>#{links.join("\n")}</ul>
          </div>
        </div>
      </body>
    </html>
  HTML
end
to_keywords(level: 2)
click to toggle source
Returns a Hash object containing the frequency of each word. level: 2 ignores common words, including stop words; level: 3 ignores dictionary words.
# File lib/youtube_transcript2020.rb, line 129
#
# Runs the transcript text (see #to_text) through Yawc and returns the
# result of its to_h.
#
# level - forwarded unchanged to Yawc.new (default 2).
def to_keywords(level: 2)
  transcript_text = to_text
  word_counter = Yawc.new(transcript_text, level: level)
  word_counter.to_h
end
to_s()
click to toggle source
returns the transcript in plain text including timestamps
# File lib/youtube_transcript2020.rb, line 36
#
# Returns the transcript as plain text: a SimpleConfig header holding the
# id, title and author, a rule of 78 dashes, then the timestamped body
# held in @s. This is the header/body layout that #import splits on.
#
# When no transcript has been loaded (@s is nil) the body is empty instead
# of raising TypeError.
def to_s
  h = {id: @id, title: @title, author: @author}
  SimpleConfig.new(h).to_s + "\n#{'-' * 78}\n\n" + @s.to_s
end
to_text()
click to toggle source
# File lib/youtube_transcript2020.rb, line 42
#
# Returns the transcript text without timestamps: the text part of every
# [timestamp, text] pair in @a, one per line.
#
# Returns an empty String when no transcript has been loaded (@a is nil)
# instead of raising NoMethodError.
def to_text
  (@a || []).map(&:last).join("\n")
end
Private Instance Methods
fetch_info(id)
click to toggle source
# File lib/youtube_transcript2020.rb, line 135
#
# Requests the video's oEmbed record as XML and stores three of its fields:
# @title (title), @author (author_name) and @html_embed (html, unescaped).
#
# id - the video id placed in the watch URL sent to the oEmbed endpoint.
def fetch_info(id)
  endpoint = URI("http://www.youtube.com/oembed?url=http://www.youtube.com/watch?v=#{id}&format=xml")
  root = Rexle.new(Net::HTTP.get(endpoint)).root

  @title      = root.text('title')
  @author     = root.text('author_name')
  @html_embed = root.text('html').unescape

  return unless @debug

  puts '@html_embed: ' + @html_embed.inspect
end
join_sentences(a)
click to toggle source
# File lib/youtube_transcript2020.rb, line 149
#
# Merges caption fragments into longer paragraphs.
#
# a - Array of [timestamp, text] pairs. It is not modified.
#
# How each fragment is handled:
#
# * starts with a lowercase letter, a digit or the word "I": a
#   continuation; glued onto the previous entry unless that entry already
#   ends with a full stop
# * starts with "And", "But" or "So": the conjunction is dropped, the
#   remainder gets a capital first letter and is glued onto the previous
#   entry
# * starts with "Or" or a double quote: glued on unchanged
# * starts with [Music] or [Applause]: dropped; the next fragment is given
#   a capital first letter so that it is not read as a continuation
# * anything else: glued on if the previous entry has no closing full
#   stop, otherwise it begins a new entry under its own timestamp
#
# Stores the merged pairs in @a and returns them as text, each timestamp
# on its own line above its paragraph.
def join_sentences(a)
  if @debug
    puts 'inside join_sentence'
    puts 'a: ' + a.take(3).inspect
  end

  a2 = []
  promote_next = false

  a.each do |time, s|
    next if s.nil? # a timestamp with no matching text line

    # A flag is used rather than rewriting the next pair of `a`, so the
    # caller's array keeps the raw captions.
    if promote_next
      s = upcase_first(s)
      promote_next = false
    end

    puts 's: ' + s.inspect if @debug

    case s
    when /\A(?:[a-z0-9]|I\b|I')/
      # Every alternative is anchored: a fragment that merely contains the
      # word "I" somewhere must not be classified as a continuation.
      if a2.empty?
        a2 << [time, upcase_first(s)]
      elsif a2[-1][-1] =~ /\.\s*\z/
        # only join two parts together if there was no full stop in the
        # previous line
        a2 << [time, s]
      else
        append_fragment(a2, time, s)
      end
    when /\A(?:And|But|So),? /
      remainder = s.sub(/\A(?:And|But|So),? /, '')
      append_fragment(a2, time, upcase_first(remainder))
    when /\AOr,? /, /\A"/
      append_fragment(a2, time, s)
    when /\A\[(?:Music|Applause)\]/i
      puts 'ignoring action commentary' if @debug
      # Placeholder that closes the previous paragraph; it is removed by
      # the reject! below.
      a2 << [time, '.']
      promote_next = true
    else
      if a2.any? && a2[-1][-1] !~ /\.\s*\z/
        append_fragment(a2, time, s)
      else
        a2 << [time, s]
      end
    end
  end

  # Remove the placeholder entries left by [Music] etc.
  a2.reject! { |_time, text| text.length < 2 }

  @a = a2

  # formats the paragraph with the timestamp appearing above
  a2.map { |time, text| "\n%s\n\n%s" % [time, text] }.join("\n")
end

# Glues text onto the last entry of entries. When there is no previous
# entry, or the previous entry is only the '.' placeholder, the text
# becomes a new entry under its own timestamp instead.
def append_fragment(entries, time, text)
  if entries.empty? || entries[-1][-1] == '.'
    entries << [time, text]
  else
    entries[-1][-1] = entries[-1][-1].chomp + ' ' + text
  end
end

# Upper-cases only the first character. String#capitalize is not used
# because it also lower-cases the rest of the string, which damages "I",
# names and acronyms.
def upcase_first(text)
  text.empty? ? text : text[0].upcase + text[1..-1]
end
parse(s)
click to toggle source
# File lib/youtube_transcript2020.rb, line 223
#
# Converts the timed-text XML into [timestamp, text] pairs.
#
# s - the XML document as a String. Each child element of the root supplies
#     one caption: its start attribute (seconds) becomes the timestamp and
#     its text becomes the caption.
#
# Stores the raw pairs in @to_a and returns the result of join_sentences.
def parse(s)
  doc = Rexle.new(s)

  a = doc.root.elements.each.map do |x|
    timestamp = Subunit.new({minutes: 60, hours: 60},
                            seconds: x.attributes[:start].to_f).to_s(verbose: false)

    # NOTE(review): the two entity literals below were decoded by the HTML
    # page this listing was taken from (the quote patterns had collapsed
    # into bare quote characters). They are restored as the numeric
    # apostrophe entity and the named quote entity, which is what the
    # replacement characters imply - confirm against the original file.
    text = x.text.to_s.unescape   # to_s: an element without text yields ''
            .gsub("\n", ' ')
            .gsub('&#39;', "'")
            .gsub('&quot;', '"')

    [timestamp, text]
  end

  @to_a = a

  # join_sentences receives copies of the pairs so that anything it
  # rewrites does not show up in the raw list kept in @to_a.
  join_sentences(a.map(&:dup))
end