class News2Kindle::Generator::InternetWatch
Constants
- TOP
Public Class Methods
new( tmpdir )
click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 17
#
# Prepares the working tree under +tmpdir+ and copies the static resources
# into the output directory.
#
# tmpdir:: base working directory; +src+ (download cache used by
#          get_article) and +dst+ (generated files) are created beneath it.
#
# Raises whatever FileUtils raises when a directory cannot be created or a
# resource file is missing.
def initialize( tmpdir )
  @current_dir = tmpdir
  @src_dir = @current_dir + '/src'
  @dst_dir = @current_dir + '/dst'

  # mkdir_p is idempotent: a working directory that already holds src/dst
  # (for example a kept download cache) no longer raises Errno::EEXIST.
  FileUtils.mkdir_p( [@src_dir, @dst_dir] )

  # Cover image and stylesheet live in the "resource" directory, resolved
  # relative to this source file.
  resource = Pathname(__FILE__) + '../../../../resource'
  %w[internet-watch.jpg internet-watch.css].each do |name|
    FileUtils.cp( resource + name, @dst_dir )
  end
end
Public Instance Methods
generate(opts) { |"#{@dst_dir}/internet-watch.opf"| ... }
click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 30
#
# Downloads the INTERNET Watch RSS feed, renders every new article to HTML
# in @dst_dir, writes toc.html, toc.ncx and internet-watch.opf next to them,
# and finally yields the path of the OPF file to the caller's block.
#
# opts:: Hash; opts[:now] must respond to +strftime+ and is used for the
#        book title and the OPF date.
#
# Articles that cannot be fetched or rendered are logged and left out of the
# table of contents, NCX and OPF so that no entry points at a missing or
# half-written file.
def generate(opts)
  now = opts[:now]
  items = []

  rdf_file = "https://internet.watch.impress.co.jp/data/rss/1.0/iw/feed.rdf"
  rdf = retry_loop( 5 ) do
    Nokogiri(URI.open(rdf_file, 'r:utf-8', &:read))
  end
  (rdf / 'item').each do |item|
    uri = URI( item.attr( 'rdf:about' ).to_s )
    next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
    uri.query = nil # remove query of 'ref=rss'
    next if News2Kindle::DupChecker.dup?(uri)

    title = (item / 'title').text
    date = item.elements.map{|e| e.text if e.name == 'date'}.join
    items << OpenStruct::new( :uri => uri, :title => title, :date => date )
  end
  items.sort_by!(&:date)
  now_str = now.strftime( '%Y-%m-%d %H:%M' )

  #
  # generating articles in html
  #
  generated = []
  items.each do |item|
    html_path = "#{@dst_dir}/#{item_id item.uri}.html"
    begin
      article = get_article( item.uri )
      File.open( html_path, 'w:utf-8' ) do |f|
        f.puts html_header( item.title )
        contents = (article / 'div.mainContents')
        (contents / 'img').each do |img|
          org = img.attr('ajax') || img.attr('src')
          # An <img> without any source used to raise NoMethodError below
          # and abort the whole article; just leave such a tag alone.
          next if org.nil? || org.empty?
          next if org.start_with?( 'http' ) # skip images on other servers

          begin
            img_file = retry_loop( 5 ) do
              URI.open( "#{TOP}#{org}", &:read )
            end
            cache = org.gsub( /\//, '_' ).sub( /^_/, '' )
            # Binary mode: image bytes must not go through newline
            # translation on platforms that distinguish text mode.
            File.open( "#{@dst_dir}/#{cache}", 'wb' ){|out| out.write img_file}
            img.set_attribute( 'src', cache )
          rescue OpenURI::HTTPError
            News2Kindle.logger.error "skipped an image: #{TOP}#{org}"
          end
        end
        f.puts contents.inner_html
        f.puts html_footer
      end
      generated << item
    rescue
      News2Kindle.logger.warn "#{$!.class}: #$!"
      News2Kindle.logger.warn "skipped an article: #{item.uri}"
      # Drop a possibly half-written page; the item is not listed below.
      FileUtils.rm_f( html_path )
    end
  end
  items = generated

  #
  # generating TOC in html
  #
  File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |f|
    f.write html_header( 'Table of Contents' )
    if items.empty?
      f.puts %Q|<p>本日は記事がありません。</p>|
    else
      f.puts "<ul>"
      items.each do |item|
        # Feed titles are plain text; escape &, < and > for markup.
        title = item.title.encode( xml: :text )
        f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{title}</a></li>|
      end
      f.puts "</ul>"
    end
    f.write html_footer
  end

  #
  # generating TOC in ncx
  #
  File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |f|
    f.write <<~XML
      <?xml version="1.0" encoding="UTF-8"?>
      <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
      <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
      <docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
      <navMap>
      \t\t<navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
    XML
    items.each_with_index do |item, index|
      title = item.title.encode( xml: :text )
      # The TOC entry already uses playOrder 0, so articles start at 1;
      # previously the first article duplicated the TOC's value.
      # NOTE(review): the NCX spec appears to expect playOrder to start at 1 - confirm before renumbering the TOC entry.
      f.puts %Q|\t\t<navPoint id="#{item_id item.uri}" playOrder="#{index + 1}"><navLabel><text>#{title}</text></navLabel><content src="#{item_id item.uri}.html" /></navPoint>|
    end
    f.write <<~XML
      </navMap>
      </ncx>
    XML
  end

  #
  # generating OPF
  #
  File.open( "#{@dst_dir}/internet-watch.opf", 'w:utf-8' ) do |f|
    f.write <<~XML
      <?xml version="1.0" encoding="utf-8"?>
      <package unique-identifier="uid">
      \t<metadata>
      \t\t<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
      \t\t\t<dc:Title>INTERNET Watch (#{now_str})</dc:Title>
      \t\t\t<dc:Language>ja-JP</dc:Language>
      \t\t\t<dc:Creator>インプレス</dc:Creator>
      \t\t\t<dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
      \t\t\t<dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
      \t\t</dc-metadata>
      \t\t<x-metadata>
      \t\t\t<output encoding="utf-8" content-type="text/x-oeb1-document"></output>
      \t\t\t<EmbeddedCover>internet-watch.jpg</EmbeddedCover>
      \t\t</x-metadata>
      \t</metadata>
      \t<manifest>
      \t\t<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
      \t\t<item id="style" media-type="text/css" href="internet-watch.css"></item>
      \t\t<item id="index" media-type="text/html" href="toc.html"></item>
    XML
    items.each do |item|
      f.puts %Q|\t\t<item id="#{item_id item.uri}" media-type="text/html" href="#{item_id item.uri}.html"></item>|
    end
    f.write <<~XML
      \t</manifest>
      \t<spine toc="toc">
      \t<itemref idref="index" />
    XML
    items.each do |item|
      f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
    end
    f.write <<~XML
      \t</spine>
      \t<tours></tours>
      \t<guide>
      \t\t<reference type="toc" title="Table of Contents" href="toc.html"></reference>
      \t\t<reference type="start" title="Table of Contents" href="toc.html"></reference>
      \t</guide>
      </package>
    XML
  end

  yield "#{@dst_dir}/internet-watch.opf"
end
Private Instance Methods
get_article( uri )
click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 201
#
# Returns the parsed document for +uri+, using a file in @src_dir as a
# download cache.
#
# uri:: URI of the article; the cache file is named after the basename of
#       its path.
#
# On a cache miss the page is downloaded (through retry_loop), converted to
# UTF-8 once and stored. The cache is always read and written as UTF-8, so
# a cached page decodes the same way as a fresh download regardless of
# Encoding.default_external.
#
# Raises whatever the download raises after retry_loop gives up.
def get_article( uri )
  cache = "#{@src_dir}/#{File.basename( uri.path )}"
  html =
    begin
      File.read( cache, encoding: 'utf-8' )
    rescue Errno::ENOENT
      #puts "getting article: #{uri.path}".encode( Encoding::default_external )
      fetched = retry_loop( 5 ) do
        URI.open( uri, &:read )
      end
      fetched = fetched.encode( 'UTF-8', invalid: :replace, undef: :replace, replace: '?' )
      File.open( cache, 'w:utf-8' ){|f| f.write fetched }
      fetched
    end
  # Kept for cache files written by earlier runs, which may hold bytes that
  # are not valid UTF-8.
  Nokogiri( html.encode( 'UTF-8', invalid: :replace, undef: :replace, replace: '?' ) )
end
html_header( title )
click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 215
#
# Returns the opening part of a generated HTML page, up to and including
# the <h1> heading.
#
# title:: plain-text page title. It is escaped (&, <, >) before being placed
#         in <title> and <h1>, so titles taken from the feed cannot break
#         the markup.
def html_header( title )
  safe_title = title.to_s.encode( xml: :text )
  <<~HTML
    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
    <html>
    <head>
    \t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
    \t<title>#{safe_title}</title>
    \t<link rel="stylesheet" href="internet-watch.css" type="text/css" media="all"></link>
    </head>
    <body>
    \t<h1>#{safe_title}</h1>
  HTML
end
item_id( uri )
click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 197
#
# Returns the identifier used for an article: the last component of the
# URI's path with a trailing ".html" removed.
#
# uri:: object responding to +path+.
def item_id( uri )
  article_path = uri.path
  File.basename( article_path, '.html' )
end
retry_loop( times ) { || ... }
click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 180
#
# Runs the given block and returns its value. When the block raises a
# StandardError it is run again after a one-second pause; once it has failed
# +times+ times the last error is re-raised to the caller.
#
# times:: maximum number of failed attempts before giving up.
def retry_loop( times )
  failures = 0
  begin
    yield
  rescue => error
    failures += 1
    # Out of attempts: propagate the error that was just rescued.
    raise if failures >= times

    News2Kindle.logger.error error
    News2Kindle.logger.info "#{failures} retry."
    sleep 1
    retry
  end
end