class News2Kindle::Generator::InternetWatch

Constants

TOP

Public Class Methods

new( tmpdir ) click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 17
# Sets up the working tree below +tmpdir+: creates the +src+ and +dst+
# sub-directories and copies the bundled cover image and stylesheet
# into +dst+.
#
# tmpdir:: base working directory; '/src' and '/dst' are appended to it with +.
#
# Dir.mkdir raises if either sub-directory already exists.
def initialize( tmpdir )
        @current_dir = tmpdir

        # src holds cached downloads (see get_article), dst the generated output.
        @src_dir = @current_dir + '/src'
        @dst_dir = @current_dir + '/dst'
        [@src_dir, @dst_dir].each { |dir| Dir.mkdir( dir ) }

        # Static assets live in the "resource" directory, resolved relative to this file.
        resource_dir = Pathname( __FILE__ ) + '../../../../resource'
        %w[internet-watch.jpg internet-watch.css].each do |asset|
                FileUtils.cp( resource_dir + asset, @dst_dir )
        end
end

Public Instance Methods

generate(opts) { |"#{@dst_dir}/internet-watch.opf"| ... } click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 30
                        # Downloads the INTERNET Watch RSS feed, renders every new article
                        # into the dst directory and writes the files that describe the book:
                        # toc.html, toc.ncx and internet-watch.opf.
                        #
                        # opts:: Hash; only opts[:now] is read here. It must respond to
                        #        +strftime+ and is used for the book title and date.
                        #
                        # Yields the path of the generated OPF file.
                        #
                        # Articles that cannot be fetched or rendered are logged and left out
                        # of the TOC, NCX and OPF so the book never references a missing file.
                        def generate(opts)
                                now = opts[:now]
                                items = []

                                rdf_file = "https://internet.watch.impress.co.jp/data/rss/1.0/iw/feed.rdf"
                                rdf = retry_loop( 5 ) do
                                        Nokogiri(URI.open(rdf_file, 'r:utf-8', &:read))
                                end
                                (rdf / 'item' ).each do |item|
                                        uri = URI( item.attr( 'rdf:about' ).to_s )
                                        next unless /internet\.watch\.impress\.co\.jp/ =~ uri.host
                                        uri.query = nil # remove query of 'ref=rss'
                                        next if News2Kindle::DupChecker.dup?(uri)

                                        title = (item / 'title').text
                                        date = item.elements.map{|e| e.text if e.name == 'date'}.join
                                        items << OpenStruct::new( :uri => uri, :title => title, :date => date )
                                end
                                items.sort_by!( &:date )

                                now_str = now.strftime( '%Y-%m-%d %H:%M' )

                                #
                                # generating articles in html
                                #
                                # Keep only the articles that were actually written; everything
                                # below (TOC, NCX, OPF) is built from this filtered list.
                                items = items.select do |item|
                                        begin
                                                article = get_article( item.uri )
                                                contents = (article / 'div.mainContents')
                                                (contents / 'img').each do |img|
                                                        org = img.attr('ajax') || img.attr('src')
                                                        next if org.nil? # nothing to download
                                                        next if org =~ /^http/ # skip images on other servers
                                                        begin
                                                                img_file = retry_loop( 5 ) do
                                                                        URI.open( "#{TOP}#{org}", &:read )
                                                                end
                                                                cache = org.gsub( /\//, '_' ).sub( /^_/, '' )
                                                                # binary mode: image bytes must not be newline-translated
                                                                File.open( "#{@dst_dir}/#{cache}", 'wb' ){|img_out| img_out.write img_file}
                                                                img.set_attribute( 'src', cache )
                                                        rescue OpenURI::HTTPError
                                                                News2Kindle.logger.error "skipped an image: #{TOP}#{org}"
                                                        end
                                                end
                                                body = contents.inner_html

                                                # Write only once the body is complete, so a failure
                                                # above cannot leave a truncated article file behind.
                                                File.open( "#{@dst_dir}/#{item_id item.uri}.html", 'w:utf-8' ) do |f|
                                                        f.puts html_header( item.title )
                                                        f.puts body
                                                        f.puts html_footer
                                                end
                                                true
                                        rescue => e
                                                News2Kindle.logger.warn "#{e.class}: #{e}"
                                                News2Kindle.logger.warn "skipped an article: #{item.uri}"
                                                false
                                        end
                                end

                                #
                                # generating TOC in html
                                #
                                File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |f|
                                        f.write html_header( 'Table of Contents' )
                                        if items.empty?
                                                f.puts %Q|<p>本日は記事がありません。</p>|
                                        else
                                                f.puts "<ul>"
                                                items.each do |item|
                                                        # feed titles are plain text; escape &, < and > for markup
                                                        title = item.title.to_s.encode( xml: :text )
                                                        f.puts %Q|\t<li><a href="#{item_id item.uri}.html">#{title}</a></li>|
                                                end
                                                f.puts "</ul>"
                                        end
                                        f.write html_footer
                                end

                                #
                                # generating TOC in ncx
                                #
                                File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |f|
                                        # <<~ strips the indentation so the XML declaration starts the file
                                        f.write <<~XML
                                                <?xml version="1.0" encoding="UTF-8"?>
                                                <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
                                                <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
                                                <docTitle><text>INTERNET Watch (#{now_str})</text></docTitle>
                                                <navMap>
                                                        <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
                                        XML

                                        # playOrder 0 belongs to the TOC entry above, so articles start at 1
                                        items.each.with_index( 1 ) do |item, order|
                                                id = item_id item.uri
                                                title = item.title.to_s.encode( xml: :text )
                                                f.puts %Q|\t\t<navPoint id="#{id}" playOrder="#{order}"><navLabel><text>#{title}</text></navLabel><content src="#{id}.html" /></navPoint>|
                                        end

                                        f.write <<~XML
                                                </navMap>
                                                </ncx>
                                        XML
                                end

                                #
                                # generating OPF
                                #
                                File.open( "#{@dst_dir}/internet-watch.opf", 'w:utf-8' ) do |f|
                                        f.write <<~XML
                                                <?xml version="1.0" encoding="utf-8"?>
                                                <package unique-identifier="uid">
                                                        <metadata>
                                                                <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
                                                                        <dc:Title>INTERNET Watch (#{now_str})</dc:Title>
                                                                        <dc:Language>ja-JP</dc:Language>
                                                                        <dc:Creator>インプレス</dc:Creator>
                                                                        <dc:Description>INTERNET Watch、#{now_str}生成</dc:Description>
                                                                        <dc:Date>#{now.strftime( '%d/%m/%Y' )}</dc:Date>
                                                                </dc-metadata>
                                                                <x-metadata>
                                                                        <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
                                                                        <EmbeddedCover>internet-watch.jpg</EmbeddedCover>
                                                                </x-metadata>
                                                        </metadata>
                                                        <manifest>
                                                                <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
                                                                <item id="style" media-type="text/css" href="internet-watch.css"></item>
                                                                <item id="index" media-type="text/html" href="toc.html"></item>
                                        XML

                                        items.each do |item|
                                                id = item_id item.uri
                                                f.puts %Q|\t\t<item id="#{id}" media-type="text/html" href="#{id}.html"></item>|
                                        end

                                        f.write <<~XML
                                                </manifest>
                                                <spine toc="toc">
                                                        <itemref idref="index" />
                                        XML

                                        items.each do |item|
                                                f.puts %Q|\t<itemref idref="#{item_id item.uri}" />\n|
                                        end

                                        f.write <<~XML
                                                </spine>
                                                <tours></tours>
                                                <guide>
                                                  <reference type="toc" title="Table of Contents" href="toc.html"></reference>
                                                  <reference type="start" title="Table of Contents" href="toc.html"></reference>
                                                </guide>
                                                </package>
                                        XML
                                end

                                yield "#{@dst_dir}/internet-watch.opf"
                        end

Private Instance Methods

get_article( uri ) click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 201
# Returns the article at +uri+ parsed by Nokogiri.
#
# The raw HTML is cached in the src directory under the basename of the
# URI path. The cache file is read when present; otherwise the page is
# downloaded (up to 5 attempts via retry_loop) and stored first.
def get_article( uri )
        cache_path = "#{@src_dir}/#{File.basename( uri.path )}"
        html =
                begin
                        File.read( cache_path )
                rescue Errno::ENOENT
                        # not cached yet: download, then remember it for later calls
                        fetched = retry_loop( 5 ) { URI.open( uri, &:read ) }
                        File.open( cache_path, 'w' ) { |out| out.write fetched }
                        fetched
                end
        # characters that cannot be represented in UTF-8 are replaced with '?'
        Nokogiri( html.encode( 'UTF-8', invalid: :replace, undef: :replace, replace: '?' ) )
end
html_header( title ) click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 215
                        # Returns the opening part of an HTML page (doctype through the
                        # <h1> heading) that uses +title+ for both <title> and <h1>.
                        #
                        # title:: heading text; converted with to_s and XML-escaped, so
                        #         plain-text titles containing &, < or > stay valid markup.
                        def html_header( title )
                                safe_title = title.to_s.encode( xml: :text )
                                # <<~ removes the source indentation so the doctype starts the document
                                <<~HTML
                                        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
                                        <html>
                                        <head>
                                                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
                                                <title>#{safe_title}</title>
                                                <link rel="stylesheet" href="internet-watch.css" type="text/css" media="all"></link>
                                        </head>
                                        <body>
                                                <h1>#{safe_title}</h1>
                                HTML
                        end
item_id( uri ) click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 197
# Returns the identifier used for an article: the last segment of the
# URI path with a trailing ".html" removed.
def item_id( uri )
        path = uri.path
        File.basename( path, '.html' )
end
retry_loop( times ) { || ... } click to toggle source
# File lib/news2kindle/generator/internet-watch.rb, line 180
# Runs the given block and returns its value. When the block raises a
# StandardError it is retried after a one-second pause; once the number
# of failures reaches +times+ the last error is re-raised.
#
# times:: maximum number of attempts before giving up.
def retry_loop( times )
        failures = 0
        begin
                yield
        rescue => error
                failures += 1
                # out of attempts: propagate the error that was just rescued
                raise if failures >= times

                News2Kindle.logger.error error
                News2Kindle.logger.info "#{failures} retry."
                sleep 1
                retry
        end
end