class News2Kindle::Generator::NikkeiPaid

Constants

TOP

Public Class Methods

new( tmpdir ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 17
# Loads the Nikkei credentials, creates the 'src' and 'dst' working
# directories under +tmpdir+ and copies the bundled cover image and
# stylesheet into 'dst'.
#
# tmpdir:: path of the scratch directory; 'src' and 'dst' must not exist yet
#          because Dir::mkdir raises when the directory is already there.
def initialize( tmpdir )
        @nikkei_id, @nikkei_pw = auth
        @current_dir = tmpdir

        # src receives cached article pages, dst receives the generated files
        @src_dir = @current_dir + '/src'
        @dst_dir = @current_dir + '/dst'
        [@src_dir, @dst_dir].each {|dir| Dir::mkdir( dir ) }

        # static assets live in the 'resource' directory resolved relative to this file
        resource = Pathname(__FILE__) + '../../../../resource'
        %w(nikkei.jpg nikkei.css).each do |asset|
                FileUtils.cp(resource + asset, @dst_dir)
        end
end

Public Instance Methods

generate(opts) { |"#{dst_dir}/#{basename}.opf"| ... } click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 31
# Scrapes the Nikkei top page and every category page into a table of
# contents, generates the book files through generate_contents and yields
# the path of the resulting OPF file.
#
# opts:: Hash; opts[:now] must respond to +strftime+ and is used as the
#        timestamp embedded in the generated files.
#
# Yields "<dst_dir>/<basename>.opf" after all files have been written.
#
# When both credentials are set the method logs in first. The logout request
# is issued from an +ensure+ clause so the session is closed even when
# scraping, generation or the caller's block raises.
def generate(opts)
        @now = opts[:now]
        @now_str = @now.strftime '%Y-%m-%d %H:%M'

        agent = Mechanize::new
        agent.user_agent_alias = "Windows Chrome"
        # HTTP_PROXY is split on ':' and the pieces are passed positionally to set_proxy
        agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']

        logged_in = false
        if @nikkei_id && @nikkei_pw
                agent.get('https://id.nikkei.com/lounge/li/main/')
                agent.page.form_with(name: 'LA0120Form01') do |form|
                        form['LA0120Form01:LA0120Email'] = @nikkei_id
                        form['LA0120Form01:LA0120Password'] = @nikkei_pw
                        form.click_button
                end
                logged_in = true
        end

        toc = []
        begin
                agent.get(TOP)
                if logged_in
                        # skip the confirmation page if it exists; form_with returns
                        # nil when the page has no such form
                        confirm = agent.page.form_with(name: 'autoPostForm')
                        confirm.click_button if confirm
                end

                #
                # scraping top news
                #
                toc_top = ['TOP NEWS']
                (agent.page / 'div.m-miM11_box h3 a').each do |a|
                        uri = a.attr('href')
                        next if News2Kindle::DupChecker.dup?(uri)
                        toc_top << [canonical(a.text.strip), uri]
                end
                toc << toc_top

                #
                # scraping all categories
                #
                # The node list is taken from the top page before the loop starts;
                # inside the loop agent.page is the category page just fetched.
                (agent.page / 'div.m-miM11_box').each do |genre|
                        headline = genre / 'div.m-headline h3'
                        toc_cat = [headline.text]
                        agent.get((headline / 'a').attr('href'))
                        (agent.page / '#CONTENTS_MAIN h3 a').each do |article|
                                uri = article.attr('href')
                                next unless uri =~ %r|^/article/|
                                next if News2Kindle::DupChecker.dup?(uri)
                                toc_cat << [canonical(article.text), uri]
                        end
                        toc << toc_cat
                end

                generate_contents( toc, agent )
                yield "#{@dst_dir}/#{basename}.opf"
        ensure
                agent.get('https://regist.nikkei.com/ds/etc/accounts/logout') if logged_in
        end
end

Private Instance Methods

auth() click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 95
# Fetches the Nikkei account from Pit's 'news2kindle' entry.
#
# Returns a two-element Array: [nikkei_user, nikkei_pass].
def auth
        require 'pit'
        # presumably the values are the hints Pit shows when a key is missing -- confirm against Pit's docs
        required = {
                nikkei_user: 'your ID of Nikkei.',
                nikkei_pass: 'your Password of Nikkei.',
        }
        account = Pit::get('news2kindle', require: required)
        [account[:nikkei_user], account[:nikkei_pass]]
end
basename() click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 104
# Derives the output file base name from the receiver's class name: the
# namespace is dropped, every capital letter gets a '-' in front of it, the
# leading '-' is removed and the result is downcased
# (e.g. Foo::NikkeiPaid -> "nikkei-paid").
def basename
        class_name = self.class.to_s.sub(/.*:/, '')
        hyphenated = class_name.gsub(/([A-Z])/, '-\\1')
        hyphenated.sub(/^-/, '').downcase
end
canonical( str ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 108
# Returns a copy of +str+ in which every U+FF5E (FULLWIDTH TILDE) is replaced
# by U+301C (WAVE DASH), working around the "WAVE DASH problem".
def canonical( str )
        fullwidth_tilde = /\uFF5E/
        wave_dash = "\u301C"
        str.gsub( fullwidth_tilde, wave_dash )
end
generate_contents( toc, agent ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 311
# Writes the three index files of the book into @dst_dir -- toc.html,
# toc.ncx and "<basename>.opf" -- and, through html_item, one HTML file per
# article.
#
# toc::   Array of categories. Each category is an Array whose String
#         elements are section titles and whose other elements are
#         [title, uri] pairs.
# agent:: passed through to html_item for downloading article pages.
#
# Entries whose uri carries no article id are skipped before anything is
# written, which keeps the NCX playOrder sequence contiguous and avoids
# blank lines in the outputs. Articles for which html_item raises
# IllegalPage are skipped as well. Each article id is listed only once in
# the OPF manifest.
def generate_contents( toc, agent )
        File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
        File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
        File.open( "#{@dst_dir}/#{basename}.opf", 'w:utf-8' ) do |opf|
                first = true
                toc_index = 0
                aids = []
                ncx.puts ncx_header
                opf.puts opf_header
                toc.each do |category|
                        category.each do |article|
                                if article.is_a?( String )
                                        # section title: the first one opens the document,
                                        # later ones close the previous list and start a new page
                                        html.puts( first ?
                                                html_header( 'Table of Contents' ) :
                                                "\t</ul>\n\t<mbp:pagebreak />" )
                                        html.puts "\t<h2>#{article}</h2>"
                                        html.puts "\t<ul>"
                                        first = false
                                        next
                                end

                                title, uri = article
                                aid = uri2aid( uri )
                                next unless aid # not an article link

                                begin
                                        html.puts html_item( title, uri, agent )
                                        # the counter advances only after html_item succeeded
                                        ncx.puts ncx_item( title, uri, toc_index += 1 )
                                        unless aids.include?( aid )
                                                opf.puts opf_item( uri )
                                                aids << aid
                                        end
                                rescue IllegalPage
                                        # html_item already logged the problem; leave the article out
                                end
                        end
                end
                html.puts "\t</ul>"
                html.puts html_footer
                ncx.puts ncx_footer
                opf.puts opf_footer( aids )
        end
        end
        end
end
get_html_item( agent, uri, sub = nil ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 142
# Returns the print-view document of the article +uri+ points to, using
# a file cache under @src_dir.
#
# agent:: object used for the download; +get+ is called with the print-view
#         URL and the result is read from +page.root+.
# uri::   article URI, with or without the "https://www.nikkei.com" prefix.
#         The argument itself is left unmodified.
# sub::   optional suffix appended to the article id in the cache file name.
#
# When "<aid><sub>.html" exists in @src_dir it is parsed with Nokogiri and
# returned. Otherwise the page is downloaded (up to 5 attempts through
# retry_loop), written to that cache file and returned.
#
# Re-raises the download error after logging it when all attempts fail.
def get_html_item( agent, uri, sub = nil )
        # work on a copy: the caller's string may be frozen or reused later
        path = uri.sub( %r|^https://www.nikkei.com|, '' )
        aid = uri2aid( path )
        cache = "#{@src_dir}/#{aid}#{sub}.html"

        if File::exist?( cache ) # loading cache
                return Nokogiri( File.read( cache, encoding: 'utf-8' ) )
        end

        html = nil
        begin
                retry_loop( 5 ) do
                        agent.get("#{TOP}/news/print-article/?ng=#{aid}")
                        html = agent.page.root
                        sleep 1 # pause after each successful fetch
                end
        rescue
                News2Kindle.logger.error "cannot get #{TOP}#{path}."
                raise
        end
        File.open( cache, 'w:utf-8' ) do |f|
                f.write( html.to_html )
        end
        html
end
html_header( title ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 128
                        # Returns the opening part of an HTML document -- doctype, <head>
                        # with the nikkei.css stylesheet link, the opening <body> tag and
                        # an <h1> heading. The document is left open for the caller to
                        # append content.
                        #
                        # title:: plain text used for both <title> and <h1>. It is
                        #         HTML-escaped here so that characters such as '&' or '<'
                        #         in a scraped headline cannot break the markup.
                        def html_header( title )
                                require 'cgi/escape'
                                safe_title = CGI.escapeHTML( title.to_s )
                                <<~HTML
                                        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
                                        <html>
                                        <head>
                                                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>
                                                <title>#{safe_title}</title>
                                                <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link>
                                        </head>
                                        <body>
                                                <h1>#{safe_title}</h1>
                                HTML
                        end
html_item( item, uri, agent ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 204
# Writes "<aid>.html" for one article into @dst_dir and returns the <li>
# line that links to it from the table of contents.
#
# item::  link text for the table of contents. It is HTML-escaped here
#         because it is plain text taken from a scraped page.
# uri::   article URI; '' is returned when it contains no article id.
# agent:: passed to get_html_item for downloads.
#
# The output file holds the header (titled with the first matching
# cmn-article_title heading), the scraped body of the page and the scraped
# bodies of the pages linked from 'div.cmn-article_nation ul li a'
# (sorted, de-duplicated, cached with suffixes 2, 3, ...).
#
# Raises IllegalPage, after removing the partial output file, when a
# NoMethodError occurs while building the page -- e.g. no title heading.
def html_item( item, uri, agent )
        require 'cgi/escape'
        aid = uri2aid( uri )
        return '' unless aid
        html = get_html_item( agent, uri )
        out_file = "#{@dst_dir}/#{aid}.html"

        begin
                File.open( out_file, 'w:utf-8' ) do |f|
                        title_node = (html / 'h1.cmn-article_title, h4.cmn-article_title, h2.cmn-article_title')[0]
                        f.puts canonical( html_header( title_node.text.strip ) )
                        f.puts scrape_html_item( html )
                        links = (html / 'div.cmn-article_nation ul li a').map {|link| link.attr( 'href' ) }
                        links.sort.uniq.each_with_index do |link, index|
                                f.puts scrape_html_item( get_html_item( agent, link, index + 2 ) )
                        end
                        f.puts html_footer
                end

                %Q|\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML( item.to_s )}</a></li>|
        rescue NoMethodError
                News2Kindle.logger.warn "cannot parse page #{aid}, skipped."
                News2Kindle.logger.debug $!
                File.delete( out_file ) if File.exist?( out_file )
                raise IllegalPage.new
        end
end
ncx_header() click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 238
                        # Returns the opening part of the NCX document: XML declaration,
                        # doctype, the <docTitle> carrying @now_str, the opening <navMap>
                        # and the fixed navPoint (playOrder 0) for toc.html. <navMap> and
                        # <ncx> are left open.
                        def ncx_header
                                doc_title = "日経電子版 (#{@now_str})"
                                <<~XML
                                <?xml version="1.0" encoding="UTF-8"?>
                                <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
                                <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
                                <docTitle><text>#{doc_title}</text></docTitle>
                                <navMap>
                                        <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
                                XML
                        end
ncx_item( item, uri, index ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 249
# Returns the NCX <navPoint> line for one article, or '' when +uri+ carries
# no article id.
#
# item::  label text; it is escaped because it is plain text from a scraped
#         page and a bare '&' or '<' would make the NCX ill-formed XML.
# uri::   article URI from which the article id is taken.
# index:: value written to the playOrder attribute.
def ncx_item( item, uri, index )
        aid = uri2aid( uri )
        return '' unless aid
        require 'cgi/escape'
        label = CGI.escapeHTML( item.to_s )
        %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
opf_header() click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 261
                        # Returns the opening part of the OPF package document: the
                        # metadata section (title and description carry @now_str, the
                        # date is @now formatted as dd/mm/YYYY, the cover is nikkei.jpg)
                        # followed by the first three manifest items (toc.ncx,
                        # nikkei.css, toc.html). <manifest> and <package> are left open.
                        def opf_header
                                title = "日経電子版 (#{@now_str})"
                                description = "日経電子版、#{@now_str}生成"
                                date = @now.strftime( '%d/%m/%Y' )
                                <<~XML
                                <?xml version="1.0" encoding="utf-8"?>
                                <package unique-identifier="uid">
                                        <metadata>
                                                <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
                                                        <dc:Title>#{title}</dc:Title>
                                                        <dc:Language>ja-JP</dc:Language>
                                                        <dc:Creator>日本経済新聞社</dc:Creator>
                                                        <dc:Description>#{description}</dc:Description>
                                                        <dc:Date>#{date}</dc:Date>
                                                </dc-metadata>
                                                <x-metadata>
                                                        <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
                                                        <EmbeddedCover>nikkei.jpg</EmbeddedCover>
                                                </x-metadata>
                                        </metadata>
                                        <manifest>
                                                <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
                                                <item id="style" media-type="text/css" href="nikkei.css"></item>
                                                <item id="index" media-type="text/html" href="toc.html"></item>
                                XML
                        end
opf_item( uri ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 285
# Returns the OPF manifest <item> line for the article +uri+ points to, or
# '' when +uri+ carries no article id.
def opf_item( uri )
        article_id = uri2aid( uri )
        return '' unless article_id
        %Q|\t\t<item id="#{article_id}" media-type="text/html" href="#{article_id}.html"></item>|
end
retry_loop( times ) { || ... } click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 112
# Runs the given block and returns its value, re-running it whenever it
# raises a StandardError until it has failed +times+ times; the last error
# is then re-raised. Every failure that is followed by another attempt is
# logged (the error at error level, the attempt count at info level).
def retry_loop( times )
        failures = 0
        begin
                yield
        rescue => error
                failures += 1
                raise unless failures < times
                News2Kindle.logger.error error
                News2Kindle.logger.info "#{failures} retry."
                retry
        end
end
scrape_html_item( html ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 167
# Converts the article body found in +html+ into an HTML fragment.
#
# html:: parsed article document; every 'div.cmn-article_text' element in
#        it is processed, child by child:
#        * <p>     -> a <p> with the trimmed, HTML-escaped text. Paragraphs
#                     containing an 'a.cmnc-continue' link and empty
#                     paragraphs are dropped; 'span.JSID_urlData' is removed.
#        * <table> -> copied verbatim.
#        * <div>   -> every site-relative <img> in it is downloaded into
#                     @dst_dir and emitted with the div's text as caption.
#        Other children are ignored.
#
# Returns the fragment as a String. A failed image download is logged and
# that image is left out; it does not abort the article.
def scrape_html_item( html )
        require 'cgi/escape'
        require 'open-uri'
        result = ''
        (html / 'div.cmn-article_text').each do |div|
                div.children.each do |e|
                        case e.name
                        when 'p'
                                next unless (e / 'a.cmnc-continue').empty?
                                (e / 'span.JSID_urlData').remove
                                para = canonical e.text.strip.sub( /^ /, '' )
                                # the text is plain (entities already decoded), so escape it again
                                result << "\t<p>#{CGI.escapeHTML( para )}</p>" unless para.empty?
                        when 'table'
                                result << e.to_html
                        when 'div'
                                e.css('img').each do |img|
                                        image_url = img['src']
                                        next unless image_url # <img> without a src attribute
                                        next if /^http/ =~ image_url # skip images in other server
                                        next if /^\/\// =~ image_url # skip assets
                                        image_file = File::basename( image_url )
                                        begin
                                                # Kernel#open stopped fetching URLs in Ruby 3.0; URI.open is the open-uri entry point
                                                image = URI.open( "#{TOP}#{image_url.sub( /PN/, 'PB' )}", &:read )
                                                # binary mode keeps the image bytes intact on every platform
                                                File.open( "#{@dst_dir}/#{image_file}", 'wb' ) {|fp| fp.write image }
                                                caption = e.text
                                                result << %Q|\t<div>|
                                                result << %Q|\t\t<img src="#{CGI.escapeHTML( image_file )}">|
                                                result << %Q|\t\t<p>[#{CGI.escapeHTML( caption )}]</p>| unless caption.strip.empty?
                                                result << %Q|\t</div>|
                                        rescue
                                                News2Kindle.logger.debug $!
                                                News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
                                        end
                                end
                        end
                end
        end
        result
end
uri2aid( uri ) click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 307
# Extracts the article id from +uri+: the path segment that directly follows
# the first "/article/" and is terminated by "/". Returns nil when +uri+ has
# no such segment.
def uri2aid( uri )
        uri[%r|/article/([^/]*)/|, 1]
end