class News2Kindle::Generator::NikkeiPaid
Constants
- TOP
Public Class Methods
new( tmpdir )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 17
#
# Prepares the working tree for one generation run.
#
# Reads the Nikkei credentials through +auth+, creates the +src+ and +dst+
# sub-directories below +tmpdir+ and copies the cover image and stylesheet
# from the bundled resource directory into +dst+.
#
# tmpdir:: directory that will hold the +src+ and +dst+ sub-directories.
#
# Raises whatever Dir.mkdir raises when a sub-directory cannot be created
# (for example because it already exists).
def initialize( tmpdir )
  @nikkei_id, @nikkei_pw = auth

  @current_dir = tmpdir
  @src_dir = @current_dir + '/src'
  @dst_dir = @current_dir + '/dst'
  [@src_dir, @dst_dir].each {|dir| Dir.mkdir( dir ) }

  # The resource directory is resolved relative to this source file.
  resource_dir = Pathname( __FILE__ ) + '../../../../resource'
  %w[nikkei.jpg nikkei.css].each do |asset|
    FileUtils.cp( resource_dir + asset, @dst_dir )
  end
end
Public Instance Methods
generate(opts) { |"#{dst_dir}/#{basename}.opf"| ... }
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 31
#
# Scrapes the Nikkei top page and every category page into a table of
# contents, writes the e-book sources through +generate_contents+ and yields
# the path of the generated OPF file.
#
# When both credentials returned by +auth+ are present the agent logs in
# first and logs out again afterwards. The logout runs in an +ensure+ clause
# so the session is closed even when scraping, generation or the caller's
# block raises.
#
# opts:: Hash; <tt>opts[:now]</tt> is the timestamp used for titles and must
#        respond to +strftime+.
#
# Yields the String <tt>"#{@dst_dir}/#{basename}.opf"</tt>.
#
# The return value is not meaningful.
def generate(opts)
  @now = opts[:now]
  @now_str = @now.strftime '%Y-%m-%d %H:%M'

  agent = Mechanize::new
  agent.user_agent_alias = "Windows Chrome"

  proxy = ENV['HTTP_PROXY']
  if proxy && !proxy.empty?
    # Accept the plain "host:port" form as well as the conventional URL form
    # ("http://host:port/"): the scheme and a trailing slash are dropped
    # before splitting, otherwise the scheme would be taken for the host.
    agent.set_proxy( *proxy.sub( %r|\A[a-z][a-z0-9+.-]*://|i, '' ).chomp( '/' ).split( /:/ ) )
  end

  logged_in = @nikkei_id && @nikkei_pw
  if logged_in
    agent.get('https://id.nikkei.com/lounge/li/main/')
    agent.page.form_with(name: 'LA0120Form01') do |form|
      form['LA0120Form01:LA0120Email'] = @nikkei_id
      form['LA0120Form01:LA0120Password'] = @nikkei_pw
      form.click_button
    end
    begin # skip the confirmation page if exist
      agent.get(TOP)
      agent.page.form_with(name: 'autoPostForm').click_button
    rescue NoMethodError
      # form_with returned nil: there was no confirmation page to skip
    end
  else
    agent.get(TOP)
  end

  completed = false
  begin
    toc = []

    #
    # scraping top news
    #
    toc_top = ['TOP NEWS']
    (agent.page / 'div.m-miM11_box h3 a').each do |a|
      uri = a.attr('href')
      next if News2Kindle::DupChecker.dup?(uri)
      toc_top << [canonical(a.text.strip), uri]
    end
    toc << toc_top

    #
    # scraping all categories
    #
    # The node set is taken from the top page before the loop starts;
    # agent.page changes on every agent.get inside the loop.
    (agent.page / 'div.m-miM11_box').each do |genre|
      headline = genre / 'div.m-headline h3'
      toc_cat = [headline.text]
      agent.get((headline / 'a').attr('href'))
      (agent.page / '#CONTENTS_MAIN h3 a').each do |article|
        uri = article.attr('href')
        next unless uri =~ %r|^/article/|
        next if News2Kindle::DupChecker.dup?(uri)
        toc_cat << [canonical(article.text), uri]
      end
      toc << toc_cat
    end

    generate_contents( toc, agent )
    yield "#{@dst_dir}/#{basename}.opf"
    completed = true
  ensure
    if logged_in
      begin
        agent.get('https://regist.nikkei.com/ds/etc/accounts/logout')
      rescue
        # On the success path a logout failure is still reported to the
        # caller; on the failure path it is only logged so that it does not
        # replace the exception that is already propagating.
        raise if completed
        News2Kindle.logger.warn "cannot log out from Nikkei: #{$!}"
      end
    end
  end
end
Private Instance Methods
auth()
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 95
#
# Fetches the Nikkei credentials from the 'news2kindle' Pit entry.
#
# The +pit+ library is loaded on first use. The +require+ hash passed to
# Pit.get names the two keys this method reads.
#
# Returns a two-element Array: [nikkei_user, nikkei_pass].
def auth
  require 'pit'

  required_keys = {
    nikkei_user: 'your ID of Nikkei.',
    nikkei_pass: 'your Password of Nikkei.',
  }
  credentials = Pit.get( 'news2kindle', require: required_keys )
  [credentials[:nikkei_user], credentials[:nikkei_pass]]
end
basename()
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 104
#
# Derives the output file base name from the receiver's class name: the
# namespace is dropped and the CamelCase name is converted to lower-case
# words joined by hyphens (e.g. "NikkeiPaid" becomes "nikkei-paid").
#
# Returns a String.
def basename
  # Everything up to and including the last ':' is the namespace.
  short_name = self.class.to_s.rpartition( ':' ).last
  hyphenated = short_name.gsub( /[A-Z]/ ) {|upper| "-#{upper}" }
  hyphenated.sub( /\A-/, '' ).downcase
end
canonical( str )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 108
#
# Works around the WAVE DASH problem: every FULLWIDTH TILDE (U+FF5E) in
# +str+ is replaced by WAVE DASH (U+301C).
#
# str:: the String to normalise; it is not modified.
#
# Returns a new String.
def canonical( str )
  str.tr( "\uFF5E", "\u301C" )
end
generate_contents( toc, agent )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 311
#
# Writes the three index files of the e-book into @dst_dir: toc.html, toc.ncx
# and "#{basename}.opf". The article pages themselves are written by
# +html_item+ as a side effect.
#
# toc::   Array of categories; each category is an Array whose String
#         elements are headings and whose other elements are
#         [title, uri] pairs.
# agent:: the agent handed on to +html_item+ for downloading articles.
#
# Articles for which +html_item+ raises IllegalPage are skipped.
def generate_contents( toc, agent )
  File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
    File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
      File.open( "#{@dst_dir}/#{basename}.opf", 'w:utf-8' ) do |opf|
        ncx.puts ncx_header
        opf.puts opf_header

        heading_seen = false
        play_order = 0
        listed_aids = []

        toc.each do |category|
          category.each do |entry|
            case entry
            when String
              # The first heading opens the document; later ones close the
              # previous list and start a new page.
              html.puts heading_seen ? "\t</ul>\n\t<mbp:pagebreak />" : html_header( 'Table of Contents' )
              html.puts "\t<h2>#{entry}</h2>"
              html.puts "\t<ul>"
              heading_seen = true
            else
              title, uri = entry
              begin
                html.puts html_item( title, uri, agent )
                play_order += 1
                ncx.puts ncx_item( title, uri, play_order )

                # Each article id goes into the OPF manifest only once.
                aid = uri2aid( uri )
                next if listed_aids.include?( aid )
                opf.puts opf_item( uri )
                listed_aids << aid if aid
              rescue IllegalPage
                # unparsable article: leave it out of all three files
              end
            end
          end
        end

        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer( listed_aids )
      end
    end
  end
end
get_html_item( agent, uri, sub = nil )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 142
#
# Returns the parsed print page of one article, using a file cache below
# @src_dir.
#
# agent:: agent used to download the page on a cache miss.
# uri::   article URI; a leading "https://www.nikkei.com" is ignored. The
#         String passed in is NOT modified (it may be frozen or shared with
#         the table of contents).
# sub::   optional suffix appended to the article id in the cache file name.
#
# On a cache miss the print page is fetched with up to 5 attempts
# (+retry_loop+) and then stored in the cache.
#
# Returns the parsed document. Re-raises the last download error after
# logging it.
def get_html_item( agent, uri, sub = nil )
  # Work on a copy instead of sub!-ing the caller's string.
  path = uri.sub( %r|\Ahttps://www\.nikkei\.com|, '' )
  aid = uri2aid( path )
  cache_file = "#{@src_dir}/#{aid}#{sub}.html"

  if File.exist?( cache_file ) # loading cache
    return Nokogiri( File.open( cache_file, 'r:utf-8', &:read ) )
  end

  html = nil
  begin
    retry_loop( 5 ) do
      agent.get( "#{TOP}/news/print-article/?ng=#{aid}" )
      html = agent.page.root
      sleep 1 # pause between requests
    end
  rescue
    News2Kindle.logger.error "cannot get #{TOP}#{path}."
    raise
  end

  File.open( cache_file, 'w:utf-8' ) do |f|
    f.write( html.to_html )
  end
  html
end
html_header( title )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 128 def html_header( title ) <<~HTML <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> <html> <head> <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta> <title>#{title}</title> <link rel="stylesheet" href="nikkei.css" type="text/css" media="all"></link> </head> <body> <h1>#{title}</h1> HTML end
html_item( item, uri, agent )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 204
#
# Writes the page "#{aid}.html" for one article into @dst_dir and returns the
# table-of-contents list item that links to it.
#
# item::  link text for the table of contents.
# uri::   article URI; its article id (+uri2aid+) names the output file.
# agent:: agent handed on to +get_html_item+.
#
# Returns '' when +uri+ carries no article id, otherwise the <li> markup.
#
# Raises IllegalPage when a NoMethodError occurs while the page is being
# written (for example when no title element is found); the partially
# written file is deleted first.
def html_item( item, uri, agent )
  aid = uri2aid( uri )
  return '' unless aid

  page = get_html_item( agent, uri )
  out_file = "#{@dst_dir}/#{aid}.html"
  begin
    open( out_file, 'w:utf-8' ) do |f|
      titles = page / 'h1.cmn-article_title, h4.cmn-article_title, h2.cmn-article_title'
      f.puts canonical( html_header( titles[0].text.strip ) )
      f.puts scrape_html_item( page )

      # Linked pages are appended in sorted order; their cache suffix
      # starts at 2.
      linked = (page / 'div.cmn-article_nation ul li a').map {|a| a.attr( 'href' ) }
      linked.sort.uniq.each_with_index do |link, index|
        f.puts scrape_html_item( get_html_item( agent, link, index + 2 ) )
      end

      f.puts html_footer
    end
    %Q|\t\t<li><a href="#{aid}.html">#{item}</a></li>|
  rescue NoMethodError => e
    News2Kindle.logger.warn "cannot parse page #{aid}, skipped."
    News2Kindle.logger.debug e
    File.delete out_file
    raise IllegalPage
  end
end
ncx_header()
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 238
#
# Builds the opening part of the NCX navigation file: XML declaration,
# doctype, the document title (which includes @now_str) and the start of
# <navMap> with a first navPoint pointing at toc.html.
#
# Returns a String. The <navMap> and <ncx> elements are left open.
#
# NOTE(review): the line breaks and indentation inside the heredoc were
# reconstructed; confirm the whitespace against the original file.
def ncx_header
  <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
    <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <docTitle><text>日経電子版 (#{@now_str})</text></docTitle>
    <navMap>
      <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint>
  XML
end
ncx_item( item, uri, index )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 249
#
# Builds the NCX <navPoint> entry for one article.
#
# item::  article title used as the navigation label. It is scraped text, so
#         the XML special characters &, < and > are escaped here; a raw
#         title such as "M&A" would otherwise make the NCX file malformed.
# uri::   article URI; its article id (+uri2aid+) becomes the navPoint id
#         and the target file name.
# index:: value of the playOrder attribute.
#
# Returns the navPoint markup, or '' when +uri+ carries no article id.
def ncx_item( item, uri, index )
  aid = uri2aid( uri )
  return '' unless aid

  label = item.to_s.gsub( /[&<>]/, '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' )
  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
opf_header()
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 261
#
# Builds the opening part of the OPF package file: the metadata section
# (title and description include @now_str, the date is @now formatted as
# dd/mm/yyyy, the cover is nikkei.jpg) and the start of <manifest> with the
# entries for toc.ncx, nikkei.css and toc.html.
#
# Returns a String. The <manifest> and <package> elements are left open.
#
# NOTE(review): the line breaks and indentation inside the heredoc were
# reconstructed; confirm the whitespace against the original file.
def opf_header
  <<~XML
    <?xml version="1.0" encoding="utf-8"?>
    <package unique-identifier="uid">
      <metadata>
        <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/">
          <dc:Title>日経電子版 (#{@now_str})</dc:Title>
          <dc:Language>ja-JP</dc:Language>
          <dc:Creator>日本経済新聞社</dc:Creator>
          <dc:Description>日経電子版、#{@now_str}生成</dc:Description>
          <dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date>
        </dc-metadata>
        <x-metadata>
          <output encoding="utf-8" content-type="text/x-oeb1-document"></output>
          <EmbeddedCover>nikkei.jpg</EmbeddedCover>
        </x-metadata>
      </metadata>
      <manifest>
        <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item>
        <item id="style" media-type="text/css" href="nikkei.css"></item>
        <item id="index" media-type="text/html" href="toc.html"></item>
  XML
end
opf_item( uri )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 285
#
# Builds the OPF manifest <item> entry for one article page.
#
# uri:: article URI; its article id (+uri2aid+) becomes the item id and the
#       target file name.
#
# Returns the item markup, or '' when +uri+ carries no article id.
def opf_item( uri )
  aid = uri2aid( uri )
  return '' unless aid

  %Q|\t\t<item id="#{aid}" media-type="text/html" href="#{aid}.html"></item>|
end
retry_loop( times ) { || ... }
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 112
#
# Runs the given block, running it again whenever it raises a StandardError,
# until it succeeds or has failed +times+ times.
#
# times:: number of failures after which the error is re-raised. The block
#         always runs at least once.
#
# Every failure that is going to be retried is logged (the error itself at
# error level, the retry count at info level).
#
# Returns the value of the block. Re-raises the block's last error once the
# failure count reaches +times+.
def retry_loop( times )
  failures = 0
  begin
    yield
  rescue => e
    failures += 1
    raise unless failures < times

    News2Kindle.logger.error e
    News2Kindle.logger.info "#{failures} retry."
    retry
  end
end
scrape_html_item( html )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 167
#
# Converts the article body of a parsed page into the markup embedded in the
# e-book page.
#
# html:: parsed document; every 'div.cmn-article_text' element in it is
#        processed child by child:
#        * <p>     - text only, skipped when it holds an 'a.cmnc-continue'
#                    link; 'span.JSID_urlData' spans are removed first.
#        * <table> - copied verbatim.
#        * <div>   - every site-relative <img> in it is downloaded into
#                    @dst_dir and referenced, followed by the div's text as
#                    a caption.
#        Other children are ignored.
#
# Returns the generated markup as a String. Image download failures are
# logged and the image is left out; they never abort the article.
def scrape_html_item( html )
  result = +''
  (html / 'div.cmn-article_text').each do |div|
    div.children.each do |e|
      case e.name
      when 'p'
        next unless (e / 'a.cmnc-continue').empty?
        (e / 'span.JSID_urlData').remove
        # NOTE(review): the space in the original pattern may have been an
        # ideographic space (U+3000); both kinds are accepted here.
        para = canonical e.text.strip.sub( /^[ \u3000]/, '' )
        result << "\t<p>#{para}</p>" unless para.empty?
      when 'table'
        result << e.to_html
      when 'div'
        e.css( 'img' ).each do |img|
          image_url = img['src']
          # An <img> without src would make File.basename raise outside the
          # rescue below and abort the whole article.
          next if image_url.nil? || image_url.empty?
          next if /^http/ =~ image_url # skip images in other server
          next if /^\/\// =~ image_url # skip assets
          image_file = File.basename( image_url )
          begin
            image_src = "#{TOP}#{image_url.sub( /PN/, 'PB' )}"
            # Kernel#open no longer opens URLs on Ruby 3.0+; URI.open is the
            # open-uri entry point there. Fall back to Kernel#open where
            # URI.open does not exist.
            image = if URI.respond_to?( :open )
              URI.open( image_src, &:read )
            else
              open( image_src, &:read )
            end
            # Binary mode keeps the image bytes intact on every platform.
            File.open( "#{@dst_dir}/#{image_file}", 'wb' ) {|fp| fp.write image }
            result << %Q|\t<div>|
            result << %Q|\t\t<img src="#{image_file}">|
            result << %Q|\t\t<p>[#{e.text}]</p>| unless e.text.strip.empty?
            result << %Q|\t</div>|
          rescue
            News2Kindle.logger.debug $!
            News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
          end
        end
      end
    end
  end
  result
end
uri2aid( uri )
click to toggle source
# File lib/news2kindle/generator/nikkei-paid.rb, line 307
#
# Extracts the article id from an article URI: the path segment that follows
# the first "/article/" and is terminated by a "/".
#
# uri:: String such as "/article/XYZ/".
#
# Returns the id as a String, or nil when +uri+ contains no such segment.
def uri2aid( uri )
  uri[%r|/article/([^/]*)/|, 1]
end