class News2Kindle::Generator::WsjPaid
Constants
- LOGIN
- TOP
Public Class Methods
new( tmpdir )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 18
#
# Loads the WSJ credentials and prepares the working directories.
#
# Credentials come from the `pit` library when it can be required; when
# requiring it raises LoadError they are read from the WSJ_ID / WSJ_PW
# environment variables instead (and are nil when those are unset).
#
# tmpdir:: base working directory; `src` and `dst` sub-directories are
#          created beneath it and the cover image and stylesheet are copied
#          into `dst`.
#
# Raises whatever FileUtils raises when the directories cannot be created
# or the resource files cannot be copied.
def initialize( tmpdir )
  begin
    require 'pit'
    login = Pit::get( 'wsj', :require => {
      'user' => 'your ID of WSJ.',
      'pass' => 'your Password of WSJ.',
    } )
    @wsj_id = login['user']
    @wsj_pw = login['pass']
  rescue LoadError # no pit library, using environment variables
    @wsj_id = ENV['WSJ_ID']
    @wsj_pw = ENV['WSJ_PW']
  end

  @current_dir = tmpdir
  @src_dir = @current_dir + '/src'
  @dst_dir = @current_dir + '/dst'
  # mkdir_p also creates a missing tmpdir and is a no-op for directories
  # that already exist, so no separate existence check is needed.
  FileUtils.mkdir_p( [@src_dir, @dst_dir] )

  resource = Pathname( __FILE__ ) + '../../../../resource'
  FileUtils.cp( resource + "wsj.jpg", @dst_dir )
  FileUtils.cp( resource + "wsj.css", @dst_dir )
end
Public Instance Methods
generate(opts) { |"#{@dst_dir}/wsj-paid.opf"| ... }
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 44
#
# Logs in to WSJ, collects the article links of the top page and of every
# navigation category, renders them through generate_contents and finally
# yields the path of the generated OPF file.
#
# opts:: hash; opts[:now] must respond to strftime and is used as the
#        timestamp of the generated book.
#
# Yields "#{@dst_dir}/wsj-paid.opf" once all files have been written.
# Re-raises the error of a category page that still fails after the
# retries (after logging it).
def generate(opts)
  @now = opts[:now]
  @now_str = @now.strftime '%Y-%m-%d %H:%M'
  @title = "WSJ日本版"
  @lang = "ja-JP"

  agent = Mechanize::new
  agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']

  # Fill in the first form of the login page and post it to the JSON
  # endpoint; the JSON answer carries the URL to visit next.
  agent.get( LOGIN )
  form = agent.page.forms.first
  form.action = 'https://id.wsj.com/auth/submitlogin.json'
  form['username'] = @wsj_id
  form['password'] = @wsj_pw
  # Submit the very object that was filled in rather than looking the form
  # up on the page a second time.
  form.submit
  response = JSON.parse( agent.page.body )
  agent.get( response["url"] )
  agent.get( TOP + "/home-page?_wsjregion=asia,jp&_homepage=/home/jp" )

  # Only links matching this pattern are treated as articles.
  article_href = /^http:\/\/jp.wsj.com\/article\//
  max_per_category = 10
  toc = []

  #
  # scraping top news
  #
  toc_top = ['TOP NEWS']
  (agent.page / "div.whatsNews ul.newsItem h2 a").each do |link|
    href = link.attr( 'href' )
    toc_top << [canonical( link.text.strip ), href] if href =~ article_href
  end
  toc << toc_top

  #
  # scraping all categories
  #
  (agent.page.root / 'div.wsjMainNav li').each_with_index do |li, position|
    next if position == 0 # the first navigation entry is skipped

    nav_link = (li / 'a').first
    next unless nav_link # navigation entry without a link: nothing to fetch

    category_uri = nav_link.attr( 'href' )
    toc_cat = [canonical( nav_link.text.strip )]

    begin
      retry_loop( 5 ) do
        agent.get( category_uri )
        sleep 1
      end
    rescue
      # Log the URI that was actually requested; the former message
      # interpolated a name that does not exist in this method.
      News2Kindle.logger.error "cannot get #{category_uri}."
      raise
    end

    (agent.page / "div.leadModule" ).remove
    (agent.page / "div.headlineSummary ul.newsItem h2 a" ).each do |link|
      href = link.attr( 'href' )
      next unless href =~ article_href
      toc_cat << [canonical( link.text.strip ), href]
      # toc_cat[0] is the category name, hence the "- 1".
      break if toc_cat.size - 1 >= max_per_category
    end
    toc << toc_cat
  end

  generate_contents( toc, agent )
  yield "#{@dst_dir}/wsj-paid.opf"
end
Private Instance Methods
canonical( str )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 124
#
# Returns a copy of +str+ in which every FULLWIDTH TILDE (U+FF5E) is
# replaced by WAVE DASH (U+301C); +str+ itself is not modified.
def canonical( str )
  # for WAVE DASH problem; a one-to-one character swap needs no regexp
  str.tr( "\uFF5E", "\u301C" )
end
generate_contents( toc, agent )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 322
#
# Writes toc.html, toc.ncx and wsj-paid.opf into @dst_dir; html_item is
# called for each article to produce its table-of-contents line.
#
# toc::   array of categories; each category is an array whose String
#         elements are category headings and whose other elements are
#         [title, uri] pairs.
# agent:: passed through unchanged to html_item.
#
# Entries whose URI yields no article id (uri2aid returns nil) are skipped
# entirely, so they neither consume a playOrder number nor leave blank
# lines in the generated files.
def generate_contents( toc, agent )
  File.open( "#{@dst_dir}/toc.html", 'w:utf-8' ) do |html|
    File.open( "#{@dst_dir}/toc.ncx", 'w:utf-8' ) do |ncx|
      File.open( "#{@dst_dir}/wsj-paid.opf", 'w:utf-8' ) do |opf|
        first = true
        toc_index = 0
        aids = [] # article ids already listed in the OPF manifest

        ncx.puts ncx_header
        opf.puts opf_header

        toc.each do |category|
          category.each do |article|
            if article.is_a?( String )
              # Category heading: the first one opens the document, every
              # later one closes the previous list and starts a new page.
              html.puts first ? html_header( 'Table of Contents' ) : "\t</ul>\n\t<mbp:pagebreak />"
              html.puts "\t<h2>#{article}</h2>"
              html.puts "\t<ul>"
              first = false
            else
              title, uri = article
              aid = uri2aid( uri )
              next unless aid

              html.puts html_item( title, uri, agent )
              # NOTE(review): an article listed in several categories still
              # gets one navPoint per listing (all with the same id) -
              # confirm that this is intended.
              ncx.puts ncx_item( title, uri, toc_index += 1 )
              unless aids.include?( aid )
                opf.puts opf_item( uri )
                aids << aid
              end
            end
          end
        end

        html.puts "\t</ul>"
        html.puts html_footer
        ncx.puts ncx_footer
        opf.puts opf_footer( aids )
      end
    end
  end
end
get_html_item( agent, uri, sub = nil )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 158
#
# Returns the parsed HTML of +uri+, using a file cache in @src_dir.
#
# agent:: object used to fetch the page (agent.get / agent.page.root).
# uri::   address of the page; its article id (uri2aid) names the cache file.
# sub::   optional suffix appended to the article id in the cache file name.
#
# When the cache file exists it is parsed with Nokogiri and returned.
# Otherwise the page is fetched (up to 5 attempts through retry_loop), its
# root node is written to the cache file and returned. A fetch that keeps
# failing is logged and the error is re-raised.
def get_html_item( agent, uri, sub = nil )
  aid = uri2aid( uri )
  cache_file = "#{@src_dir}/#{aid}#{sub}.html"

  # loading cache
  # File.open (not Kernel#open) so the path is never interpreted as a
  # command or anything other than a file name.
  return Nokogiri( File.open( cache_file, 'r:utf-8', &:read ) ) if File.exist?( cache_file )

  html = nil
  begin
    retry_loop( 5 ) do
      agent.get( uri )
      html = agent.page.root
      sleep 1
    end
  rescue
    News2Kindle.logger.error "cannot get #{uri}."
    raise
  end

  File.open( cache_file, 'w:utf-8' ) do |f|
    f.write( html.to_html )
  end
  html
end
html_header( title )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 144
#
# Returns the opening part of an HTML document (doctype, <head> with the
# wsj.css stylesheet link, opening <body> and an <h1>), ending with a
# newline. <body> and <html> are left open for the caller to close.
#
# title:: text used for both <title> and <h1>. It is HTML-escaped here
#         because callers pass text taken from scraped pages.
#
# NOTE(review): the indentation of the original heredoc could not be
# recovered from the flattened listing; the leading tabs below are a
# reconstruction (whitespace between these tags does not affect rendering).
def html_header( title )
  require 'cgi'
  safe_title = CGI.escapeHTML( title.to_s )
  [
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
    '<html>',
    '<head>',
    %Q|\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>|,
    %Q|\t<title>#{safe_title}</title>|,
    %Q|\t<link rel="stylesheet" href="wsj.css" type="text/css" media="all"></link>|,
    '</head>',
    '<body>',
    %Q|\t<h1>#{safe_title}</h1>|,
  ].join( "\n" ) + "\n"
end
html_item( item, uri, agent )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 221
#
# Writes the article at +uri+ to "#{@dst_dir}/<aid>.html" and returns the
# <li> line that links to it from the table of contents.
#
# item::  link text for the table of contents; also the page title when
#         the article has no usable og:title meta tag.
# uri::   article address; returns '' without doing anything when uri2aid
#         finds no article id in it.
# agent:: passed through to get_html_item.
def html_item( item, uri, agent )
  aid = uri2aid( uri )
  return '' unless aid

  require 'cgi'
  html = get_html_item( agent, uri )
  File.open( "#{@dst_dir}/#{aid}.html", 'w:utf-8' ) do |f|
    og_title = (html / 'meta[@property="og:title"]').first
    # A meta tag without a content attribute falls back to the link text
    # instead of failing on nil.
    og_content = og_title && og_title.attr( "content" )
    title = og_content ? og_content.strip : item
    f.puts canonical( html_header( title ) )
    f.puts scrape_html_item( html )
    f.puts html_footer
  end
  # The link text comes from a scraped page, so escape it before embedding.
  %Q|\t\t<li><a href="#{aid}.html">#{CGI.escapeHTML( item.to_s )}</a></li>|
end
ncx_header()
click to toggle source
# Returns the opening part of the NCX document as a string: the XML
# declaration, the ncx DOCTYPE, the opening <ncx> element, a <docTitle>
# built from @title and @now_str, the opening <navMap>, and one fixed
# navPoint (id "toc", playOrder 0) that points at toc.html.
# <navMap> and <ncx> are left open; no closing tags are produced here.
# The heredoc is passed through gsub( /^\t/, '' ), which removes one
# leading tab from every line.
# NOTE(review): this listing is flattened onto a single line, so the
# heredoc's original line breaks and indentation are not visible here.
# File lib/news2kindle/generator/wsj-paid.rb, line 245 def ncx_header <<-XML.gsub( /^\t/, '' ) <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd"> <ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"> <docTitle><text>#{@title} (#{@now_str})</text></docTitle> <navMap> <navPoint id="toc" playOrder="0"><navLabel><text>Table of Contents</text></navLabel><content src="toc.html" /></navPoint> XML end
ncx_item( item, uri, index )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 256
#
# Returns the NCX <navPoint> line for one article, or '' when uri2aid
# finds no article id in +uri+.
#
# item::  label shown for the navigation point; XML-escaped here because it
#         is text taken from a scraped page and a bare "&" or "<" would
#         make the NCX file ill-formed.
# uri::   article address; its article id becomes the navPoint id and the
#         target file name.
# index:: value written to the playOrder attribute.
def ncx_item( item, uri, index )
  aid = uri2aid( uri )
  return '' unless aid

  require 'cgi'
  label = CGI.escapeHTML( item.to_s )
  %Q|\t\t<navPoint id="#{aid}" playOrder="#{index}"><navLabel><text>#{label}</text></navLabel><content src="#{aid}.html" /></navPoint>|
end
opf_header()
click to toggle source
# Returns the opening part of the OPF package document as a string: the
# XML declaration, the opening <package>, a complete <metadata> section
# and the start of <manifest> with fixed entries for toc.ncx, wsj.css and
# toc.html. <manifest> and <package> are left open; no closing tags are
# produced here.
# The metadata is filled from instance state: title and description use
# @title and @now_str, the language is @lang, and the date is @now
# formatted as '%d/%m/%Y'. The embedded cover is wsj.jpg.
# The heredoc is passed through gsub( /^\t/, '' ), which removes one
# leading tab from every line.
# NOTE(review): this listing is flattened onto a single line, so the
# heredoc's original line breaks and indentation are not visible here.
# File lib/news2kindle/generator/wsj-paid.rb, line 268 def opf_header <<-XML.gsub( /^\t/, '' ) <?xml version="1.0" encoding="utf-8"?> <package unique-identifier="uid"> <metadata> <dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core" xmlns:oebpackage="http://openebook.org/namespaces/oeb-package/1.0/"> <dc:Title>#{@title} (#{@now_str})</dc:Title> <dc:Language>#{@lang}</dc:Language> <dc:Creator>The Wall Street Journal Online</dc:Creator> <dc:Description>#{@title}、#{@now_str}生成</dc:Description> <dc:Date>#{@now.strftime( '%d/%m/%Y' )}</dc:Date> </dc-metadata> <x-metadata> <output encoding="utf-8" content-type="text/x-oeb1-document"></output> <EmbeddedCover>wsj.jpg</EmbeddedCover> </x-metadata> </metadata> <manifest> <item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx"></item> <item id="style" media-type="text/css" href="wsj.css"></item> <item id="index" media-type="text/html" href="toc.html"></item> XML end
opf_item( uri )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 292
#
# Returns the OPF manifest <item> line for the article at +uri+, or ''
# when uri2aid finds no article id in it. The article id is used both as
# the item id and as the base name of the referenced HTML file.
def opf_item( uri )
  article_id = uri2aid( uri )
  return '' unless article_id

  %Q|\t\t<item id="#{article_id}" media-type="text/html" href="#{article_id}.html"></item>|
end
retry_loop( times ) { || ... }
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 128
#
# Runs the given block and returns its value, running it again when it
# raises a StandardError.
#
# times:: maximum number of attempts. Once the block has failed this many
#         times the last error is re-raised; with a value of 1 or less the
#         block therefore runs exactly once.
#
# Every failure that is going to be retried is logged: the error at debug
# level and the retry counter at info level.
def retry_loop( times )
  failures = 0
  begin
    yield
  rescue => error
    failures += 1
    raise if failures >= times # attempts exhausted: propagate the last error

    News2Kindle.logger.debug error
    News2Kindle.logger.info "#{failures} retry."
    retry
  end
end
scrape_html_item( html )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 182
#
# Extracts the article body from a parsed page, strips the page elements
# listed below, downloads the images it references into @dst_dir and
# returns the resulting inner HTML.
#
# html:: parsed page that answers to `/` with a CSS selector.
#
# The body is taken from div#article_story_body; when that is absent,
# div#slideContainer is used instead. Images whose download fails are
# logged as warnings and keep their original src; images without a src
# attribute are left untouched.
def scrape_html_item( html )
  contents = (html / 'div#article_story_body')
  if contents.size == 0
    # No regular article body: use the slide container instead.
    contents = (html / 'div#slideContainer')
    if contents.size > 0
      # Put the markup of the first slide in front of the viewer, then drop
      # the viewer together with its header and navigation.
      (contents / 'div.dSlideViewer').before( (contents / 'div.dSlideViewer li.firstSlide').inner_html )
      (contents / 'div.dSlideViewer, h2.header, ul.nav-inline').remove
    end
  else
    signature = (contents / 'ul.socialByline')
    if signature.size > 0
      # Replace the byline list by its plain text.
      signature[0].before( signature.inner_text )
      signature.remove
    end
    (contents / 'div.insettipBox , div.insetButton').remove
    (contents / 'div.insetZoomTargetBox a').remove
    (contents / 'div.legacyInset div.embedType-interactive').each {|d| d.parent.remove}
  end

  # The original open( image_url ) only fetched URLs with open-uri loaded;
  # requiring it here is a no-op in that case.
  require 'open-uri'
  (contents / 'img').each do |image_tag|
    image_url = image_tag.attr( 'src' )
    next if image_url.nil? || image_url.empty?

    image_file = File.basename( image_url )
    local_path = "#{@dst_dir}/#{image_file}"
    unless File.exist?( local_path )
      begin
        # The src comes from a scraped page: go through URI so it can only
        # ever be fetched as a URL (Kernel#open would run a value starting
        # with "|" as a command, and no longer opens URLs on Ruby 3).
        image = URI.parse( image_url ).open( &:read )
        # Binary mode keeps image bytes intact on every platform.
        File.open( local_path, 'wb' ) {|fp| fp.write image }
      rescue
        News2Kindle.logger.warn "FAIL TO DOWNLOAD IMAGE: #{image_url}"
        next
      end
    end
    image_tag.set_attribute( "src", image_file )
  end
  contents.inner_html
end
uri2aid( uri )
click to toggle source
# File lib/news2kindle/generator/wsj-paid.rb, line 318
#
# Extracts the article id from an article address.
#
# uri:: address to inspect (converted with to_s, so nil is accepted).
#
# Returns the text between "/article/" and ".html", or nil when the
# address does not contain that pattern.
def uri2aid( uri )
  # The dot is escaped so that only a real ".html" suffix matches;
  # unescaped it matched any character (".../SB123html" gave "SB12").
  uri.to_s[%r|/article/([^/]*)\.html|, 1]
end