class Kindlizer::Generator::WsjusPaid

Constants

LOGIN
TOP

Public Instance Methods

generate(opts) { |"#{@dst_dir}/wsj-paid.opf"| ... }
# File lib/news2kindle/generator/wsjus-paid.rb, line 13
# Logs in to WSJ, scrapes the home page and each category page for article
# links, hands the collected table of contents to generate_contents, and then
# yields the path of the generated OPF file.
#
# opts:: option hash; only opts[:now] is read here, and it must respond to
#        +strftime+.
#
# Reads @dst_dir, @wsj_id and @wsj_pw, which are set outside this method, and
# relies on canonical, retry_loop and generate_contents defined elsewhere.
#
# The table of contents is an array of sections; each section is an array
# whose first element is the section title and whose remaining elements are
# [article title, href] pairs.
#
# Yields "#{@dst_dir}/wsj-paid.opf" once generate_contents has returned.
# Re-raises whatever error escapes retry_loop while fetching a category page.
def generate(opts)
        @now = opts[:now]
        @now_str = @now.strftime '%Y-%m-%d %H:%M'
        @title = "WSJ U.S."
        @lang = "en-US"
        resource = Pathname(__FILE__) + '../../../../resource'
        FileUtils.cp(resource + "wsj-us.jpg", @dst_dir + "/wsj.jpg")

        agent = Mechanize::new
        agent.set_proxy( *ENV['HTTP_PROXY'].split( /:/ ) ) if ENV['HTTP_PROXY']

        toc = []

        agent.get(LOGIN)

        # Fill in the first form on the login page and post it to the JSON
        # login endpoint instead of the form's own action.
        form = agent.page.forms.first
        form.action = ('https://id.wsj.com/auth/submitlogin.json')
        form['username'] = @wsj_id
        form['password'] = @wsj_pw
        form.submit

        # The login response is JSON; its "url" entry is fetched next.
        response = JSON.parse(agent.page.body)
        agent.get( response["url"] )

        agent.get( TOP + "/home-page?_wsjregion=na,us&_homepage=/home/us")

        #
        # scraping top news
        #
        toc_top = ['TOP NEWS']
        (agent.page / "div.whatsNews ul.newsItem h2 a").each do |link|
                href = link.attr( 'href' )
                if(href =~ /^http:\/\/online.wsj.com\/article\// || href =~ /^\/article\//)
                        toc_top << [canonical( link.text.strip ), href]
                end
        end
        toc << toc_top

        #
        # scraping all categories
        #
        (agent.page.root / 'div.wsjMainNav li').each do |li|
                a = (li / 'a').first
                # A navigation item without a link has nothing to follow.
                next unless a

                title = a.text.strip
                next if(title == "Home" || title == "Market Data" || title == "C-Suite")

                toc_cat = []
                toc_cat << canonical( title )
                uri = a.attr( 'href' )
                begin
                        # presumably retry_loop repeats the block up to 5 times
                        # on failure -- confirm against its definition.
                        retry_loop( 5 ) do
                                agent.get(uri)
                                sleep 1
                        end
                rescue
                        News2Kindle.logger.error "cannot get #{uri}."
                        raise
                end

                # Prefer the "whatsNews" list; fall back to "headlineSummary"
                # when the category page has no such list.
                count = 0
                newsLinks = (agent.page / "div.whatsNews ul.newsItem h2 a")
                newsLinks = (agent.page / "div.headlineSummary ul.newsItem h2 a" ) if(newsLinks.size == 0)
                newsLinks.each do |link|
                        href = link.attr( 'href' )
                        if(href =~ /^http:\/\/online.wsj.com\/article\// || href =~ /^\/article\//)
                                toc_cat << [canonical( link.text.strip ), href]
                                count += 1
                                # Keep at most 8 articles per category.
                                break if(count >= 8)
                        end
                end
                toc << toc_cat
        end

        generate_contents( toc, agent )
        yield "#{@dst_dir}/wsj-paid.opf"
end