class WxExt::SougouWeixin
Spiders posts from weixin.sogou.com.
@author FuShengYang
Public Class Methods
spider_posts_from_sougou(openid, page_index = 1, date_last = (Time.now - 3600 * 24 * 10).strftime("%Y-%m-%d"))
click to toggle source
Spiders a single page of posts from Sogou (weixin.sogou.com).
@param [String] openid @param [Integer] page_index @param [String] date_last cut-off date formatted as "YYYY-MM-DD" @return [Hash] A spider posts hash with total_pages etc.
# File lib/wx_ext/sougou_weixin.rb, line 19
#
# Fetches one page of an account's post list from weixin.sogou.com and, for
# each listed post dated strictly after +date_last+, downloads the post page
# to extract its author and content. Iteration stops at the first listed post
# that is not newer than +date_last+.
#
# @param openid [String] account id interpolated into the request URL
# @param page_index [Integer] value sent as the +page+ query parameter
# @param date_last [String] cut-off date formatted as "YYYY-MM-DD"
# @return [Hash] +{}+ when the response does not match the expected
#   +gzh(...)+ envelope; otherwise a hash with the keys +:total_items+,
#   +:total_pages+, +:page+, +:response_time+, +:spider_posts+,
#   +:original_count+ (items listed on the page) and +:count+ (items kept)
def self.spider_posts_from_sougou(openid, page_index = 1, date_last = (Time.now - 3600 * 24 * 10).strftime("%Y-%m-%d"))
  json_url = "http://weixin.sogou.com/gzhjs?&openid=#{openid}&page=#{page_index}"
  res = RestClient.get json_url

  # Dates arrive as "YYYY-MM-DD" strings; Time.new accepts the string parts.
  to_time = lambda do |text|
    year, month, day = text.to_s.split('-')
    Time.new(year, month, day)
  end
  date_last_to_com = to_time.call(date_last)

  # Capture 1 is the JSON payload wrapped in gzh(...); capture 2 is the
  # integer carried by the trailing <!--digits--> comment.
  envelope = /gzh\((.*)\).*\/\/<\!--.*--><\!--(\d+)-->/m.match(res.to_s)
  return {} unless envelope

  # Parse the payload once instead of once per field.
  payload = JSON.parse(envelope[1])
  # A payload without "items" is treated as an empty page rather than crashing.
  xml_articles = Array(payload['items'])

  spider_posts = []
  xml_articles.each do |xml|
    doc = Nokogiri::XML(xml, nil, 'UTF-8')
    field = ->(name) { doc.at_xpath("//DOCUMENT/item/display/#{name}").text }

    date = field.call('date')
    # Stop at the first post that is not newer than the cut-off.
    break unless date_last_to_com < to_time.call(date)

    title = field.call('title1')
    url = field.call('url')
    img = field.call('imglink')
    content_short = field.call('content168')

    # URI#open (from open-uri, which the original `open(url)` call already
    # relied on) never spawns a subprocess for a "|cmd" string the way
    # Kernel#open does, and the block form closes the handle after parsing.
    doc_post = URI.parse(url).open { |io| Nokogiri::HTML(io, nil, 'UTF-8') }
    node_author = doc_post.css('div.rich_media_meta_list > em.rich_media_meta.rich_media_meta_text')[1]
    author = node_author ? node_author.content : '无'
    content = doc_post.css('div#js_content').first.to_s

    spider_posts.push(
      title: title,
      url: url,
      img: img,
      content_short: content_short,
      author: author,
      content: content,
      date: date
    )
  end

  {
    total_items: payload['totalItems'],
    total_pages: payload['totalPages'],
    page: payload['page'],
    response_time: envelope[2].to_i,
    spider_posts: spider_posts,
    original_count: xml_articles.count,
    count: spider_posts.count
  }
end
spider_posts_later_date(openid, date_last = (Time.now - 3600 * 24 * 10).strftime("%Y-%m-%d"))
click to toggle source
Spiders posts from Sogou (weixin.sogou.com) that were published after the given date, walking through the result pages.
@param [String] openid @param [String] date_last cut-off date formatted as "YYYY-MM-DD" @return [Hash] A spider posts hash with total_pages etc.
# File lib/wx_ext/sougou_weixin.rb, line 90
#
# Walks the result pages returned by +spider_posts_from_sougou+ and collects
# the posts newer than +date_last+, stopping at the first page that was cut
# short by the date limit.
#
# @param openid [String] account id forwarded to +spider_posts_from_sougou+
# @param date_last [String] cut-off date formatted as "YYYY-MM-DD"
# @return [Hash] hash with +:total_items+ and +:total_pages+ taken from the
#   first page, and +:spider_posts+ holding the de-duplicated collected posts
def self.spider_posts_later_date(openid, date_last = (Time.now - 3600 * 24 * 10).strftime("%Y-%m-%d"))
  first_page = spider_posts_from_sougou(openid, 1, date_last)
  total_pages = first_page[:total_pages].to_i
  spider_posts = []

  1.upto(total_pages) do |page_index|
    # Page 1 is already in hand; fetching it again would repeat the request
    # and re-download every post on it.
    page = page_index == 1 ? first_page : spider_posts_from_sougou(openid, page_index, date_last)

    posts = page[:spider_posts]
    # An unparseable page comes back as {} and carries no posts.
    break unless posts

    # Keep the qualifying posts even from a page cut short by the date limit.
    spider_posts.concat(posts)
    # Fewer kept than listed means the cut-off was reached on this page.
    break unless page[:original_count] == page[:count]
  end

  {
    total_items: first_page[:total_items],
    total_pages: total_pages,
    spider_posts: spider_posts.uniq
  }
end