module MMonitor::Spider
Public Class Methods
get_html(url, params={})
click to toggle source
抓取HTML
# File lib/mmonitor/spider.rb, line 11
# Fetches +url+ via +get+ (forwarding +params+) and parses the response body
# as HTML.
#
# Side effect: the raw body is also stored in the global variable $body.
#
# Returns the result of ::Nokogiri::HTML for the fetched body.
def get_html(url, params={})
  $body = get(url, params)
  ::Nokogiri::HTML($body)
end
get_json(url, params={})
click to toggle source
抓取JSON
# File lib/mmonitor/spider.rb, line 17
# Fetches +url+ via +get+ (forwarding +params+) and decodes the response body
# with ::Oj.load.
#
# Any StandardError raised while decoding (for example when the body is nil or
# is not valid JSON) is swallowed and an empty Hash is returned instead.
# Errors raised by +get+ itself are not rescued here.
def get_json(url, params={})
  raw = get(url, params)
  begin
    ::Oj.load(raw)
  rescue StandardError
    {}
  end
end
get_ocr(photo_url)
click to toggle source
抓取图片上的文字
# File lib/mmonitor/spider.rb, line 22
# Extracts the text shown on the image at +photo_url+ using OCR.
#
# The image is opened with MiniMagick, flattened onto a white background,
# converted to grayscale with the alpha channel removed, re-encoded as JPEG and
# then handed to RTesseract.
#
# Returns the recognized text as a String.
#
# The MiniMagick image is always destroyed, even when preprocessing or OCR
# raises, so a failed run does not leave the temporary image behind. Errors
# are still propagated to the caller.
def get_ocr(photo_url)
  image = MiniMagick::Image.open(photo_url)
  begin
    image.combine_options do |c|
      c.background '#FFFFFF'
      c.colorspace 'GRAY'
      c.alpha 'remove'
    end
    image.format 'jpg'
    # to_s must run before the image is destroyed: RTesseract reads image.path.
    RTesseract.new(image.path, processor: 'mini_magick').to_s
  ensure
    image.destroy!
  end
end
number_page(total, limit)
click to toggle source
分页
# File lib/mmonitor/spider.rb, line 36
# Computes how many pages are needed to show +total+ items at +limit+ items
# per page: the quotient total / limit, plus one extra page when there is a
# positive remainder.
def number_page(total, limit)
  full_pages = total / limit
  total % limit > 0 ? full_pages + 1 : full_pages
end
Private Class Methods
conn()
click to toggle source
连接
# File lib/mmonitor/spider.rb, line 58
# Returns the shared Faraday connection, creating it on first use and caching
# it in @conn.
#
# The headers are refreshed on every call: User-Agent is set to a new value
# from +switcher+ and Accept is reset to the fixed HTML/JSON preference string.
def conn
  @conn ||= Faraday.new(ssl: false)
  @conn.tap do |client|
    client.headers[:user_agent] = switcher
    client.headers[:accept] = 'text/html,application/json;q=0.9'
  end
end
get(url, params={})
click to toggle source
# File lib/mmonitor/spider.rb, line 44
# Performs a GET request for +url+ through +conn+ and returns the response
# body, following redirects.
#
# url            - the address to request.
# params         - request parameters forwarded to conn.get (default: {}).
# redirects_left - how many more redirects may be followed (default: 5).
#                  Bounding this prevents a redirect loop from recursing until
#                  the stack overflows.
#
# Returns the body String on status 200. Returns nil for any other final
# status, when a redirect response carries no Location header, or when the
# redirect budget is exhausted.
def get(url, params={}, redirects_left=5)
  resp = conn.get(url, params)

  # Return, redirect or give up depending on the status code.
  case resp.status
  when 200 # OK
    resp.body
  when 301, 302, 303, 307, 308 # redirect
    location = resp.headers['location']
    return if location.nil? || location.empty? || redirects_left <= 0

    # As before, the original params are not re-sent to the redirect target.
    get(location, {}, redirects_left - 1)
  end
  # Any other status falls through the case and yields nil.
end
switcher()
click to toggle source
混淆
# File lib/mmonitor/spider.rb, line 65
# Picks one User-Agent string at random from a fixed list of browser
# signatures, so that successive requests do not all present the same agent.
def switcher
  user_agents = [
    'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; 360se)',
    'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.802.30 Safari/535.1 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.3 (KHTML, like Gecko) Maxthon/3.3.2.1000 Chrome/16.0.883.0 Safari/535.3',
  ]
  user_agents.sample
end