module MMonitor::Spider

Public Class Methods

get_html(url, params={}) click to toggle source

抓取HTML

# File lib/mmonitor/spider.rb, line 11
# Fetches +url+ and hands the response body to Nokogiri's HTML parser.
#
# Side effect: the raw body returned by +get+ is also stored in the global
# +$body+, overwriting whatever was stored by a previous call.
#
# @param url [String] address to request
# @param params [Hash] request parameters forwarded unchanged to +get+
# @return whatever +::Nokogiri::HTML+ returns for the fetched body
def get_html(url, params={})
  # Keep the raw body reachable through the global before parsing it.
  $body = get(url, params)
  ::Nokogiri::HTML($body)
end
get_json(url, params={}) click to toggle source

抓取JSON

# File lib/mmonitor/spider.rb, line 17
# Fetches +url+ and decodes the response body as JSON using Oj.
#
# Decoding is best-effort: any StandardError raised while parsing yields an
# empty Hash instead of propagating. Errors raised by the request itself
# (the +get+ call) are not rescued here.
#
# @param url [String] address to request
# @param params [Hash] request parameters forwarded unchanged to +get+
# @return the value produced by +::Oj.load+, or +{}+ when parsing raises
def get_json(url, params={})
  payload = get(url, params)

  begin
    ::Oj.load(payload)
  rescue StandardError
    {}
  end
end
get_ocr(photo_url) click to toggle source

抓取图片上的文字

# File lib/mmonitor/spider.rb, line 22
# Extracts the text shown on an image via OCR.
#
# The image is opened with MiniMagick, flattened onto a white background in
# grayscale with the alpha channel removed, converted to JPEG, and then
# passed to RTesseract by path.
#
# The MiniMagick image is always destroyed before returning, including when
# preprocessing or OCR raises, so a failed run does not leave the working
# image behind.
#
# @param photo_url [String] location of the image, passed to MiniMagick::Image.open
# @return [String] the text reported by RTesseract
def get_ocr(photo_url)
  image = MiniMagick::Image.open(photo_url)

  begin
    # Normalise the picture before recognition.
    image.combine_options do |c|
      c.background '#FFFFFF'
      c.colorspace 'GRAY'
      c.alpha 'remove'
    end
    image.format 'jpg'

    RTesseract.new(image.path, processor: 'mini_magick').to_s
  ensure
    # Runs on both the success and the failure path.
    image.destroy!
  end
end
number_page(total, limit) click to toggle source

分页

# File lib/mmonitor/spider.rb, line 36
# Computes how many pages are needed to show +total+ items at +limit+ items
# per page: the quotient, plus one extra page when there is a positive
# remainder.
#
# @param total [Integer] number of items
# @param limit [Integer] items per page; zero raises ZeroDivisionError for integers
# @return [Integer] page count
def number_page(total, limit)
  full_pages = total / limit
  # No leftover items: the full pages already cover everything.
  return full_pages unless (total % limit) > 0

  full_pages + 1
end

Private Class Methods

conn() click to toggle source

连接

# File lib/mmonitor/spider.rb, line 58
# Returns the memoized Faraday connection, creating it on first use.
#
# The request headers are rewritten on every call, so each call picks a new
# User-Agent from +switcher+ while reusing the same connection object.
#
# @return the connection stored in +@conn+
def conn
  @conn ||= Faraday.new(ssl: false)

  @conn.tap do |client|
    client.headers[:user_agent] = switcher
    client.headers[:accept]     = 'text/html,application/json;q=0.9'
  end
end
get(url, params={}) click to toggle source
# File lib/mmonitor/spider.rb, line 44
# Performs a GET request through +conn+ and returns the response body.
#
# Behaviour by response status:
# * 200 - the body is returned.
# * 301/302/303/307/308 - the +Location+ header is followed with a fresh GET
#   (the original +params+ are not re-sent). At most +redirect_limit+
#   redirects are followed, so a redirect loop ends with +nil+ instead of
#   exhausting the stack. A redirect without a +Location+ header also
#   yields +nil+.
# * anything else - +nil+.
#
# The +Location+ value is requested as-is; relative locations are not
# resolved against the original URL.
#
# @param url [String] address to request
# @param params [Hash] request parameters passed to the connection's +get+
# @param redirect_limit [Integer] how many further redirects may be followed
# @return [String, nil] response body, or +nil+ when no body could be obtained
def get(url, params={}, redirect_limit=5)
  resp = conn.get url, params

  # Return / follow / stop depending on the status code.
  case resp.status
  when 200 # success
    resp.body
  when 301, 302, 303, 307, 308 # redirect
    location = resp.headers['location']
    return nil if location.nil? || redirect_limit <= 0

    get(location, {}, redirect_limit - 1)
  else
    nil
  end
end
switcher() click to toggle source

混淆

# File lib/mmonitor/spider.rb, line 65
# Picks one User-Agent string at random from a fixed pool of browser
# identifiers, so that requests do not all present the same agent.
#
# @return [String] one of the User-Agent strings listed below
def switcher
  user_agents = [
    'Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1; 360se)',
    'Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.802.30 Safari/535.1 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.3 (KHTML, like Gecko) Maxthon/3.3.2.1000 Chrome/16.0.883.0 Safari/535.3',
  ]

  user_agents.sample
end