class Meiriyigua::ListCrawl
Attributes
detail_urls[R]
Public Class Methods
new()
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 7
# Builds a crawler with two empty queues and an HTTP agent, then seeds the
# list-page queue.
#
# @list_urls   - queue of list-page URLs still to be fetched (filled by init_url).
# @detail_urls - queue that handle_url appends discovered detail-page URLs to.
# @agent       - whatever CrawlClient.create_agent returns; handle_url calls
#                #get on it (presumably a Mechanize-style agent - confirm in CrawlClient).
def initialize
  @list_urls = Queue.new
  @detail_urls = Queue.new
  @agent = CrawlClient.create_agent

  init_url
end
Public Instance Methods
handle_1234wg(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 40
# Extracts detail-page URLs from a 1234wg list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns the collection of absolute URL strings, one per matched anchor.
def handle_1234wg(page)
  # The selector only matches anchors whose href starts with "/1234/", so
  # every href is present and root-relative; prefixing the site root is safe.
  page.search('td[width="470"] a[href^="/1234/"]').map do |link|
    "http://www.1234wg.com#{link['href']}"
  end
end
handle_dongdongwg(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 65
# Extracts detail-page URLs from a dongdongwg list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_dongdongwg(page)
  hrefs = page.search('span.list_title > a').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.dongdongwg.com#{href}" }
end
handle_gg1z(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 60
# Extracts detail-page URLs from a gg1z list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_gg1z(page)
  hrefs = page.search('span.app-name a').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.gg1z.com#{href}" }
end
handle_nanawg(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 50
# Extracts detail-page URLs from a nanawg list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_nanawg(page)
  hrefs = page.search('td[width="362"] a:last-of-type').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.nanawg.com#{href}" }
end
handle_qh24(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 45
# Extracts detail-page URLs from a qh24 list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_qh24(page)
  hrefs = page.search('div#downhot table a').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.qh24.com#{href}" }
end
handle_ucbug(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 55
# Extracts detail-page URLs from a ucbug list page.
#
# Unlike the other site handlers, the href values are returned as-is with no
# site prefix (presumably this site already emits absolute links - confirm).
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of href strings. Anchors with a missing or empty href are
# skipped so that nil / "" never reaches the detail-URL queue.
def handle_ucbug(page)
  hrefs = page.search('li.slmain2_2_2 a').map { |link| link['href'] }
  hrefs.reject { |href| href.to_s.empty? }
end
handle_url(uri)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 22
# Fetches one list page, dispatches it to the matching per-site handler and
# queues every detail-page URL the handler returns.
#
# The handler is chosen from the second dot-separated label of the host:
# "www.qh24.com" -> "qh24" -> handle_qh24.
#
# uri - URI of the list page to fetch.
#
# Prints a success/failure line per page. Any StandardError (network error,
# unsupported host, handler failure) is reported on stdout with its
# backtrace and swallowed, so one bad page does not stop the crawl.
def handle_url(uri)
  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)

  site = uri.host.to_s.split('.')[1]
  handler = "handle_#{site}"
  # Dispatch only to a real per-site handler. Without this guard a host such
  # as "www.url.com" would re-enter handle_url with a page object, and an
  # unsupported host would surface as an opaque NoMethodError.
  unless site && handler != 'handle_url' && respond_to?(handler, true)
    raise ArgumentError, "no list handler for host #{uri.host.inspect}"
  end

  urls = send(handler, page)
  if urls.empty?
    puts "抓取列表页 #{uri} 失败"
  else
    urls.each { |url| @detail_urls << url }
    puts "抓取列表页 #{uri} 成功"
  end

  # Not reached when an error is rescued below (same as before).
  CrawlClient.random_sleep
rescue => e
  puts "抓取列表出错了 #{e.class} #{e.message}\n#{e.backtrace.join("\n")}"
end
handle_uuuwg(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 70
# Extracts detail-page URLs from a uuuwg list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_uuuwg(page)
  hrefs = page.search('table.main_table tr > td:nth-child(2) a').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.uuuwg.com#{href}" }
end
handle_xiaolinzi(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 80
# Extracts detail-page URLs from a xiaolinzi list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_xiaolinzi(page)
  hrefs = page.search('td.rewid1 > a').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.xiaolinzi.com#{href}" }
end
handle_xixiwg(page)
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 75
# Extracts detail-page URLs from a xixiwg list page.
#
# page - object whose #search(css) returns the matching anchor nodes
#        (handle_url passes the page fetched by @agent.get).
#
# Returns an Array of absolute URL strings. Anchors without a usable href
# are skipped, because prefixing the site root to a missing href would
# produce the bare home-page URL instead of a detail page.
def handle_xixiwg(page)
  hrefs = page.search('div.entry > h2 > a').map { |link| link['href'] }
  usable = hrefs.reject { |href| href.to_s.empty? }
  usable.map { |href| "http://www.xixiwg.com#{href}" }
end
run()
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 15
# Drains the list-page queue, handing each URL to handle_url.
#
# A queue entry that cannot be parsed as a URI is reported on stdout and
# skipped. handle_url already swallows its own failures, so a single bad
# entry should not abort the remaining list pages either.
def run
  until @list_urls.empty?
    raw = @list_urls.pop
    begin
      uri = URI(raw)
    rescue URI::InvalidURIError => e
      puts "抓取列表出错了 #{e.class} #{e.message}"
      next
    end
    handle_url(uri)
  end
end
Private Instance Methods
init_url()
click to toggle source
# File lib/meiriyigua/list_crawl.rb, line 86
# List pages the crawler starts from, one per supported site. Each host here
# has a matching handle_<site> method that handle_url dispatches to.
DEFAULT_LIST_URLS = %w[
  http://www.1234wg.com/new.html
  http://www.qh24.com/new.html
  http://www.nanawg.com/soft/html/newlist-1.html
  http://www.ucbug.com/new.html
  http://www.gg1z.com/soft/html/newlist-1.html
  http://www.dongdongwg.com/soft/html/newlist-1.html
  http://www.uuuwg.com/newlist.html
  http://www.xixiwg.com/new/
  http://www.xiaolinzi.com/update/
].freeze

# Seeds the list-page queue.
#
# urls - list-page URL strings to enqueue, in order. Defaults to
#        DEFAULT_LIST_URLS, so the existing no-argument call is unchanged.
#
# Returns @list_urls.
def init_url(urls = DEFAULT_LIST_URLS)
  urls.each { |url| @list_urls << url }
  @list_urls
end