class Meiriyigua::DetailCrawl

Public Class Methods

new(detail_urls)
# File lib/meiriyigua/detail_crawl.rb, line 8
def initialize(detail_urls)
  @detail_urls = detail_urls
  @agent = CrawlClient.create_agent
end
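
A minimal usage sketch, assuming @detail_urls only needs to respond to pop and empty? (an Array works); the URLs below are hypothetical:

require 'meiriyigua/detail_crawl'

detail_urls = [
  'http://www.ucbug.com/soft/123.html',   # hypothetical detail page URL
  'http://www.nanawg.com/wg/456.html'     # hypothetical detail page URL
]

crawler = Meiriyigua::DetailCrawl.new(detail_urls)
crawler.run   # pops each URL and passes it to handle_url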

Public Instance Methods

handle_1234wg(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 57
def handle_1234wg(page, page_record)
  page_record.title = page.search('td[width="583"] > font > strong font').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('body > table[background="/images/hgf-4.gif"] td[style="padding-left:6px;"] a:last-of-type').text
  page_record.content = strip_content(page.search('td#intro'))
  filename = page.search('td[valign="top"] > script:last-of-type').text.split(',')[1][6..-2]
  page_record.downloads = "http://dx2down.bugwg.com:801/#{URI.escape filename}"
  page_record
end
handle_dongdongwg(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 114
def handle_dongdongwg(page, page_record)
  page_record.title = page.search('//div[@class="pageMainArea"]/h1/text()').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('span.current1 a:last-of-type').text
  content = page.search('div#mainSoftIntro')
  content.search('p:last-of-type').remove
  page_record.content = strip_content(content)
  page_record.downloads = join_downloads(page.search('ul.downlistbox a').collect{|a| a['href']})
  page_record
end
handle_gg1z(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 94
def handle_gg1z(page, page_record)
  page_record.title = page.search('div.software-info > div.cp-top > h3').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.nav-breadcrumb a:nth-last-of-type(2)').text
  content = page.search('div.cp-main > div.cp-main')
  content.search('font[color="red"]').remove
  page_record.content = strip_content(content)

  downloads = page.search('ul.download-list a').collect{|a| "http://www.gg1z.com#{a['href']}"}
  downloads = [downloads.first, downloads.last].uniq
  final_downloads = []
  downloads.each do |down|
    down_page = @agent.get(down, nil, page.uri.to_s)
    CrawlClient.set_page_encoding(down_page)
    final_downloads.concat( down_page.search('div.downarea a').collect{|a| a['href'] =~ /^http/ ? a['href'] : "http://www.gg1z.com#{a['href']}"} )
  end
  page_record.downloads = join_downloads(final_downloads)
  page_record
end
handle_nanawg(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 76
def handle_nanawg(page, page_record)
  page_record.title = page.search('div.right_tit').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div#index3 a:last-of-type').text
  page_record.content = page.search('div.rightsum_text4').text
  page_record.downloads = join_downloads(page.search('ul.ul2 a').collect{|a| a['href'] =~ /^http/ ? a['href'] : "http://www.nanawg.com#{a['href']}"})
  page_record
end
handle_qh24(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 67
def handle_qh24(page, page_record)
  page_record.title = page.search('//*[@id="sintro"]/h1/text()').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('h2.classname > a:last-of-type').text
  page_record.content = strip_content(page.search('div.cnt'))
  page_record.downloads = join_downloads(page.search('div#intext dd a').collect{|a| a['href']})
  page_record
end
handle_ucbug(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 85
def handle_ucbug(page, page_record)
  page_record.title = page.search('div.spmain_1 a').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = page.search('div.spmain_5').text
  page_record.downloads = join_downloads(page.search('ul.ul_Address a').collect{|a| a['href']})
  page_record
end
handle_url(uri)
# File lib/meiriyigua/detail_crawl.rb, line 20
def handle_url(uri)
  if UrlRecord.exist_url?(uri.to_s)
    # Already crawled; skip the duplicate.
    #print "抓取详情页 #{uri.to_s} "  # "Crawling detail page <url>"
    #puts "重复,跳过"                 # "duplicate, skipping"
    return
  end

  page = @agent.get(uri)
  CrawlClient.set_page_encoding(page)
  name = uri.host.to_s.split('.')[1]

  url_record = UrlRecord.new
  url_record.url = uri.to_s
  url_record.detail_at = Time.now

  page_record = PageRecord.new
  url_record.page_record = page_record

  page_record = send("handle_#{name}", page, page_record)
  if page_record.nil?
    print "抓取详情页 #{uri.to_s} "  # "Crawling detail page <url>"
    puts "失败"                      # "failed"
  else
    print "抓取详情页 #{uri.to_s} "  # "Crawling detail page <url>"
    if url_record.save
      puts "成功"                    # "succeeded"
    else
      puts "保存失败"                # "saving failed"
    end
  end

  CrawlClient.random_sleep
rescue
  puts "抓取详情出错了 #{$!.class} #{$!.message}\n#{$!.backtrace.join("\n")}"  # "Error while crawling detail page"
end
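
For illustration, the handler name is taken from the second label of the page's host; with a hypothetical URL the dispatch works roughly like this:

uri  = URI('http://www.ucbug.com/soft/123.html')   # hypothetical detail page URL
name = uri.host.to_s.split('.')[1]                 # => "ucbug"
# send("handle_#{name}", page, page_record) then calls handle_ucbug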
handle_uuuwg(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 125
def handle_uuuwg(page, page_record)
  page_record.title = page.search('div.spmain_1').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = strip_content(page.search('div.spmain_5'))
  page_record.downloads = join_downloads(page.search('ul.spmain_3_2 > li:last-of-type a').collect{|a| a['href']})
  page_record
end
handle_xiaolinzi(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 144
def handle_xiaolinzi(page, page_record)
  page_record.title = page.search('div.dlbt_wz').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.head_dh a:last-of-type').text
  page_record.content = strip_content(page.search('div#content_all'))
  page_record.downloads = join_downloads(page.search('div.dl_link_bd a[target="_blank"]').collect{|a| a['href']})
  page_record
end
handle_xixiwg(page, page_record)
# File lib/meiriyigua/detail_crawl.rb, line 134
def handle_xixiwg(page, page_record)
  page_record.title = page.search('div.r2 h2').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.location a:last-of-type').text
  page_record.content = strip_content(page.search('div#intro'))
  filename = page.search('div.xzk script:last-of-type').text.split(',')[1].strip[6..-3]
  page_record.downloads = "http://dxdown1.xixiwg.com/#{URI.escape filename}"
  page_record
end
join_downloads(downloads)
# File lib/meiriyigua/detail_crawl.rb, line 157
def join_downloads(downloads)
  downloads.uniq.join('#!#')
end
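
For example (hypothetical URLs), duplicate links are dropped and the remaining ones are joined with the '#!#' separator:

join_downloads(%w[http://a.example/1.zip http://a.example/1.zip http://b.example/2.zip])
# => "http://a.example/1.zip#!#http://b.example/2.zip"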
run()
# File lib/meiriyigua/detail_crawl.rb, line 13
def run
  while !@detail_urls.empty?
    uri = URI(@detail_urls.pop)
    handle_url(uri)
  end
end
strip_content(content)
# File lib/meiriyigua/detail_crawl.rb, line 153
def strip_content(content)
  content.text
end