class Meiriyigua::DetailCrawl
Public Class Methods
new(detail_urls)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 8
# Build a detail-page crawler.
#
# @param detail_urls [#pop, #empty?] queue of detail-page URLs to drain in #run
def initialize(detail_urls)
  @detail_urls = detail_urls
  @agent = CrawlClient.create_agent
end
Public Instance Methods
handle_1234wg(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 57
# Extract title, category, content and download link from a 1234wg detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped (caller treats as failure)
def handle_1234wg(page, page_record)
  page_record.title = page.search('td[width="583"] > font > strong font').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('body > table[background="/images/hgf-4.gif"] td[style="padding-left:6px;"] a:last-of-type').text
  page_record.content = strip_content(page.search('td#intro'))
  # The download filename is the 2nd argument of an inline JS call; [6..-2]
  # strips the surrounding "name='…'" decoration.
  filename = page.search('td[valign="top"] > script:last-of-type').text.split(',')[1][6..-2]
  # FIX: URI.escape was deprecated in Ruby 2.7 and removed in 3.0.
  # URI::DEFAULT_PARSER.escape is the method it delegated to — same output.
  page_record.downloads = "http://dx2down.bugwg.com:801/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end
handle_dongdongwg(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 114
# Extract title, category, content and download links from a dongdongwg detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_dongdongwg(page, page_record)
  page_record.title = page.search('//div[@class="pageMainArea"]/h1/text()').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('span.current1 a:last-of-type').text
  intro = page.search('div#mainSoftIntro')
  # Drop the trailing boilerplate paragraph before capturing the body text.
  intro.search('p:last-of-type').remove
  page_record.content = strip_content(intro)
  hrefs = page.search('ul.downlistbox a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
handle_gg1z(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 94
# Extract title, category, content and download links from a gg1z detail page.
# Download URLs live behind intermediate pages, so the first and last listed
# links are fetched and their real download anchors collected.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_gg1z(page, page_record)
  page_record.title = page.search('div.software-info > div.cp-top > h3').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.nav-breadcrumb a:nth-last-of-type(2)').text
  content = page.search('div.cp-main > div.cp-main')
  content.search('font[color="red"]').remove
  page_record.content = strip_content(content)
  downloads = page.search('ul.download-list a').collect{|a| "http://www.gg1z.com#{a['href']}"}
  # FIX: when the page lists no download links, first/last are nil and the
  # crawler would request a nil URL — compact guards against that.
  downloads = [downloads.first, downloads.last].compact.uniq
  final_downloads = []
  downloads.each do |down|
    # Follow each intermediate page, sending the detail page as referer.
    down_page = @agent.get(down, nil, page.uri.to_s)
    CrawlClient.set_page_encoding(down_page)
    final_downloads.concat(
      down_page.search('div.downarea a').collect{|a| a['href'] =~ /^http/ ? a['href'] : "http://www.gg1z.com#{a['href']}"}
    )
  end
  page_record.downloads = join_downloads(final_downloads)
  page_record
end
handle_nanawg(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 76
# Extract title, category, content and download links from a nanawg detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_nanawg(page, page_record)
  page_record.title = page.search('div.right_tit').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div#index3 a:last-of-type').text
  page_record.content = page.search('div.rightsum_text4').text
  links = page.search('ul.ul2 a').map do |anchor|
    href = anchor['href']
    # Relative links are rooted at the site's host.
    href =~ /^http/ ? href : "http://www.nanawg.com#{href}"
  end
  page_record.downloads = join_downloads(links)
  page_record
end
handle_qh24(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 67
# Extract title, category, content and download links from a qh24 detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_qh24(page, page_record)
  page_record.title = page.search('//*[@id="sintro"]/h1/text()').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('h2.classname > a:last-of-type').text
  page_record.content = strip_content(page.search('div.cnt'))
  hrefs = page.search('div#intext dd a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
handle_ucbug(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 85
# Extract title, category, content and download links from a ucbug detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_ucbug(page, page_record)
  page_record.title = page.search('div.spmain_1 a').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = page.search('div.spmain_5').text
  hrefs = page.search('ul.ul_Address a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
handle_url(uri)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 20 def handle_url(uri) if UrlRecord.exist_url?(uri.to_s) #print "抓取详情页 #{uri.to_s} " #puts "重复,跳过" return end page = @agent.get(uri) CrawlClient.set_page_encoding(page) name = uri.host.to_s.split('.')[1] url_record = UrlRecord.new url_record.url = uri.to_s url_record.detail_at = Time.now page_record = PageRecord.new url_record.page_record = page_record page_record = send("handle_#{name}", page, page_record) if page_record.nil? print "抓取详情页 #{uri.to_s} " puts "失败" else print "抓取详情页 #{uri.to_s} " if url_record.save puts "成功" else puts "保存失败" end end CrawlClient.random_sleep rescue puts "抓取详情出错了 #{$!.class} #{$!.message}\n#{$!.backtrace.join("\n")}" end
handle_uuuwg(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 125
# Extract title, category, content and download links from a uuuwg detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_uuuwg(page, page_record)
  page_record.title = page.search('div.spmain_1').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.slhead_1 a:last-of-type').text
  page_record.content = strip_content(page.search('div.spmain_5'))
  hrefs = page.search('ul.spmain_3_2 > li:last-of-type a').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
handle_xiaolinzi(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 144
# Extract title, category, content and download links from a xiaolinzi detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_xiaolinzi(page, page_record)
  page_record.title = page.search('div.dlbt_wz').text.strip
  return if page_record.title.empty?

  page_record.category = page.search('div.head_dh a:last-of-type').text
  page_record.content = strip_content(page.search('div#content_all'))
  hrefs = page.search('div.dl_link_bd a[target="_blank"]').map { |anchor| anchor['href'] }
  page_record.downloads = join_downloads(hrefs)
  page_record
end
handle_xixiwg(page, page_record)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 134
# Extract title, category, content and download link from a xixiwg detail page.
#
# @param page [Mechanize::Page] fetched detail page
# @param page_record [PageRecord] record to populate
# @return [PageRecord, nil] nil when no title could be scraped
def handle_xixiwg(page, page_record)
  page_record.title = page.search('div.r2 h2').text.strip
  return if page_record.title.empty?
  page_record.category = page.search('div.location a:last-of-type').text
  page_record.content = strip_content(page.search('div#intro'))
  # The download filename is the 2nd argument of an inline JS call; [6..-3]
  # strips the surrounding "name='…'" decoration.
  filename = page.search('div.xzk script:last-of-type').text.split(',')[1].strip[6..-3]
  # FIX: URI.escape was deprecated in Ruby 2.7 and removed in 3.0.
  # URI::DEFAULT_PARSER.escape is the method it delegated to — same output.
  page_record.downloads = "http://dxdown1.xixiwg.com/#{URI::DEFAULT_PARSER.escape(filename)}"
  page_record
end
join_downloads(downloads)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 157
# Serialize a list of download URLs into a single storable string,
# dropping duplicates and separating entries with the '#!#' marker.
#
# @param downloads [Array<String>] download URLs (may contain duplicates)
# @return [String] unique URLs joined by '#!#'; empty string for an empty list
def join_downloads(downloads)
  unique_urls = downloads.uniq
  unique_urls.join('#!#')
end
run()
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 13
# Drain the URL queue, crawling each detail page in turn.
# Returns once the queue reports empty.
def run
  until @detail_urls.empty?
    handle_url(URI(@detail_urls.pop))
  end
end
strip_content(content)
click to toggle source
# File lib/meiriyigua/detail_crawl.rb, line 153
# Flatten a scraped node set to its plain text. Despite the name, no
# whitespace stripping is performed — the node's text is returned as-is.
#
# @param content [#text] a Nokogiri node set (or anything responding to #text)
# @return [String] the node's text content
def strip_content(content)
  content.text
end