class Collection::Taobao
Public Class Methods
down(urls)
click to toggle source
# File lib/collection/taobao.rb, line 43
# Downloads each image in +urls+ into "#{Rails.root}/taobao" and then merges
# the downloaded files with +merge_image+.
#
# Files are named "<index><extension>", where the index is the position of
# the URL in +urls+ and the extension is whatever File.extname finds at the
# end of the URL (empty when the URL has none).
#
# @param urls [Array<String>] image URLs; any URL that does not start with
#   "http" gets "https:" prepended.
# @return whatever +merge_image+ returns
def self.down(urls)
  dir = "#{Rails.root}/taobao"
  # Created once up front (and recursively) instead of being re-checked for
  # every URL.
  FileUtils.mkdir_p(dir)

  urls.each_with_index do |url, idx|
    url = "https:#{url}" unless url.start_with?("http")
    # URI.open (open-uri) instead of Kernel#open: Kernel#open no longer
    # fetches URLs on Ruby >= 3.0.
    data = URI.open(url, 'User-Agent' => 'ruby', &:read)
    # File.extname copes with URLs that contain no "."; binwrite closes the
    # handle even when the write fails.
    File.binwrite("#{dir}/#{idx}#{File.extname(url)}", data)
  end

  merge_image
end
get_text(urls)
click to toggle source
# File lib/collection/taobao.rb, line 27
# Sends every image URL to the Baidu OCR "general_basic" endpoint and returns
# all recognised text.
#
# An access token is requested first; each URL is then posted in its own
# request and the "words" entries of every "words_result" are collected.
#
# @param urls [Array<String>] image URLs to recognise
# @return [String] the recognised words joined with "\r\n" ("" when nothing
#   was recognised)
def self.get_text(urls)
  require 'json' # no-op when JSON is already loaded

  # FIXME: client_id / client_secret are committed in source; they should be
  # moved to configuration and rotated.
  token_client = Struggle::Http.new("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ob5etUcqOmPr7HA5co98yreC&client_secret=puRIlGWeOtaLbbPr5zZfeyNgLEC88wyF")
  # The responses are JSON. They are parsed rather than eval'ed so that a
  # remote body can never execute Ruby code, and so that JSON-only tokens such
  # as null are accepted. symbolize_names keeps the symbol-key access below.
  token = JSON.parse(token_client.post.body, symbolize_names: true)[:access_token]
  ocr_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token=" + token

  words = urls.flat_map do |url|
    response = Struggle::Http.new(ocr_url).post({url: url}, {"Content-Type" => "application/x-www-form-urlencoded"})
    result = JSON.parse(response.body, symbolize_names: true)
    # A missing words_result contributes nothing.
    Array(result[:words_result]).map { |w| w[:words] }
  end

  words.join("\r\n")
end
merge_image()
click to toggle source
# File lib/collection/taobao.rb, line 61
# Stitches every file found in "#{Rails.root}/taobao" into one image by
# running ImageMagick's `convert -append`, writing the result to
# "#{Rails.root}/taobao_img/<YYYYmmddHHMMSS>.jpg".
#
# @return [String] the output path. The exit status of `convert` is not
#   checked, so the path is returned even when the command fails.
def self.merge_image
  source_dir = "#{Rails.root}/taobao"
  names = Dir.entries(source_dir) - %w[. ..]
  # Directory listing order is unspecified; sort by the leading integer of the
  # file name so that e.g. "2.jpg" comes before "10.jpg".
  images = names.sort_by { |name| [name.to_i, name] }
                .map { |name| "#{source_dir}/#{name}" }

  output_dir = "#{Rails.root}/taobao_img"
  Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
  save_img_path = "#{output_dir}/#{Time.now.strftime('%Y%m%d%H%M%S')}.jpg"

  # Argument-list form of system: no shell is involved, so spaces or shell
  # metacharacters in a path cannot split or alter the command.
  system("convert", "-append", *images, save_img_path)
  save_img_path
end
run(url)
click to toggle source
# File lib/collection/taobao.rb, line 6
# Opens +url+ in Chrome, collects the +src+ of every image inside the element
# with id "description", then downloads/merges those images and runs OCR on
# them.
#
# @param url [String] page to open
# @return [Hash] {imgpath: <result of down>, text: <result of get_text>}
def self.run(url)
  # The commented-out section below configures the browser driver so that the
  # browser can run hidden (headless). Scraping Taobao does not work that
  # way, though: Taobao requires a visible browser.
  # filePath = File.expand_path(File.dirname(File.dirname(__FILE__)))
  # chromedriverPath = File.expand_path("collection/chromedriver", filePath)
  # Selenium::WebDriver::Chrome.driver_path = chromedriverPath
  # options = Selenium::WebDriver::Chrome::Options.new
  # options.add_argument("headless")
  # browser = Watir::Browser.new :chrome, options: options
  browser = Watir::Browser.new :chrome
  images = []
  begin
    browser.goto url
    browser.div(id: "description").imgs.each do |img|
      img.scroll_into_view
      sleep 2
      src = img.src
      puts src
      images << src
    end
  ensure
    # Always close the browser, even when navigation or scraping raises;
    # otherwise the Chrome process is left running.
    browser.close
  end

  {imgpath: down(images), text: get_text(images)}
end