class Collection::Taobao

Public Class Methods

down(urls) click to toggle source
# File lib/collection/taobao.rb, line 43
# Downloads every image in +urls+ into "#{Rails.root}/taobao" and merges the
# downloaded files into a single image.
#
# Each file is named after the URL's position in +urls+ plus the extension
# found in the URL, e.g. "0.jpg", "1.png".
#
# @param urls [Array<String>] image URLs; entries that do not start with
#   "http" (e.g. protocol-relative "//host/a.jpg") are prefixed with "https:"
# @return [String] whatever +merge_image+ returns (the merged image path)
def self.down(urls)
  dir = "#{Rails.root}/taobao"
  # mkdir_p is a no-op when the directory exists and replaces the removed
  # Dir.exists? check; doing it once up front also covers an empty +urls+.
  FileUtils.mkdir_p(dir)

  urls.each_with_index do |url, idx|
    url = "https:#{url}" unless url.start_with?("http")
    # URI.open (open-uri) is used instead of Kernel#open: the latter stopped
    # handling URLs in Ruby 3.0 and can spawn processes for "|cmd" strings.
    data = URI.open(url, 'User-Agent' => 'ruby') { |io| io.read }

    # Drop any query string / fragment before looking for the extension so
    # "a.png?x=1.5" yields ".png"; a URL without an extension yields "".
    ext = File.extname(url.split(/[?#]/, 2).first)

    # binwrite opens in binary mode and always closes the file handle.
    File.binwrite("#{dir}/#{idx}#{ext}", data)
  end
  merge_image
end
get_text(urls) click to toggle source
# File lib/collection/taobao.rb, line 27
# Sends each image URL to the Baidu OCR "general_basic" endpoint and returns
# all recognised text.
#
# @param urls [Array<String>] image URLs, posted one per request as +url+
# @return [String] the recognised "words" entries of all images, in order,
#   joined with "\r\n" ("" when nothing was recognised)
# @raise [RuntimeError] when the token response contains no access_token
# @raise [JSON::ParserError] when a response body is not valid JSON
def self.get_text(urls)
  # Required locally so the method works even if nothing else loaded JSON.
  require "json"

  # NOTE(review): the client_id / client_secret are hard-coded in this URL;
  # they should be moved to configuration and rotated.
  token_request = Struggle::Http.new("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ob5etUcqOmPr7HA5co98yreC&client_secret=puRIlGWeOtaLbbPr5zZfeyNgLEC88wyF")
  # Response bodies are parsed as JSON instead of being eval'd: eval would
  # execute whatever the remote end returns and fails on JSON null/true/false.
  token = JSON.parse(token_request.post.body, symbolize_names: true)[:access_token]
  raise "Baidu OCR token request returned no access_token" unless token

  lines = urls.flat_map do |url|
    # A fresh request object per URL, as before; presumably Struggle::Http
    # instances are single-use — verify before hoisting this out of the loop.
    ocr_request = Struggle::Http.new("https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token=" + token)
    response = ocr_request.post({url: url}, {"Content-Type" => "application/x-www-form-urlencoded"})
    result = JSON.parse(response.body, symbolize_names: true)
    # Responses without words_result (e.g. error payloads) contribute nothing.
    (result[:words_result] || []).map { |entry| entry[:words] }
  end
  lines.join("\r\n")
end
merge_image() click to toggle source
# File lib/collection/taobao.rb, line 61
    # Stacks every file found in "#{Rails.root}/taobao" into one JPEG using
    # ImageMagick's `convert -append` and returns the output path.
    #
    # Files are appended in ascending order of the integer at the start of
    # their name ("0.jpg", "2.png", "10.jpg"), because directory listing order
    # is not guaranteed.
    #
    # @return [String] "#{Rails.root}/taobao_img/<YYYYmmddHHMMSS>.jpg"; the exit
    #   status of `convert` is not checked, so the file may not exist if the
    #   command failed
    def self.merge_image
      source_dir = "#{Rails.root}/taobao"
      names = Dir.entries(source_dir) - %w[. ..]
      images = names.sort_by { |name| [File.basename(name, ".*").to_i, name] }
                    .map { |name| "#{source_dir}/#{name}" }

      output_dir = "#{Rails.root}/taobao_img"
      # convert does not create missing parent directories.
      FileUtils.mkdir_p(output_dir)
      save_path = "#{output_dir}/#{Time.now.strftime('%Y%m%d%H%M%S')}.jpg"

      # Argument-list form bypasses the shell, so paths containing spaces or
      # shell metacharacters are passed through verbatim.
      system("convert", "-append", *images, save_path)
      save_path
    end
run(url) click to toggle source
# File lib/collection/taobao.rb, line 6
# Opens +url+ in Chrome, collects the +src+ of every image inside the element
# with id "description", then downloads/merges those images and runs OCR on
# them.
#
# @param url [String] page to scrape
# @return [Hash] +{imgpath: <result of down>, text: <result of get_text>}+
def self.run(url)
  # The commented-out section below configures the browser driver so the
  # browser can run hidden (headless). Scraping Taobao fails in that mode;
  # Taobao requires a visible browser.
  # filePath = File.expand_path(File.dirname(File.dirname(__FILE__)))
  # chromedriverPath = File.expand_path("collection/chromedriver", filePath)
  # Selenium::WebDriver::Chrome.driver_path = chromedriverPath
  # options = Selenium::WebDriver::Chrome::Options.new
  # options.add_argument("headless")
  # browser = Watir::Browser.new :chrome, options: options
  browser = Watir::Browser.new :chrome
  images = begin
    browser.goto url
    browser.div(id: "description").imgs.map do |img|
      # Scroll to the image and pause before reading its src
      # (presumably to let lazily-loaded images resolve — confirm).
      img.scroll_into_view
      sleep 2
      src = img.src
      puts src
      src
    end
  ensure
    # Always release the browser, even when navigation or scraping raises.
    browser.close
  end
  {imgpath: down(images), text: get_text(images)}
end