class QueenShop::Scraper

The Scraper class uses XPath selectors to extract item attributes.

Constants

ACCESSORIES_URI
BASE_SCRAPE_URL
BASE_URL
IMAGE_SELECTOR
ITEM_SELECTOR

XPath selectors that will be used to scrape data

LATEST_URI
PAGES_SELECTOR
PANTS_URI
PRICE_SELECTOR
TITLE_SELECTOR
TOPS_URI

Public Instance Methods

accessories(page, options = {}) click to toggle source
# File lib/queenshop/scraper.rb, line 47
# Requests one page of the accessories listing.
#
# page    - page number handed to uri_with_options
# options - hash forwarded to build_uri and process_request
#
# Returns the value produced by process_request for the built URI.
def accessories(page, options = {})
  request = build_uri(ACCESSORIES_URI, options)
  process_request(uri_with_options(request, page), options)
end
latest(page, options = {}) click to toggle source
# File lib/queenshop/scraper.rb, line 27
# Requests one page of the latest-items listing.
#
# page    - page number handed to uri_with_options
# options - hash forwarded to build_uri and process_request
#
# Returns the value produced by process_request for the built URI.
def latest(page, options = {})
  request = build_uri(LATEST_URI, options)
  process_request(uri_with_options(request, page), options)
end
pants(page, options = {}) click to toggle source
# File lib/queenshop/scraper.rb, line 42
# Requests one page of the pants listing.
#
# page    - page number handed to uri_with_options
# options - hash forwarded to build_uri and process_request
#
# Returns the value produced by process_request for the built URI.
def pants(page, options = {})
  request = build_uri(PANTS_URI, options)
  process_request(uri_with_options(request, page), options)
end
scrape(type, options = {}) click to toggle source
# File lib/queenshop/scraper.rb, line 57
# Public entry point: validates the scrape type, then delegates to scrape_what.
#
# type    - Symbol or String naming what to scrape
# options - hash forwarded unchanged to scrape_what
#
# Aborts with 'invalid parameter - scrape type' when type is not one of the
# accepted names or cannot be converted to a Symbol.
# Returns whatever scrape_what returns.
def scrape(type, options = {})
  valid_args = [:tops, :popular, :pants, :accessories, :latest, :search]
  # respond_to? guard: nil or numeric input gets the same abort message
  # instead of a NoMethodError from to_sym.
  known = type.respond_to?(:to_sym) && valid_args.include?(type.to_sym)
  abort 'invalid parameter - scrape type' unless known
  scrape_what(type, options)
end
tops(page, options = {}) click to toggle source
# File lib/queenshop/scraper.rb, line 37
# Requests one page of the tops listing.
#
# page    - page number handed to uri_with_options
# options - hash forwarded to build_uri and process_request
#
# Returns the value produced by process_request for the built URI.
def tops(page, options = {})
  request = build_uri(TOPS_URI, options)
  process_request(uri_with_options(request, page), options)
end

Private Instance Methods

build_uri(uri, options = {}) click to toggle source
# File lib/queenshop/scraper.rb, line 90
# Packs the base URI and the optional search keyword into a hash.
#
# uri     - base URI stored under :uri
# options - hash; only a truthy :keyword entry is copied over
#
# Returns a Hash with :uri and, when given, :keyword.
def build_uri(uri, options = {})
  request = { uri: uri }
  return request if options.empty?

  keyword = options[:keyword]
  request[:keyword] = keyword if keyword
  request
end
extract_data(raw) click to toggle source

iterate over every element of item using xpath

# File lib/queenshop/scraper.rb, line 118
# Parses the raw HTML with Oga, selects every node matching ITEM_SELECTOR
# and runs each one through parse.
#
# raw - HTML source passed to Oga.parse_html
#
# Returns the list of parse results, one per matched node.
def extract_data(raw)
  document = Oga.parse_html(raw)
  nodes = document.xpath(ITEM_SELECTOR)
  nodes.map { |node| parse(node) }
end
extract_images(item) click to toggle source

extract two images and return an array of URLs

# File lib/queenshop/scraper.rb, line 147
# Builds the URLs of the item image and its hover variant.
#
# item - node responding to xpath; the text of IMAGE_SELECTOR is the image path
#
# The hover path inserts "-h" before the first ".jpg", or before the first
# ".png" when the path contains no ".jpg". If neither is present both URLs
# are the same.
#
# Returns a two-element Array: [image URL, hover image URL], each prefixed
# with BASE_URL.
def extract_images(item)
  image = item.xpath(IMAGE_SELECTOR).text
  hover = image.sub(/\.jpg/, '-h.jpg')
  # Nothing replaced means there was no .jpg; try the .png naming instead.
  hover = image.sub(/\.png/, '-h.png') if hover == image
  [image, hover].map { |path| "#{BASE_URL}#{path}" }
end
extract_price(item) click to toggle source

get rid of the NT and convert to integer

# File lib/queenshop/scraper.rb, line 142
# Reads the price text of an item and converts it to an Integer.
#
# item - node responding to xpath; the text of PRICE_SELECTOR is the price label
#
# The first "NT" + any character + space prefix is removed before to_i, so
# text without leading digits yields 0.
def extract_price(item)
  label = item.xpath(PRICE_SELECTOR).text
  label.sub(/NT. /, '').to_i
end
extract_title(item) click to toggle source

Iconv is necessary here; otherwise the text is unreadable

# File lib/queenshop/scraper.rb, line 135
# Reads the title text of an item and converts it from big5 to UTF-8.
#
# item - node responding to xpath; the text of TITLE_SELECTOR is the raw title
#
# Returns the result of Iconv#iconv on the raw title.
def extract_title(item)
  converter = Iconv.new('UTF-8','big5')
  converter.iconv(item.xpath(TITLE_SELECTOR).text)
end
filter(data, options) click to toggle source

filter by price if the options are not empty

# File lib/queenshop/scraper.rb, line 74
# Applies the optional price filter to the scraped data.
#
# data    - collection of item hashes
# options - hash; only :price_boundary is consulted
#
# Returns data untouched when no :price_boundary is given, otherwise the
# result of match_price.
def filter(data, options)
  return data if options.empty?

  boundary = options[:price_boundary]
  return data unless boundary

  match_price(data, boundary)
end
match_price(data, boundary) click to toggle source

select the items from the result set whose price lies within the boundary (inclusive)

# File lib/queenshop/scraper.rb, line 83
# Keeps the items whose :price lies inside the boundary, both ends inclusive.
#
# data     - collection of hashes with a :price entry
# boundary - object responding to first/last; a nil first means 0 and a nil
#            last means no upper limit
#
# Returns the selected items.
def match_price(data, boundary)
  cheapest = boundary.first || 0
  dearest  = boundary.last  || Float::INFINITY

  data.select do |item|
    price = item[:price]
    cheapest <= price && price <= dearest
  end
end
open_uri(uri) click to toggle source

try to open the URL; on error, return an error-message string instead of raising

# File lib/queenshop/scraper.rb, line 111
# Opens the given location and reads its full content.
#
# uri - location handed to open
#
# Returns the content that was read. Any StandardError is swallowed and the
# literal string 'error opening site url' is returned instead.
def open_uri(uri)
  begin
    open(uri, &:read)
  rescue StandardError
    'error opening site url'
  end
end
parse(item) click to toggle source

call methods to extract the data using xpath

# File lib/queenshop/scraper.rb, line 125
# Collects the attributes of one item node into a hash.
#
# item - node passed to each extract_* helper
#
# Returns a Hash with :title, :price, :images and :link.
def parse(item)
  title  = extract_title(item)
  price  = extract_price(item)
  images = extract_images(item)
  link   = extract_link(item)

  { title: title, price: price, images: images, link: link }
end
process_request(uri, options) click to toggle source
# File lib/queenshop/scraper.rb, line 67
# Runs the whole pipeline for one URI: open it, extract the items, then
# filter them.
#
# uri     - location handed to open_uri
# options - hash handed to filter
#
# Returns the result of filter.
def process_request(uri, options)
  filter(extract_data(open_uri(uri)), options)
end
scrape_what(type, options) click to toggle source
# File lib/queenshop/scraper.rb, line 159
# Calls the scraping method named by type once per page and collects the
# results.
#
# type    - name of the method to invoke as type(page, options)
# options - hash; :page_limit (converted with to_i) sets the number of pages,
#           and a value of 0 / nil falls back to 5
#
# Returns an Array with one entry per page, in page order.
def scrape_what(type, options)
  limit = options[:page_limit].to_i
  limit = 5 if limit.zero?

  # Look the method up per page, so nothing is resolved when no page is requested.
  (1..limit).map { |page| method(type).call(page, options) }
end
uri_with_options(options = {}, page) click to toggle source
# File lib/queenshop/scraper.rb, line 98
# Builds the request URI string for one page.
#
# options - hash as produced by build_uri (:uri and, optionally, :keyword)
# page    - page number appended as the pageno query parameter
#
# Returns a String; it is empty when options holds neither :uri nor :keyword.
def uri_with_options(options = {}, page)
  parts = []
  parts << "#{options[:uri]}&pageno=#{page}" if options[:uri]

  if options[:keyword]
    # Convert only when a keyword is present: UTF-8 to big5, then escape.
    keyword = Iconv.new('big5', 'UTF-8').iconv(options[:keyword])
    # encode_www_form_component also escapes '&' and '=' inside the keyword,
    # so the keyword cannot break the query string apart.
    parts << "br=X&keyword=#{URI.encode_www_form_component(keyword)}"
  end

  # Join with '&' so the search parameters do not run into the pageno value.
  parts.join('&')
end