class QueenShop::Scraper
extract_data
This class uses XPath selectors to extract item attributes.
Constants
- ACCESSORIES_URI
- BASE_SCRAPE_URL
- BASE_URL
- IMAGE_SELECTOR
- ITEM_SELECTOR
XPath selectors that are used to scrape the data
- LATEST_URI
- LINK_SELECTOR
- PAGES_SELECTOR
- PANTS_URI
- POPULAR_URI
- PRICE_SELECTOR
- TITLE_SELECTOR
- TOPS_URI
Public Instance Methods
accessories(page, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 47
# Requests one page of the ACCESSORIES_URI listing.
#
# page    - page number handed to uri_with_options.
# options - hash handed to build_uri and process_request.
#
# Returns the result of process_request for the built URI.
def accessories(page, options = {})
  request = build_uri(ACCESSORIES_URI, options)
  process_request(uri_with_options(request, page), options)
end
latest(page, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 27
# Requests one page of the LATEST_URI listing.
#
# page    - page number handed to uri_with_options.
# options - hash handed to build_uri and process_request.
#
# Returns the result of process_request for the built URI.
def latest(page, options = {})
  request = build_uri(LATEST_URI, options)
  process_request(uri_with_options(request, page), options)
end
pants(page, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 42
# Requests one page of the PANTS_URI listing.
#
# page    - page number handed to uri_with_options.
# options - hash handed to build_uri and process_request.
#
# Returns the result of process_request for the built URI.
def pants(page, options = {})
  request = build_uri(PANTS_URI, options)
  process_request(uri_with_options(request, page), options)
end
popular(page, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 32
# Requests one page of the POPULAR_URI listing.
#
# page    - page number handed to uri_with_options.
# options - hash handed to build_uri and process_request.
#
# Returns the result of process_request for the built URI.
def popular(page, options = {})
  request = build_uri(POPULAR_URI, options)
  process_request(uri_with_options(request, page), options)
end
scrape(type, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 57
# Validates the requested scrape type and delegates to scrape_what.
#
# type    - listing name; must respond to to_sym and be one of :tops,
#           :popular, :pants, :accessories, :latest or :search.
# options - hash passed through unchanged to scrape_what.
#
# Returns whatever scrape_what returns.
# Terminates via Kernel#abort (raises SystemExit) with the message
# 'invalid parameter - scrape type' when type is not in the list above.
def scrape(type, options = {})
  valid_args = [:tops, :popular, :pants, :accessories, :latest, :search]
  abort 'invalid parameter - scrape type' unless valid_args.include?(type.to_sym)
  scrape_what(type, options)
end
search(page, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 52
# Requests one page built from BASE_SCRAPE_URL; any :keyword in options is
# carried into the request by build_uri.
#
# page    - page number handed to uri_with_options.
# options - hash handed to build_uri and process_request.
#
# Returns the result of process_request for the built URI.
def search(page, options = {})
  request = build_uri(BASE_SCRAPE_URL, options)
  process_request(uri_with_options(request, page), options)
end
tops(page, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 37
# Requests one page of the TOPS_URI listing.
#
# page    - page number handed to uri_with_options.
# options - hash handed to build_uri and process_request.
#
# Returns the result of process_request for the built URI.
def tops(page, options = {})
  request = build_uri(TOPS_URI, options)
  process_request(uri_with_options(request, page), options)
end
Private Instance Methods
build_uri(uri, options = {})
click to toggle source
# File lib/queenshop/scraper.rb, line 90
# Builds the request description hash consumed by uri_with_options.
#
# uri     - base URI stored under :uri.
# options - hash; only a truthy :keyword entry is copied over, every other
#           key is ignored.
#
# Returns a new hash { uri: uri } with :keyword added when one was given.
def build_uri(uri, options = {})
  opts = { uri: uri }
  # Looking up :keyword on an empty hash already yields nil, so no separate
  # emptiness check is needed.
  opts[:keyword] = options[:keyword] if options[:keyword]
  opts
end
extract_data(raw)
click to toggle source
iterate over every element of item using xpath
# File lib/queenshop/scraper.rb, line 118
# Parses the raw HTML with Oga, selects the nodes matching ITEM_SELECTOR and
# runs each of them through parse.
#
# raw - HTML source passed to Oga.parse_html.
#
# Returns an array with one parse result per matched node.
def extract_data(raw)
  document = Oga.parse_html(raw)
  nodes = document.xpath(ITEM_SELECTOR)
  nodes.map { |node| parse(node) }
end
extract_images(item)
click to toggle source
extract two images and return an array of URLs
# File lib/queenshop/scraper.rb, line 147
# Builds the two image URLs for an item: the image itself and its "-h"
# variant (same path with "-h" inserted before the .jpg/.png extension).
#
# item - node queried with IMAGE_SELECTOR.
#
# Returns a two-element array of strings, both prefixed with BASE_URL.
def extract_images(item)
  path = item.xpath(IMAGE_SELECTOR).text
  hover_path = path.sub(/\.jpg/, '-h.jpg')
  # Nothing was replaced, so the path has no ".jpg"; try ".png" instead.
  hover_path = path.sub(/\.png/, '-h.png') if hover_path == path
  [path, hover_path].map { |relative| "#{BASE_URL}#{relative}" }
end
extract_link(item)
click to toggle source
get the link to the item
# File lib/queenshop/scraper.rb, line 155
# Builds the item's link by joining BASE_URL and the text found through
# LINK_SELECTOR with a "/".
#
# item - node queried with LINK_SELECTOR.
#
# Returns the link as a string.
def extract_link(item)
  relative = item.xpath(LINK_SELECTOR).text
  "#{BASE_URL}/#{relative}"
end
extract_price(item)
click to toggle source
get rid of the NT and convert to integer
# File lib/queenshop/scraper.rb, line 142
# Reads the price text, removes the first match of /NT. / and converts the
# remainder with to_i.
#
# item - node queried with PRICE_SELECTOR.
#
# Returns the price as an Integer (to_i yields 0 for non-numeric text).
def extract_price(item)
  label = item.xpath(PRICE_SELECTOR).text
  label.sub(/NT. /, '').to_i
end
extract_title(item)
click to toggle source
Iconv is necessary here; otherwise the text is unreadable
# File lib/queenshop/scraper.rb, line 135
# Reads the title text and converts it from big5 to UTF-8 with Iconv.
#
# item - node queried with TITLE_SELECTOR.
#
# Returns the converted title string.
def extract_title(item)
  converter = Iconv.new('UTF-8','big5')
  converter.iconv(item.xpath(TITLE_SELECTOR).text)
end
filter(data, options)
click to toggle source
filter by price if the options are not empty
# File lib/queenshop/scraper.rb, line 74
# Narrows the scraped items by price when a boundary was requested.
#
# data    - collection of item hashes.
# options - hash; only :price_boundary is consulted.
#
# Returns data untouched when options is empty or has no truthy
# :price_boundary, otherwise the result of match_price.
def filter(data, options)
  return data if options.empty?

  boundary = options[:price_boundary]
  boundary ? match_price(data, boundary) : data
end
match_price(data, boundary)
click to toggle source
do the actual extraction of prices from the result set
# File lib/queenshop/scraper.rb, line 83
# Selects the items whose :price lies inside an inclusive boundary.
#
# data     - collection of hashes that each carry a :price.
# boundary - object responding to first/last (e.g. [min, max]); a nil or
#            missing lower bound means 0, a nil or missing upper bound means
#            no upper limit.
#
# Returns an array of the matching items, in their original order.
def match_price(data, boundary)
  lower_bound = boundary.first || 0
  upper_bound = boundary.last || Float::INFINITY
  data.select { |item| item[:price].between?(lower_bound, upper_bound) }
end
open_uri(uri)
click to toggle source
try to open the URL; on error, return an error message string instead of raising
# File lib/queenshop/scraper.rb, line 111
# Opens the given URI and reads its whole content.
#
# uri - location passed straight to open.
#
# Returns the content that was read, or the string 'error opening site url'
# when any StandardError is raised while opening or reading.
#
# NOTE(review): this goes through Kernel#open, which can also run a command
# when given a string starting with "|"; open-uri's URL support through
# Kernel#open was removed in Ruby 3.0 in favour of URI.open. Confirm the
# targeted Ruby version before switching.
def open_uri(uri)
  open(uri, &:read)
rescue StandardError
  'error opening site url'
end
parse(item)
click to toggle source
call methods to extract the data using xpath
# File lib/queenshop/scraper.rb, line 125
# Collects the attributes of one item by calling the extract_* helpers in
# the order title, price, images, link.
#
# item - node handed to each extract_* helper.
#
# Returns a hash with the keys :title, :price, :images and :link.
def parse(item)
  title = extract_title(item)
  price = extract_price(item)
  images = extract_images(item)
  link = extract_link(item)
  { title: title, price: price, images: images, link: link }
end
process_request(uri, options)
click to toggle source
# File lib/queenshop/scraper.rb, line 67
# Runs the pipeline for one URI: open_uri, then extract_data, then filter.
#
# uri     - location handed to open_uri.
# options - hash handed to filter.
#
# Returns the result of filter.
def process_request(uri, options)
  filter(extract_data(open_uri(uri)), options)
end
scrape_what(type, options)
click to toggle source
# File lib/queenshop/scraper.rb, line 159
# Calls the method named by type once per page and collects the results.
#
# type    - name (symbol or string) of the method to call; it is invoked as
#           method(page, options).
# options - hash forwarded to every call; :page_limit (converted with to_i)
#           sets how many pages are requested, and a missing or zero value
#           falls back to 5.
#
# Returns an array holding one result per page, for pages 1..page_limit
# (empty when the limit is negative).
def scrape_what(type, options)
  requested = options[:page_limit].to_i
  page_limit = requested.zero? ? 5 : requested
  # The method object is the same for every page, so look it up once.
  fetch_page = method(type)
  (1..page_limit).map { |page| fetch_page.call(page, options) }
end
uri_with_options(options = {}, page)
click to toggle source
# File lib/queenshop/scraper.rb, line 98
# Assembles the request URI string from the hash produced by build_uri.
#
# options - hash; :uri is the base URI, :keyword an optional search keyword
#           that is converted from UTF-8 to big5 and then URI-escaped.
# page    - page number appended as "&pageno=<page>" after the base URI.
#
# Returns the assembled string; it is empty when options is empty or holds
# neither :uri nor :keyword.
def uri_with_options(options = {}, page)
  uri = ''.dup
  return uri if options.empty?

  uri << "#{options[:uri]}&pageno=#{page}" if options[:uri]
  if options[:keyword]
    # Only build the converter when there is a keyword to convert.
    keyword = Iconv.new('big5', 'UTF-8').iconv(options[:keyword])
    # Separate the keyword parameters from the preceding page parameter;
    # without the "&" the query would read "pageno=<n>br=X".
    uri << '&' unless uri.empty?
    uri << "br=X&keyword=#{URI.escape(keyword)}"
  end
  uri
end