class ClosestWeightliftingGem::Scraper
Constants
- BASE_URL
Public Class Methods
extract_details(info_div)
click to toggle source
# File lib/closest_weightlifting_gem/scraper.rb, line 79
# Extracts address and contact fields from a gym listing's info element.
#
# The children of +info_div+ are filtered to drop <br> and <b> elements, and
# the remaining nodes are interpreted by how many are left:
#   1 node  -> phone
#   3 nodes -> phone, director
#   4 nodes -> phone, website (text), director
#   other   -> street, "City, ST ZIP", phone, website (href), director
# In the last layout the "City, ST ZIP" node is looked for at index 1 and,
# failing that, at index 2; the fields after it are read relative to
# wherever it was found.
#
# @param info_div [#children] node whose children respond to +name+, +text+
#   and (for the website link) +[]+
# @return [Hash{Symbol => String}] keys :street, :city, :state, :zipcode,
#   :phone, :website, :director; any field that could not be read is ""
def self.extract_details(info_div)
  details = {
    street: "", city: "", state: "", zipcode: "",
    phone: "", website: "", director: ""
  }

  begin
    nodes = info_div.children.reject { |el| %w[br b].include?(el.name) }

    case nodes.length
    when 1
      details[:phone] = nodes[0].text
    when 3
      details[:phone] = nodes[0].text
      details[:director] = nodes[1].text
    when 4
      details[:phone] = nodes[0].text
      details[:website] = nodes[1].text
      details[:director] = nodes[2].text
    else
      # The city line normally follows the street; if index 1 has no ", "
      # it is taken to be at index 2 and every later field shifts by one.
      city_index = nodes[1].text.include?(", ") ? 1 : 2
      city, state_zip = nodes[city_index].text.split(", ")
      # Split on runs of whitespace so both "ST ZIP" and "ST  ZIP" parse.
      state, zip = state_zip.split(/[[:space:]]+/)

      # NOTE(review): only the first street node is kept, as before.
      details[:street] = nodes[0].text
      details[:city] = city
      details[:state] = state
      details[:zipcode] = zip
      details[:phone] = nodes[city_index + 1].text
      details[:website] = nodes[city_index + 2]["href"]
      director_node = nodes[city_index + 3]
      details[:director] = director_node.text if director_node
    end
  rescue StandardError => e
    # Best effort: report the problem and return whatever was collected.
    puts "[ERROR] extracting details from: #{nodes}"
    puts "[ERROR] #{e}"
  end

  # Normalise any nil left by a missing attribute or short split to "".
  details.each { |key, value| details[key] = "" if value.nil? }
  details
end
get_state_abbreviations(index)
click to toggle source
# File lib/closest_weightlifting_gem/scraper.rb, line 9
# Collects the "value" attribute of every child of the
# <select id="CompanyState"> element except the first two.
#
# The first two children are skipped, as in the original code (presumably a
# whitespace node and a placeholder option - confirm against the live page).
# Returns an empty array when the select is missing or has fewer than three
# children.
#
# @param index [#search] parsed index document
# @return [Array] one "value" attribute per remaining child
def self.get_state_abbreviations(index)
  options = index.search("select#CompanyState").children
  options.drop(2).map { |child| child.attr("value") }
end
scrape_gym_page(gym_row)
click to toggle source
# File lib/closest_weightlifting_gem/scraper.rb, line 62
# Fetches a gym's detail page and builds a Gym from it plus the listing row.
#
# The detail-page path is taken from the onclick attribute of the row's
# first link (the part matching /\/V.+true/) and appended to BASE_URL.
# Name, street, director, coach and website come from the fetched page;
# city, state, zipcode and phone come from the row's <p> children.
#
# @param gym_row [#search] listing row node for one gym
# @return [ClosestWeightliftingGem::Gym]
# @raise [NoMethodError] if the row has no link or its onclick attribute
#   does not match the expected pattern
def self.scrape_gym_page(gym_row)
  usaw_url = gym_row.search("a").first.attr("onclick").match(/\/V.+true/)[0]

  # URI.open (open-uri) replaces Kernel#open, which no longer opens URLs on
  # Ruby 3.0+; the block form closes the response stream after parsing.
  gym_doc = URI.open(BASE_URL + usaw_url) { |page| Nokogiri::HTML(page) }

  row_fields = gym_row.search("p").children
  locality = row_fields[2].to_s.split(/\W+/)

  # Text after "site:" up to the first carriage return, minus its leading
  # character; "" when the page has no "site:" marker.
  site_text = gym_doc.text.split("site:")[1].to_s.split("\r").first.to_s
  website = site_text[1..-1].to_s

  ClosestWeightliftingGem::Gym.new({
    :name => gym_doc.search("h3").text,
    :street => gym_doc.search("p").children[0].to_s,
    :city => locality[0],
    :state => locality[1],
    :zipcode => locality[-1],
    :phone => row_fields[4].to_s,
    :director => gym_doc.search(".fe_big_row:nth-child(2) td+ td").text,
    :coach => gym_doc.search(".fe_big_row+ .fe_big_row td+ td").text,
    :website => website,
    :usaw_url => usaw_url
  })
end
scrape_main()
click to toggle source
# File lib/closest_weightlifting_gem/scraper.rb, line 13
# Entry point: fetches the club index page, reads the state values from it
# and scrapes each state's page in turn, printing progress to stdout.
#
# @return [nil]
def self.scrape_main
  puts "Fetching index..."

  # URI.open (open-uri) replaces Kernel#open, which no longer opens URLs on
  # Ruby 3.0+; the block form closes the response stream after parsing.
  index = URI.open("#{BASE_URL}/Clubs.wp?frm=t&RF=Zp%2CST") { |page| Nokogiri::HTML(page) }

  get_state_abbreviations(index).each { |state| scrape_state_page(state) }

  puts "\n\nSorry that took so long."
end
scrape_state_page(state)
click to toggle source
Form data: do I need cookies, etc. now?
# File lib/closest_weightlifting_gem/scraper.rb, line 24
# POSTs the club-search form for one state and creates a Gym for every
# "#wp_Clubs li" row in the response.
#
# Name comes from the row's <h3>; the address and contact fields come from
# extract_details applied to the row's first <p>.
#
# @param state [String] value submitted as the CompanyState form field
# @return the result of iterating the matched rows
def self.scrape_state_page(state)
  puts "Fetching gym data in #{state}..."

  data = {
    'wp_ClientOrgID' => '',
    'CompanyParentID' => '',
    'CompanyName' => '',
    'CompanyState' => state,
    'geo_Zip' => '',
    'geo_Miles' => 25,
    'submit' => 'Go'
  }

  url = URI("#{BASE_URL}/Clubs.wp?frm=t&RF=Zp%2CST")
  http = Net::HTTP.new(url.host, url.port)
  http.use_ssl = true

  request = Net::HTTP::Post.new(url)
  request["Content-Type"] = 'application/x-www-form-urlencoded'
  request["cache-control"] = 'no-cache'
  # Build the body from the form hash so the state value is URL-encoded and
  # the field list lives in one place.
  request.body = URI.encode_www_form(data)

  response = http.request(request)
  state_doc = Nokogiri::HTML(response.read_body)

  state_doc.search("#wp_Clubs li").each do |gym_row|
    details = extract_details(gym_row.search("p")[0])

    ClosestWeightliftingGem::Gym.new({
      :name => gym_row.search("h3").text.strip,
      :street => details[:street],
      :city => details[:city],
      :state => state,
      :zipcode => details[:zipcode],
      :phone => details[:phone],
      :website => details[:website],
      :director => details[:director]
    })
  end
end