module RestaurantCrawler
Constants
- RESTOPOLITAN_URL
- VERSION
Public Class Methods
crawl()
click to toggle source
# File lib/restaurant_crawler.rb, line 12
#
# Crawls the site rooted at RESTOPOLITAN_URL with Anemone (passing
# +delay: 0.5+) and, for every page whose URL matches +/restaurant/+,
# builds a Restaurant from the page document and saves it into the
# "restaurants.sqlite3" database. One status line is printed per page:
# "[x] ..." on success, "[ ] ..." when the save fails or when building or
# saving the restaurant raises a RuntimeError.
#
# The database handle is closed when the crawl ends, including when the
# crawl is aborted by an exception.
def self.crawl
  database = SQLite3::Database.new "restaurants.sqlite3"

  begin
    Anemone.crawl(RESTOPOLITAN_URL, delay: 0.5) do |anemone|
      anemone.on_pages_like(/.*\/restaurant\/.*/) do |page|
        begin
          restaurant = Restaurant.new page.doc

          if restaurant.save database
            puts "[x] #{restaurant} saved"
          else
            puts "[ ] failed to save #{restaurant}"
          end
        rescue RuntimeError => e
          # A single bad page must not stop the crawl: report it and move on.
          puts "[ ] #{e} : #{page.url} craweld"
        end
      end
    end
  ensure
    # Release the SQLite handle even if the crawl itself blows up.
    database.close
  end
end
find_emails()
click to toggle source
# File lib/restaurant_crawler.rb, line 30
#
# Visits the website of every row in the +restaurants+ table of
# "restaurants.sqlite3" and looks for contact links on the page.
#
# The +email+, +telephone+ and +error+ TEXT columns are added first when they
# are missing. For each row (read positionally as id, name, website) the page
# is downloaded and every <a> element is inspected:
#
# * an href containing "mailto:" is stored as the email,
# * an href starting with "tel:" (or containing the legacy "telto:" marker)
#   is stored as the telephone.
#
# When several links match, the last one wins. If at least one contact is
# found the row is updated and an "[x] ..." line is printed; otherwise, or
# when anything goes wrong for that restaurant, the message is written to the
# +error+ column and an "[ ] ..." line is printed.
#
# Only StandardError is handled per restaurant, so Interrupt / SystemExit
# still abort the run. The database is closed in every case. The return value
# is not meaningful.
def self.find_emails
  # Lazy, idempotent require: guarantees URI.open is available here.
  require 'open-uri'

  database = SQLite3::Database.new "restaurants.sqlite3"

  begin
    # add columns if needed
    %w[email telephone error].each do |column|
      begin
        database.execute "ALTER TABLE restaurants ADD COLUMN #{column} TEXT"
      rescue SQLite3::SQLException
        # The ALTER failed (typically: the column already exists) - carry on.
      end
    end

    database.execute("SELECT * FROM restaurants").each do |row|
      id = row[0]
      name = row[1]
      website = row[2]
      email = telephone = nil

      begin
        # URI.open instead of Kernel#open: the URL comes from scraped data and
        # Kernel#open may run a command for a value starting with "|".
        # The block form closes the downloaded stream once it has been parsed.
        doc = URI.open(website) { |io| Nokogiri::HTML(io) }

        # get all links
        doc.css('a').each do |link|
          href = link['href']
          next unless href # anchors without href carry no contact info

          email = href if href.include? 'mailto:'
          telephone = href if href.start_with?('tel:') || href.include?('telto:')
        end

        raise "Restaurant's email / telephone not found" unless email || telephone

        database.execute(
          "UPDATE restaurants SET email = :email, telephone = :telephone WHERE id = :id",
          { 'id' => id, 'email' => email, 'telephone' => telephone }
        )
        puts "[x] #{name} => #{email} / #{telephone}"
      rescue StandardError => e
        database.execute(
          "UPDATE restaurants SET error = :error WHERE id = :id",
          { 'id' => id, 'error' => e.message }
        )
        puts "[ ] #{name} => " + e.message
      end
    end
  ensure
    database.close
  end
end