class Hongkong::News::Scrapers::OrientalDailyScraper
Constants
- LIST_URL
Public Instance Methods
name()
click to toggle source
# File lib/hongkong/news/scrapers/oriental_daily_scraper.rb, line 12
#
# Identifier of this scraper.
#
# @return [String] always "orientaldaily"
def name
  "orientaldaily"
end
news(url)
click to toggle source
Extract the article at the given URL and return it as a Document.
# File lib/hongkong/news/scrapers/oriental_daily_scraper.rb, line 30
#
# Extract article from page.
#
# Visits +url+, waits for the "#contentCTN-right" element, and fills a new
# Document with the scraper name, the page's <h1> text, the URL, the page
# HTML, the text of "#contentCTN-top" and "#contentCTN-right" joined by a
# newline, and (when present) the absolute URL of the first image under
# "#contentCTN .photo".
#
# @param url [String] address of the article page
# @return [Document] the populated document; +image_url+ is left unset when
#   no image is found or the image has no "src" attribute
def news(url)
  visit url

  # wait for content to be loaded
  first("#contentCTN-right")

  document = Document.new
  document.source = name
  document.title = doc.css("h1").text
  document.url = url
  document.html = html

  # Array#join renders a nil script result as "" where String#+ would raise,
  # so a missing section yields an empty part instead of aborting the scrape.
  sections = ["#contentCTN-top", "#contentCTN-right"].map do |selector|
    page.evaluate_script("HongKongNews.getInnerText('#{selector}')")
  end
  document.content = sections.join("\n")

  # URI.join raises on a nil reference, so require the src attribute itself,
  # not just the <img> node.
  image = doc.search("#contentCTN .photo img").first
  src = image && image["src"]
  document.image_url = URI.join(url, src).to_s if src

  document
end
news_links()
click to toggle source
Extract all news links from the listing page (LIST_URL).
# File lib/hongkong/news/scrapers/oriental_daily_scraper.rb, line 17
#
# Extract all news links.
#
# Visits LIST_URL and builds one Link per <option> found under
# "#articleListSELECT", using the option text as the title and the option's
# "value" attribute, resolved against LIST_URL, as the URL.
#
# Options without a "value" attribute are skipped: URI.join raises on a nil
# reference, which would otherwise abort the whole listing. Links whose URL
# ends with "#" are dropped as well (presumably placeholder entries of the
# selector -- confirm against the live page).
#
# @return [Array<Link>] links with +title+ and an absolute +url+ string
def news_links
  visit LIST_URL

  options = doc.css("#articleListSELECT option").select { |option| option["value"] }

  links = options.map do |option|
    link = Link.new
    link.title = option.text
    link.url = URI.join(LIST_URL, option["value"]).to_s
    link
  end

  links.reject { |link| link.url.to_s.end_with?("#") }
end