class NHKore::BingScraper

@author Jonathan Bradley Whited
@since 0.2.0

Attributes

regex[R]
site[R]

Public Class Methods

build_url(site,count: DEFAULT_RESULT_COUNT,**kargs) click to toggle source
# File lib/nhkore/search_scraper.rb, line 98
# Builds the Bing search URL that restricts results to +site+.
#
# @param site [#to_s] the site to search; interpolated into a +site:+ query
# @param count [Integer] value sent as the +count+ query parameter
# @param kargs [Hash] accepted for call compatibility; not used by this method
# @return [String] a new (unfrozen) URL string
def self.build_url(site,count: DEFAULT_RESULT_COUNT,**kargs)
  query = URI.encode_www_form(q: "site:#{site}",count: count)

  # Array#join always hands back a fresh, mutable String.
  return ['https://www.bing.com/search?',query].join
end
new(site,regex: nil,url: nil,**kargs) click to toggle source
Calls superclass method NHKore::SearchScraper::new
# File lib/nhkore/search_scraper.rb, line 74
# Sets up a scraper for one of the two known sites.
#
# @param site [Symbol] +:futsuu+ or +:yasashii+; mapped to the matching site constant
# @param regex [Object,nil] pattern later matched against links; when nil, the
#   selected site's default regex constant is used
# @param url [String,nil] search URL; when nil, it is built via +build_url+
# @param kargs [Hash] passed to +build_url+ and then, without +:count+, to the superclass
# @raise [ArgumentError] if +site+ is not a known symbol or the regex is still nil
def initialize(site,regex: nil,url: nil,**kargs)
  default_regex,resolved_site =
    case site
    when :futsuu   then [FUTSUU_REGEX,FUTSUU_SITE]
    when :yasashii then [YASASHII_REGEX,YASASHII_SITE]
    else raise ArgumentError,"invalid site[#{site}]"
    end

  # Only a nil regex falls back to the site's default.
  regex = default_regex if regex.nil?

  raise ArgumentError,"empty regex[#{regex}]" if regex.nil?

  @regex = regex
  @site = resolved_site

  url = self.class.build_url(resolved_site,**kargs) if url.nil?

  # +count+ is only meaningful to build_url; don't pass it on to Open-URI.
  kargs.delete(:count)

  super(url,**kargs)
end

Public Instance Methods

scrape(slinks,page=NextPage.new()) click to toggle source
# File lib/nhkore/search_scraper.rb, line 110
# Scrapes the HTML results first; if that yields no links, tries the RSS
# results as well.
#
# @param slinks [Object] collection that found links are added to
# @param page [Object] the current page; passed through to the scrapers
# @return [Object] the next page returned by +scrape_html+ (the same object
#   is handed to +scrape_rss+ when the fallback runs)
def scrape(slinks,page=NextPage.new())
  next_page,found = scrape_html(slinks,page)

  # Fall back to RSS only when the HTML pass found nothing.
  scrape_rss(slinks,page,next_page) unless found > 0

  return next_page
end
scrape_html(slinks,page,next_page=NextPage.new()) click to toggle source
# File lib/nhkore/search_scraper.rb, line 120
# Walks every anchor of the HTML document, collecting links that match
# +regex+ and tracking the closest following results page.
#
# An anchor whose href contains +first=<digits>+ is treated as a paging
# link and is never tested against +regex+.
#
# @param slinks [Object] collection that matching links are added to
# @param page [Object] the current page; only its +count+ is read
# @param next_page [Object] updated in place with the smallest +first=+
#   value that is greater than +page.count+
# @return [Array] +[next_page,link_count]+
def scrape_html(slinks,page,next_page=NextPage.new())
  link_count = 0

  html_doc.css('a').each do |anchor|
    href = Util.unspace_web_str(anchor['href'].to_s).downcase

    next if ignore_link?(href)

    offset = href[/first=(\d+)/,1]

    if offset.nil?
      # Not a paging link, so it may be a result link.
      next unless href =~ regex

      slinks.add_link(SearchLink.new(href))
      link_count += 1
    else
      candidate = offset.to_i

      # Must be past the current page...
      next unless candidate > page.count
      # ...and closer than any next page found so far.
      next unless next_page.count < 0 || candidate < next_page.count

      next_page.count = candidate
      next_page.url = join_url(href)
    end
  end

  return [next_page,link_count]
end
scrape_rss(slinks,page,next_page=NextPage.new()) click to toggle source
# File lib/nhkore/search_scraper.rb, line 149
# Re-opens the current URL with +format=rss+ and collects links from the
# RSS items. Does nothing when +@is_file+ is set.
#
# @param slinks [Object] collection that matching links are added to
# @param page [Object] the current page; its +count+, +url+, and
#   +rss_links+ are read
# @param next_page [Object] filled in (count, rss_links, url) only if it is
#   empty, the feed has items, and the links differ from +page.rss_links+
# @return [Array] +[next_page,link_count]+
def scrape_rss(slinks,page,next_page=NextPage.new())
  link_count = 0

  return [next_page,link_count] if @is_file

  rss_uri = URI(@url)
  Util.replace_uri_query!(rss_uri,format: 'rss')
  self.open(rss_uri)

  items = rss_doc.items
  rss_links = []

  items.each do |item|
    link = Util.unspace_web_str(item.link.to_s).downcase

    # Every link is remembered, even ones that are filtered out below.
    rss_links.push(link)

    next if ignore_link?(link) || link !~ regex

    slinks.add_link(SearchLink.new(link))
    link_count += 1
  end

  # For RSS, Bing will keep returning the same links over and over
  # if it's the last page or the "first=" query is the wrong count.
  # Therefore, we have to test the previous RSS links (+page.rss_links+).
  if next_page.empty? && items.length >= 1 && page.rss_links != rss_links
    # A negative page count is treated as 0 before advancing by the item count.
    next_page.count = [page.count,0].max + items.length
    next_page.rss_links = rss_links

    base_url = page.url
    base_url = @url if base_url.nil?

    next_uri = URI(base_url)
    Util.replace_uri_query!(next_uri,first: next_page.count)

    next_page.url = next_uri
  end

  return [next_page,link_count]
end