module RelatonGb::GbScrapper

National standard scrapper.

Public Class Methods

get_committee(doc, _ref) click to toggle source

@param doc [Nokogiri::HTML] @param _ref [String] @return [Hash]

* :type [String]
* :name [String]
# File lib/relaton_gb/gb_scrapper.rb, line 50
# Reads the committee name from a standard's page.
#
# The name is taken from the first element matching the XPath below: a
# `div` that follows, as a sibling, a `div` whose text contains the label
# '归口单位'.
#
# @param doc [Nokogiri::HTML] parsed page of the standard
# @param _ref [String] not used by this method
# @return [Hash] `:type` is always "technical"; `:name` is the element's
#   text with every carriage return, line feed and tab character removed
def get_committee(doc, _ref)
  committee = doc.at(
    "//div[contains(text(), '归口单位')]/following-sibling::div",
  )
  cleaned = committee.text.delete("\r\n\t\t")
  { type: "technical", name: cleaned }
end
scrape_doc(hit) click to toggle source

@param hit [RelatonGb::Hit] standard's page id @return [RelatonGb::GbBibliographicItem]

# File lib/relaton_gb/gb_scrapper.rb, line 37
# Fetches the page of a single standard and builds a bibliographic item
# from the data scraped out of it.
#
# @param hit [RelatonGb::Hit] hit whose `pid` is appended to the page URL
# @return [RelatonGb::GbBibliographicItem]
# @raise [RelatonBib::RequestError] when fetching the page fails with one of
#   the HTTP, socket, SSL or open-timeout errors rescued below
def scrape_doc(hit)
  src = "http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=" + hit.pid
  page = OpenURI.open_uri(src)
  doc = Nokogiri::HTML(page)
  attrs = scrapped_data(doc, src, hit)
  GbBibliographicItem.new(**attrs)
rescue OpenURI::HTTPError, SocketError,
       OpenSSL::SSL::SSLError, Net::OpenTimeout
  raise RelatonBib::RequestError, "Cannot access #{src}"
end
scrape_page(text) click to toggle source

@param text [String] code of standard for search @return [RelatonGb::HitCollection]

# File lib/relaton_gb/gb_scrapper.rb, line 17
def scrape_page(text)
  search_html = OpenURI.open_uri(
    "http://openstd.samr.gov.cn/bzgk/gb/std_list?p.p2=" + text,
  )
  result = Nokogiri::HTML search_html
  hits = result.xpath(
    "//table[contains(@class, 'result_list')]/tbody[2]/tr",
  ).map do |h|
    ref = h.at "./td[2]/a"
    pid = ref[:onclick].match(/[0-9A-F]+/).to_s
    rdate = h.at("./td[7]").text
    Hit.new pid: pid, docref: ref.text, scrapper: self, release_date: rdate
  end
  HitCollection.new hits.sort_by(&:release_date).reverse
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError, Net::OpenTimeout
  raise RelatonBib::RequestError, "Cannot access http://www.std.gov.cn/bzgk/gb/std_list"
end