module RelatonGb::SecScrapper

Sector standard scrapper

Public Class Methods

scrape_doc(hit) click to toggle source

@param hit [RelatonGb::Hit] @return [RelatonGb::GbBibliographicItem]

# File lib/relaton_gb/sec_scrapper.rb, line 41
# Fetch the detail page of a sector standard and build a bibliographic item.
#
# The page URL is derived from +hit.pid+; the downloaded HTML is parsed and
# handed, together with the URL and the hit, to +scrapped_data+, whose result
# is splatted as keyword arguments into GbBibliographicItem.new.
#
# @param hit [RelatonGb::Hit]
# @return [RelatonGb::GbBibliographicItem]
# @raise [RelatonBib::RequestError] if one of the rescued network errors occurs
def scrape_doc(hit)
  src = "http://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
  detail_uri = URI(src)
  doc = Nokogiri::HTML(Net::HTTP.get(detail_uri))
  GbBibliographicItem.new(**scrapped_data(doc, src, hit))
rescue SocketError, EOFError, Timeout::Error,
       Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
       Net::OpenTimeout, OpenSSL::SSL::SSLError
  # Report every connectivity problem uniformly, naming the page URL.
  raise RelatonBib::RequestError, "Cannot access #{src}"
end
scrape_page(text) click to toggle source

@param text [String] code of standard for search @return [RelatonGb::HitCollection]

# File lib/relaton_gb/sec_scrapper.rb, line 20
# Search the sector-standard registry for a standard code.
#
# Posts +text+ as the form field +key+ to the query endpoint, parses the JSON
# response and turns every entry of its "records" list into a Hit.
#
# @param text [String] code of standard to search for
# @return [RelatonGb::HitCollection]
# @raise [RelatonBib::RequestError] if one of the rescued network errors occurs
def scrape_page(text)
  uri = URI("http://hbba.sacinfo.org.cn/stdQueryList")
  resp = Net::HTTP.post(uri, URI.encode_www_form({ key: text }))
  records = JSON.parse(resp.body)["records"]
  hits = records.map do |record|
    Hit.new(pid: record["pk"], docref: record["code"],
            status: record["status"], scrapper: self)
  end
  HitCollection.new(hits)
rescue SocketError, EOFError, Timeout::Error,
       Errno::EINVAL, Errno::ECONNRESET, Errno::ETIMEDOUT,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
       Net::OpenTimeout, OpenSSL::SSL::SSLError
  # Report every connectivity problem uniformly, naming the query URL.
  raise RelatonBib::RequestError, "Cannot access #{uri}"
end

Private Class Methods

get_ccs(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<String>]

# File lib/relaton_gb/sec_scrapper.rb, line 91
# Read the CCS value from the detail page.
#
# Takes the <dd> that follows the <dt> whose text contains the
# "中国标准分类号" label and returns its text wrapped in a one-element array.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<String>]
def get_ccs(doc)
  ccs_node = doc.at("//dt[contains(text(), '中国标准分类号')]/following-sibling::dd")
  ccs_text = ccs_node.text
  [ccs_text]
end
get_committee(_doc, ref) click to toggle source

@param _doc [Nokogiri::HTML::Document] @param ref [String] @return [Hash]

* :type [String]
* :name [String]
# File lib/relaton_gb/sec_scrapper.rb, line 77
# Build the committee description for a standard reference.
#
# The committee name is the value stored under the "administration" key of
# whatever +get_prefix(ref)+ returns; the type is always "technical".
#
# @param _doc [Nokogiri::HTML::Document] unused
# @param ref [String]
# @return [Hash] with keys :type [String] and :name
def get_committee(_doc, ref)
  { type: "technical", name: get_prefix(ref)["administration"] }
end
get_scope(_doc) click to toggle source

@param _doc [Nokogiri::HTML::Document] @return [String]

# File lib/relaton_gb/sec_scrapper.rb, line 85
# Scope of the standards handled by this scrapper.
#
# The document argument is ignored; the value is a constant.
#
# @param _doc [Nokogiri::HTML::Document] unused
# @return [String] always "sector"
def get_scope(_doc)
  "sector"
end
get_titles(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

* :title_intro [String]
* :title_main [String]
* :language [String]
* :script [String]
# File lib/relaton_gb/sec_scrapper.rb, line 60
# Extract the Chinese title from the detail page.
#
# Reads the text of the first <h4> element, removes carriage returns,
# newlines and tabs, and passes it with language "zh" and script "Hans" to
# RelatonBib::TypedTitleString.from_string. Only the Chinese title is
# processed; no English title is extracted.
#
# @param doc [Nokogiri::HTML::Document]
# @return the result of RelatonBib::TypedTitleString.from_string
def get_titles(doc)
  heading = doc.at("//h4")
  zh_title = heading.text.delete("\r\n\t")
  RelatonBib::TypedTitleString.from_string(zh_title, "zh", "Hans")
end