module RelatonGb::SecScrapper
Sector standard scrapper
Public Class Methods
scrape_doc(hit)
click to toggle source
@param hit [RelatonGb::Hit] @return [RelatonGb::GbBibliographicItem]
# File lib/relaton_gb/sec_scrapper.rb, line 41
# Fetches the detail page identified by the hit's +pid+ and builds a
# bibliographic item from the data returned by +scrapped_data+.
#
# @param hit [RelatonGb::Hit]
# @return [RelatonGb::GbBibliographicItem]
# @raise [RelatonBib::RequestError] when one of the rescued network or
#   protocol errors occurs anywhere in the method body
def scrape_doc(hit)
  src = "http://hbba.sacinfo.org.cn/stdDetail/#{hit.pid}"
  html = Net::HTTP.get(URI(src))
  doc = Nokogiri::HTML(html)
  # Explicit parentheses keep the double splat from being read as an operator.
  GbBibliographicItem.new(**scrapped_data(doc, src, hit))
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT,
       Net::OpenTimeout
  raise RelatonBib::RequestError, "Cannot access #{src}"
end
scrape_page(text)
click to toggle source
@param text [String] code of the standard to search for @return [RelatonGb::HitCollection]
# File lib/relaton_gb/sec_scrapper.rb, line 20
# Posts the search text to the query endpoint and wraps every entry of the
# response's "records" array in a Hit.
#
# @param text [String] code of the standard to search for
# @return [RelatonGb::HitCollection]
# @raise [RelatonBib::RequestError] when one of the rescued network or
#   protocol errors occurs
def scrape_page(text)
  # An earlier implementation issued a GET against
  # www.std.gov.cn/hb/search/hbPage and read the "rows" key; that path is no
  # longer used.
  uri = URI("http://hbba.sacinfo.org.cn/stdQueryList")
  form = URI.encode_www_form({ key: text })
  response = Net::HTTP.post(uri, form)
  records = JSON.parse(response.body)["records"]
  hits = records.map do |rec|
    Hit.new(pid: rec["pk"], docref: rec["code"], status: rec["status"], scrapper: self)
  end
  HitCollection.new(hits)
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError, Errno::ETIMEDOUT,
       Net::OpenTimeout
  raise RelatonBib::RequestError, "Cannot access #{uri}"
end
Private Class Methods
get_ccs(doc)
click to toggle source
@param doc [Nokogiri::HTML::Document] @return [Array<String>]
# File lib/relaton_gb/sec_scrapper.rb, line 91
# Reads the text of the <dd> element that follows the <dt> whose text
# contains the label '中国标准分类号'.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<String>] one-element array holding the <dd> text, or an
#   empty array when the page has no such element
def get_ccs(doc)
  node = doc.at("//dt[contains(text(), '中国标准分类号')]/following-sibling::dd")
  # `at` yields nil when nothing matches; return no codes rather than
  # failing with NoMethodError on nil.
  node ? [node.text] : []
end
get_committee(_doc, ref)
click to toggle source
@param _doc [Nokogiri::HTML::Document] @param ref [String] @return [Hash]
* :type [String] * :name [String]
# File lib/relaton_gb/sec_scrapper.rb, line 77
# Builds the committee hash for a document reference. The name is the
# "administration" entry of whatever +get_prefix+ returns for the reference.
#
# @param _doc [Nokogiri::HTML::Document] unused
# @param ref [String]
# @return [Hash] with keys :type (always "technical") and :name
def get_committee(_doc, ref)
  {
    type: "technical",
    name: get_prefix(ref)["administration"],
  }
end
get_scope(_doc)
click to toggle source
@param _doc [Nokogiri::HTML::Document] @return [String]
# File lib/relaton_gb/sec_scrapper.rb, line 85
# Scope of documents handled by this scrapper; the value is fixed and does
# not depend on the page.
#
# @param _doc [Nokogiri::HTML::Document] unused
# @return [String] always "sector"
def get_scope(_doc)
  "sector"
end
get_titles(doc)
click to toggle source
@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
* :title_intro [String] * :title_main [String] * :language [String] * :script [String]
# File lib/relaton_gb/sec_scrapper.rb, line 60
# Builds the title data from the text of the <h4> element returned by
# +doc.at+, tagged as language "zh" and script "Hans".
#
# An earlier version assembled title hashes by hand and also tried to read an
# English title from the '英文标准名称' cell; that code is disabled.
#
# @param doc [Nokogiri::HTML::Document]
# @return the result of RelatonBib::TypedTitleString.from_string
def get_titles(doc)
  heading = doc.at("//h4")
  # Strip carriage returns, newlines and tabs from the heading text.
  zh_title = heading.text.delete("\r\n\t")
  RelatonBib::TypedTitleString.from_string(zh_title, "zh", "Hans")
end