module RelatonGb::TScrapper
Social standard scraper.
Public Class Methods
scrape_doc(hit)
click to toggle source
@param hit [RelatonGb::Hit] hit holding the standard's page path
@return [RelatonGb::GbBibliographicItem]
# File lib/relaton_gb/t_scrapper.rb, line 43
# Fetch the page of a single standard and turn it into a bibliographic item.
#
# @param hit [RelatonGb::Hit] hit whose +pid+ is appended to the site root
# @return [RelatonGb::GbBibliographicItem]
# @raise [RelatonBib::RequestError] when an HTTP, socket, SSL or timeout
#   error is raised while fetching or processing the page
def scrape_doc(hit)
  src = "http://www.ttbz.org.cn#{hit.pid}"
  page = OpenURI.open_uri(src)
  # The page is parsed as UTF-8 regardless of what it declares.
  doc = Nokogiri::HTML(page, nil, Encoding::UTF_8.to_s)
  GbBibliographicItem.new(**scrapped_data(doc, src, hit))
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError,
       Net::OpenTimeout
  raise RelatonBib::RequestError, "Cannot access #{src}"
end
scrape_page(text)
click to toggle source
rubocop:disable Metrics/MethodLength, Metrics/AbcSize
@param text [String]
@return [RelatonGb::HitCollection]
# File lib/relaton_gb/t_scrapper.rb, line 21
# Run a search on the site and wrap every result row in a Hit.
#
# @param text [String] reference to search for
# @return [RelatonGb::HitCollection]
# @raise [RelatonBib::RequestError] when an HTTP, socket, SSL or timeout
#   error is raised while fetching or processing the results
def scrape_page(text)
  # 8212 is U+2014 (em dash); hyphens in the query are sent as em dashes.
  query = CGI.escape(text.tr("-", [8212].pack("U")))
  url = "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=" + query
  search_html = OpenURI.open_uri(url).read
  page = Nokogiri::HTML(search_html)
  links = page.xpath('//table[contains(@class, "standard_list_table")]/tr/td/a')
  hits = links.map do |link|
    # The pattern is U+00E2 U+0080 U+0094: the UTF-8 bytes of an em dash
    # read one byte per character. It is turned back into a hyphen.
    docref = link.at("../preceding-sibling::td[4]").text
      .gsub(/â\u0080\u0094/, "-")
    status = link.at("../preceding-sibling::td[1]").text.delete("\r\n")
    pid = link[:href].sub(%r{\/$}, "")
    Hit.new(pid: pid, docref: docref, status: status, scrapper: self)
  end
  HitCollection.new(hits)
rescue OpenURI::HTTPError, SocketError, OpenSSL::SSL::SSLError,
       Net::OpenTimeout
  raise RelatonBib::RequestError,
        "Cannot access http://www.ttbz.org.cn/Home/Standard"
end
Private Class Methods
gbtype()
click to toggle source
# File lib/relaton_gb/t_scrapper.rb, line 97
# Constant GB type attributes used by this scraper.
#
# @return [Hash{Symbol=>String}] a new hash on every call
def gbtype
  {
    scope: "social-group",
    prefix: "T",
    mandate: "mandatory",
    topic: "other",
  }
end
get_ccs(doc)
click to toggle source
# File lib/relaton_gb/t_scrapper.rb, line 102
# Read the CCS code from the cell that follows the "中国标准分类号" label.
#
# Only the first whitespace-delimited token of the cell is kept.
#
# @param doc [#xpath] parsed page; the result of +xpath+ must respond to +text+
# @return [Array<String>] one-element array with the code, or an empty array
#   when the cell is missing or blank
def get_ccs(doc)
  cell = doc.xpath('//td[contains(.,"中国标准分类号")]/following-sibling::td[1]')
    .text.gsub(/[\r\n]/, "").strip
  code = cell[/^[^\s]+/]
  # A blank cell used to produce [""]; report "no code" as an empty list.
  code ? [code] : []
end
get_committee(doc, _ref)
click to toggle source
rubocop:enable Metrics/MethodLength
# File lib/relaton_gb/t_scrapper.rb, line 79
# Build the committee hash from the cell that follows the "团体名称" label.
#
# @param doc [#xpath] parsed page; the result of +xpath+ must respond to +text+
# @param _ref [Object] unused
# @return [Hash] +:name+ with the cell text and +:type+ "technical"
def get_committee(doc, _ref)
  group_name = doc.xpath('//td[.="团体名称"]/following-sibling::td[1]').text
  { name: group_name, type: "technical" }
end
get_dates(doc)
click to toggle source
# File lib/relaton_gb/t_scrapper.rb, line 114
# Read the publication date from the span that follows the "发布日期" label.
#
# The cell must contain a four-digit year, a two-digit month and a two-digit
# day, each separated by at least one non-digit character.
#
# @param doc [#xpath] parsed page; the result of +xpath+ must respond to +text+
# @return [Array<Hash>] one "published" entry with +:on+ as "YYYY-MM-DD", or
#   an empty array when no such date is found
def get_dates(doc)
  cell = doc.xpath('//td[contains(.,"发布日期")]/following-sibling::td[1]/span').text
  d = cell.match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
  # Without a match `d` is nil and indexing it would raise NoMethodError.
  return [] unless d

  [{ type: "published", on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
end
get_ics(doc)
click to toggle source
# File lib/relaton_gb/t_scrapper.rb, line 107
# Read the ICS code from the span that follows the "国际标准分类号" label.
#
# Only the first whitespace-delimited token is used. It is split on "." into
# field, group and subgroup, and the group is right-padded with "0" to three
# characters.
#
# @param doc [#xpath] parsed page; the result of +xpath+ must respond to +text+
# @return [Array<Hash>] one hash with +:field+, +:group+ and +:subgroup+
#   (missing segments are nil), or an empty array when the cell holds no code
def get_ics(doc)
  xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
  field, group, subgroup = doc.xpath(xpath).text[/^[^\s]+/].to_s.split(".")
  # A blank cell leaves every segment nil; report that as "no ICS".
  return [] unless field

  # A field-only code has no group; padding nil would raise NoMethodError.
  [{ field: field, group: group && group.ljust(3, "0"), subgroup: subgroup }]
end
get_titles(doc)
click to toggle source
# File lib/relaton_gb/t_scrapper.rb, line 86
# Collect the titles: the Chinese one always, the English one when present.
#
# @param doc [#at, #xpath] parsed page
# @return [Object] result of RelatonBib::TypedTitleString.from_string for the
#   Chinese title, with the English titles appended via +
#   when the "英文标题" cell is not empty
def get_titles(doc)
  zh_cell = doc.at('//td[contains(.,"中文标题")]/following-sibling::td[1]')
  titles = RelatonBib::TypedTitleString.from_string(zh_cell.text, "zh", "Hans")
  en_text = doc.xpath('//td[contains(.,"英文标题")]/following-sibling::td[1]').text
  if en_text.empty?
    titles
  else
    titles + RelatonBib::TypedTitleString.from_string(en_text, "en", "Latn")
  end
end
scrapped_data(doc, src, hit)
click to toggle source
rubocop:disable Metrics/MethodLength
@param doc [Nokogiri::HTML::Document]
@param src [String]
@param hit [RelatonGb::Hit]
@return [Hash]
# File lib/relaton_gb/t_scrapper.rb, line 58
# Assemble the attribute hash that scrape_doc splats into
# GbBibliographicItem.new.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param src [String] URL the page was fetched from
# @param hit [RelatonGb::Hit] hit supplying +docref+ and +status+
# @return [Hash]
def scrapped_data(doc, src, hit)
  # docid_xpt = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
  # status_xpt = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'
  {
    committee: get_committee(doc, hit.docref),
    docid: get_docid(hit.docref),
    title: get_titles(doc),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: gbtype,
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [
      { type: "src", content: src },
    ],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(hit.docref),
  }
end