module RelatonGb::Scrapper

Common scraping methods.

Constants

STAGES

Public Instance Methods

fetch_structuredidentifier(docref) click to toggle source

@param docref [String] @return [RelatonIsoBib::StructuredIdentifier]

# File lib/relaton_gb/scrapper.rb, line 51
# Builds the structured identifier for a Chinese Standard reference.
#
# The leading run of characters free of dashes and periods (ending in
# digits) becomes the project number; digits that directly follow a period
# become the part number (an empty string when there is none).
#
# @param docref [String] document reference, e.g. "GB/T 5606.1-2004"
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(docref)
  project, part = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/).captures
  RelatonIsoBib::StructuredIdentifier.new(
    type: "Chinese Standard",
    id: docref,
    prefix: nil,
    project_number: project,
    part_number: part,
  )
end
get_contributors(doc, docref) click to toggle source

@param doc [Nokogiri::HTML::Document] @param docref [String] @return [Array<Hash>]

# File lib/relaton_gb/scrapper.rb, line 62
# Derives the publishing organization from the document reference.
#
# The first whitespace-free token of the reference is used as the agency
# code; a trailing "/T" or "/Z" is stripped unless the code starts with "GB".
# The agency name is looked up in English and Chinese, and languages with no
# name are skipped.
#
# @param doc [Nokogiri::HTML::Document]
# @param docref [String]
# @return [Array<Hash>] a single publisher entry, or an empty array when no
#   agency name could be resolved
def get_contributors(doc, docref)
  agency = docref.match(/^[^\s]+/).to_s
  agency.sub!(%r{/[TZ]$}, "") unless agency.start_with?("GB")
  gb_type = get_gbtype(doc, docref)

  names = %w[en zh].each_with_object([]) do |lang, found|
    localized = org(lang, agency, gb_type)
    found << localized unless localized.nil?
  end
  return [] if names.none?

  publisher = RelatonBib::Organization.new(name: names)
  [{ entity: publisher, role: [{ type: "publisher" }] }]
end
get_docid(docref) click to toggle source

@param docref [String] @return [Array<RelatonBib::DocumentIdentifier>]

# File lib/relaton_gb/scrapper.rb, line 45
# Wraps the document reference in a one-element list of identifiers.
#
# @param docref [String]
# @return [Array<RelatonBib::DocumentIdentifier>] one identifier of type
#   "Chinese Standard"
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(
    type: "Chinese Standard", id: docref,
  )
  [identifier]
end
get_status(doc, status = nil) click to toggle source

@param doc [Nokogiri::HTML::Document] @param status [String, NilClass] @return [RelatonBib::DocumentStatus]

# File lib/relaton_gb/scrapper.rb, line 103
# Maps a status label to a document status via the STAGES table.
#
# When no status is passed in, the label is read from the page's status
# cell; a missing cell gives a nil label.
#
# @param doc [Nokogiri::HTML::Document]
# @param status [String, NilClass] label that takes precedence over the page
# @return [RelatonBib::DocumentStatus]
def get_status(doc, status = nil)
  label = status || doc.at("//td[contains(., '标准状态')]/span")&.text
  RelatonBib::DocumentStatus.new(stage: STAGES[label])
end
get_titles(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<RelatonBib::TypedTitleString>]

# File lib/relaton_gb/scrapper.rb, line 87
# Extracts the Chinese title and, when present, the English title.
#
# The English title is the first run of word characters and whitespace in
# the English-name cell; when that run is empty only the Chinese titles are
# returned.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::TypedTitleString>]
def get_titles(doc)
  zh_name = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  zh_titles = RelatonBib::TypedTitleString.from_string(zh_name, "zh", "Hans")

  en_name = doc.at("//td[contains(text(), '英文标准名称')]").text[/[\w\s]+/].to_s
  en_name.empty? ? zh_titles : zh_titles + RelatonBib::TypedTitleString.from_string(en_name, "en", "Latn")
end
get_type() click to toggle source
# File lib/relaton_gb/scrapper.rb, line 96
# Document type used for every scraped record.
#
# @return [String] always "standard"
def get_type
  "standard"
end
org(lang, name, gbtype) click to toggle source

@param lang [String] @param name [String] @param gbtype [Hash] @return [Hash, nil]

# File lib/relaton_gb/scrapper.rb, line 77
# Looks up the localized name of the agency that issues a standard.
#
# @param lang [String] language passed to GbAgencies and echoed in the result
# @param name [String] agency code handed to the agency lookup
# @param gbtype [Hash] only :scope and :mandate are read
# @return [Hash, nil] { language:, content: }, or nil when the lookup yields
#   no name
def org(lang, name, gbtype)
  agencies = GbAgencies::Agencies.new(lang, {}, "")
  scope, mandate = gbtype.values_at(:scope, :mandate)
  content = agencies.standard_agency1(scope, name, mandate)
  content ? { language: lang, content: content } : nil
end
scrapped_data(doc, src, hit) click to toggle source

@param doc [Nokogiri::HTML::Document] @param src [String] @param hit [RelatonGb::Hit] @return [Hash]

# File lib/relaton_gb/scrapper.rb, line 22
# Assembles the attribute hash describing one scraped standard.
#
# Most values come from the helper named after the key; :fetched is today's
# date, :language and :script are fixed to "zh" / "Hans", and :link holds a
# single "src" entry pointing at +src+.
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param src [String] stored as the content of the "src" link
# @param hit [RelatonGb::Hit] supplies the document reference (docref) and
#   the status passed to get_status
# @return [Hash]
def scrapped_data(doc, src, hit)
  {
    fetched: Date.today.to_s,
    # NOTE(review): get_committee is not defined in this module's visible
    # source — presumably supplied by the including scrapper; confirm.
    committee: get_committee(doc, hit.docref),
    docid: get_docid(hit.docref),
    title: get_titles(doc),
    contributor: get_contributors(doc, hit.docref),
    doctype: get_type,
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, hit.docref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [{ type: "src", content: src }],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(hit.docref),
  }
end

Private Instance Methods

get_ccs(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<String>]

# File lib/relaton_gb/scrapper.rb, line 124
# Reads the Chinese Classification for Standards (CCS) code from the page.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<String>] the CCS code with CR/LF/tab characters removed, or
#   an empty array when the page has no CCS element
def get_ccs(doc)
  ccs = doc.at("//div[contains(text(), '中国标准分类号')]/following-sibling::div")
  # A page without the CCS block must not abort the scrape; get_ics treats a
  # missing node the same way.
  return [] unless ccs

  [ccs.text.delete("\r\n\t\t")]
end
get_dates(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

* :type [String] type of date
* :on [String] date
# File lib/relaton_gb/scrapper.rb, line 181
# Reads the publication date from the page.
#
# Two page layouts are queried: a div followed by a sibling div, and a
# dt/dd pair.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one entry, or an empty array when no date element is
#   found
#   * :type [String] always "published"
#   * :on [String] date text with CR/LF/tab characters removed
def get_dates(doc)
  date = doc.at("//div[contains(text(), '发布日期')]/following-sibling::div"\
                " | //dt[contains(text(), '发布日期')]/following-sibling::dd")
  # A page without a publication date must not abort the scrape; get_ics
  # treats a missing node the same way.
  return [] unless date

  [{ type: "published", on: date.text.delete("\r\n\t\t") }]
end
get_gbtype(doc, ref) click to toggle source

@param doc [Nokogiri::HTML::Document] @param ref [String] @return [Hash]

* :scope [String]
* :prefix [String]
* :mandate [String]
# File lib/relaton_gb/scrapper.rb, line 116
# Describes the kind of GB standard a reference denotes.
#
# @param doc [Nokogiri::HTML::Document]
# @param ref [String] document reference
# @return [Hash]
#   * :scope [String, nil] result of get_scope
#   * :prefix [String, nil] "prefix" entry of the prefix record, nil when the
#     reference's prefix has no record
#   * :mandate [String] result of get_mandate
#   * :topic [String] always "other"
def get_gbtype(doc, ref)
  scope = get_scope(doc)
  # get_prefix returns nil for a prefix missing from the table; indexing it
  # directly would raise NoMethodError.
  prefix_record = get_prefix(ref)
  { scope: scope, prefix: prefix_record && prefix_record["prefix"],
    mandate: get_mandate(ref), topic: "other" }
end
get_ics(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

* :field [String]
* :group [String]
* :subgroup [String]
# File lib/relaton_gb/scrapper.rb, line 134
# Reads the ICS (International Classification for Standards) code.
#
# Two page layouts are queried: a div followed by a sibling div, and a
# dt/dd pair. The code is split on "." into field, group and subgroup.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one entry, or an empty array when the page has no
#   ICS element or the element is empty
#   * :field [String]
#   * :group [String, nil] padded on the right with "0" to three characters
#   * :subgroup [String, nil]
def get_ics(doc)
  ics = doc.at("//div[contains(text(), '国际标准分类号')]/following-sibling::div"\
               " | //dt[contains(text(), '国际标准分类号')]/following-sibling::dd")
  return [] unless ics

  field, group, subgroup = ics.text.delete("\r\n\t\t").split "."
  # An empty element carries no classification at all.
  return [] unless field

  # A code with only a field part ("35") has no group; don't call ljust on nil.
  [{ field: field, group: group&.ljust(3, "0"), subgroup: subgroup }]
end
get_mandate(ref) click to toggle source

@param ref [String] @return [String]

# File lib/relaton_gb/scrapper.rb, line 169
# Determines how binding a standard is from its reference.
#
# The text between the first "/" and the next whitespace is inspected:
# "T" means recommended, "Z" means guidelines, anything else (including no
# "/" at all) means mandatory.
#
# @param ref [String] document reference, e.g. "GB/T 20223-2006"
# @return [String] "recommended", "guidelines" or "mandatory"
def get_mandate(ref)
  suffix = ref[%r{(?<=\/)[^\s]+}].to_s
  { "T" => "recommended", "Z" => "guidelines" }.fetch(suffix, "mandatory")
end
get_prefix(ref) click to toggle source

@param ref [String] @return [Hash{String=>String}]

# File lib/relaton_gb/scrapper.rb, line 155
# Looks up the prefix record for a document reference.
#
# The prefix code is the part of the reference's first whitespace-free token
# that precedes any "/" (nil when the reference yields no token).
#
# @param ref [String] document reference, e.g. "GB/T 20223-2006"
# @return [Hash{String=>String}, nil] whatever +prefix+ returns for the code
def get_prefix(ref)
  first_token = ref.match(/^[^\s]+/).to_s
  code, = first_token.split("/")
  prefix(code)
end
get_scope(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [String]

# File lib/relaton_gb/scrapper.rb, line 145
# Classifies the standard by the text of the page's issuing-body element.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, nil] "national" when the text contains 国家标准, "sector"
#   when a line of it starts with 行业标准, nil otherwise or when the element
#   is missing
def get_scope(doc)
  issuer = doc.at("//div[contains(., '发布单位')]/following-sibling::div")&.text
  return if issuer.nil?
  return "national" if issuer =~ /国家标准/

  "sector" if issuer =~ /^行业标准/
end
prefix(pref) click to toggle source

@param pref [String] @return [Hash{String=>String}]

# File lib/relaton_gb/scrapper.rb, line 162
# Returns the record stored for a prefix code in yaml/prefixes.yaml.
#
# The YAML file (located relative to this source file) is read on first use
# and kept in @prefixes for later calls.
#
# @param pref [String] prefix code
# @return [Hash{String=>String}, nil] nil when the table has no such key
def prefix(pref)
  @prefixes ||= begin
    table_path = File.join(__dir__, "yaml/prefixes.yaml")
    YAML.load_file(table_path)
  end
  @prefixes[pref]
end