module Gbbib::Scrapper

Common scraping methods.

Public Instance Methods

get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') click to toggle source
# File lib/gbbib/scrapper.rb, line 48
# Builds the publisher contributor entry for the scraped standard.
#
# The agency name is the first whitespace-delimited token of the reference
# found at +xpt+. For references that do not start with "GB", a trailing
# "/T" or "/Z" is dropped before the agency lookup.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the element holding the standard reference
# @return [Array<Hash>] a single hash with
#   * :entity [IsoBibItem::Organization] named in English and Chinese
#   * :roles [Array<String>] always ['publisher']
def get_contributors(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  name = doc.xpath(xpt).text.match(/^[^\s]+/).to_s
  # Use the non-mutating sub: when the reference is blank the match is nil,
  # and nil.to_s is a frozen "" on Ruby >= 2.7, so sub! would raise FrozenError.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc)
  localized_names = %w[en zh].map do |lang|
    agencies = GbAgencies::Agencies.new(lang, {}, "")
    { language: lang,
      content: agencies.standard_agency1(gbtype[:scope], name, gbtype[:mandate]) }
  end
  entity = IsoBibItem::Organization.new name: localized_names
  [{ entity: entity, roles: ['publisher'] }]
end
get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]') click to toggle source

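@param doc [Nokogiri::HTML::Document]
@param xpt [String] XPath of the element holding the standard reference
@return [Hash]

* :project_number [String]
* :part_number [String]
* :prefix [nil]
* :id [String]
* :type [String] (not present in the "?" placeholder result)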
# File lib/gbbib/scrapper.rb, line 39
# Extracts the document identifier parts from the standard reference.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] XPath of the element holding the standard reference
# @return [Hash]
#   * :project_number [String] reference up to and including the number
#   * :part_number [String] digits after the dot, or "" when there is no part
#   * :prefix [nil]
#   * :id [String] full text of the reference element
#   * :type [String] "Chinese Standard"
#   When the reference is missing or cannot be parsed, a placeholder hash is
#   returned instead: "?" for :project_number, :part_number and :id, nil
#   :prefix, and no :type key.
def get_docid(doc, xpt = '//dt[text()="标准号"]/following-sibling::dd[1]')
  item_ref = doc.xpath(xpt)
  m = item_ref && item_ref.text.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/)
  # The query result is truthy even when nothing was found, so the fallback
  # has to be driven by the failed match rather than by the lookup itself.
  return { project_number: "?", part_number: "?", prefix: nil, id: "?" } unless m

  { project_number: m[1], part_number: m[2], prefix: nil,
    id: item_ref.text, type: "Chinese Standard" }
end
get_status(doc, xpt = '.s-status.label:nth-child(3)') click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Hash]

* :status [String]
* :stage [String]
* :substage [String]
# File lib/gbbib/scrapper.rb, line 87
# Maps the status label of the page to a status name.
#
# @param doc [Nokogiri::HTML::Document]
# @param xpt [String] selector of the status label, passed to +doc.at+
# @return [Hash]
#   * :status [String, nil] 'published', 'activated' or 'obsoleted';
#     nil when the label is missing or not one of the known values
#   * :stage [String] always ''
#   * :substage [String] always ''
def get_status(doc, xpt = '.s-status.label:nth-child(3)')
  # doc.at yields nil when nothing matches; treat that like an unknown label
  # instead of failing on nil.text.
  label = doc.at(xpt)&.text.to_s.gsub(/\s/, '')
  status = case label
           when '即将实施' then 'published'
           when '现行' then 'activated'
           when '废止' then 'obsoleted'
           end
  { status: status, stage: '', substage: '' }
end
get_titles(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

* :title_intro [String]
* :title_main [String]
* :language [String]
* :script [String]
# File lib/gbbib/scrapper.rb, line 68
# Collects the titles shown in the page header.
#
# The h4 heading is always returned as the Chinese title; the h5 heading is
# added as the English title only when its text is not empty.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] each with :title_main, :title_intro (nil),
#   :language and :script
def get_titles(doc)
  zh_text = doc.css('div.page-header h4').text
  en_text = doc.css('div.page-header h5').text

  collected = [{ title_main: zh_text, title_intro: nil, language: 'zh', script: 'Hans' }]
  return collected if en_text.empty?

  collected.push({ title_main: en_text, title_intro: nil, language: 'en', script: 'Latn' })
end
get_type(_doc) click to toggle source
# File lib/gbbib/scrapper.rb, line 78
# Type of the bibliographic item; the document is not inspected.
#
# @param _doc [Nokogiri::HTML::Document] unused
# @return [String] always 'standard'
def get_type(_doc)
  'standard'
end
scrapped_data(doc, src:) click to toggle source

@param doc [Nokogiri::HTML::Document]
@param src [String] URL of the scraped page
@return [Hash]

# File lib/gbbib/scrapper.rb, line 16
# Assembles every scraped field of a standard's page into one hash.
#
# Each value comes from the matching get_* helper applied to +doc+; :link,
# :language and :script are fixed apart from the source URL.
#
# @param doc [Nokogiri::HTML::Document]
# @param src [String] URL of the scraped page
# @return [Hash]
def scrapped_data(doc, src:)
  record = {}
  record[:committee]    = get_committee(doc)
  record[:docid]        = get_docid(doc)
  record[:titles]       = get_titles(doc)
  record[:contributors] = get_contributors(doc)
  record[:type]         = get_type(doc)
  record[:docstatus]    = get_status(doc)
  record[:gbtype]       = get_gbtype(doc)
  record[:ccs]          = get_ccs(doc)
  record[:ics]          = get_ics(doc)
  record[:link]         = [{ type: 'src', content: src }]
  record[:dates]        = get_dates(doc)
  record[:language]     = ['zh']
  record[:script]       = ['Hans']
  record
end

Private Instance Methods

get_ccs(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<String>]

# File lib/gbbib/scrapper.rb, line 117
# Reads the Chinese Classification for Standards code from the page.
#
# @param doc [Nokogiri::HTML::Document, nil]
# @return [Array<String, nil>] one-element array holding the text of the
#   matched element, or nil when +doc+ or the query result is nil
def get_ccs(doc)
  return [nil] if doc.nil?

  matched = doc.xpath('//dt[text()="中国标准分类号"]/following-sibling::dd[1]')
  code = matched.nil? ? nil : matched.text
  [code]
end
get_dates(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

* :type [String] type of date
* :on [String] date
# File lib/gbbib/scrapper.rb, line 173
# Reads the publication date shown on the page.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] a single hash with
#   * :type [String] always 'published'
#   * :on [String] text of the date element, unparsed
def get_dates(doc)
  date_nodes = doc.xpath('//dt[.="发布日期"]/following-sibling::dd[1]')
  published = { type: 'published', on: date_nodes.text }
  [published]
end
get_gbtype(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Hash]

* :scope [String]
* :prefix [String]
* :mandate [String]
# File lib/gbbib/scrapper.rb, line 103
# Describes the kind of standard: its scope, prefix and mandate.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
#   * :scope [String, nil] result of get_scope
#   * :prefix [String, nil] 'prefix' value of the entry returned by
#     get_prefix; nil when get_prefix returns no entry
#   * :mandate [String] result of get_mandate
def get_gbtype(doc)
  ref = get_ref(doc)
  # get_prefix may return nil (no entry for this reference); fall back to an
  # empty hash so the lookup yields nil instead of raising on nil['prefix'].
  { scope: get_scope(doc), prefix: (get_prefix(ref) || {})['prefix'],
    mandate: get_mandate(ref) }
end
get_ics(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

* :field [String]
* :group [String]
* :subgroup [String]
# File lib/gbbib/scrapper.rb, line 126
# Reads the International Classification for Standards code from the page.
#
# The code text is split on "." into field, group and subgroup; the group is
# right-padded with "0" to three characters.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] empty when no ICS element is found, otherwise a single
#   hash with
#   * :field [String, nil]
#   * :group [String, nil] nil when the code has no group part
#   * :subgroup [String, nil]
def get_ics(doc)
  ics = doc.xpath('//dt[(.="国际标准分类号")]/following-sibling::dd[1]/span')
  return [] if ics.empty?

  field, group, subgroup = ics.text.split('.')
  # A field-only or blank code leaves group nil; pad only when it is present
  # rather than failing on nil.ljust.
  [{ field: field, group: group&.ljust(3, '0'), subgroup: subgroup }]
end
get_mandate(ref) click to toggle source

@param ref [String] @return [String]

# File lib/gbbib/scrapper.rb, line 161
# Derives the mandate from the part of the reference that follows the first
# slash (up to the next whitespace).
#
# @param ref [String] standard reference, e.g. "GB/T 20223-2006"
# @return [String] 'recommended' for "T", 'guidelines' for "Z",
#   'mandatory' for anything else, including references without a slash
def get_mandate(ref)
  suffix = ref[%r{(?<=\/)[^\s]+}].to_s
  { 'T' => 'recommended', 'Z' => 'guidelines' }.fetch(suffix, 'mandatory')
end
get_prefix(ref) click to toggle source

@param ref [String] @return [String]

# File lib/gbbib/scrapper.rb, line 146
# Looks up the prefix entry for a standard reference.
#
# The lookup key is the part of the reference's first whitespace-delimited
# token that precedes any "/"; it is nil when the reference has no such token.
#
# @param ref [String] standard reference, e.g. "GB/T 20223-2006"
# @return whatever +prefix+ returns for that key
def get_prefix(ref)
  leading_token = ref[/^[^\s]+/].to_s
  code, = leading_token.split('/')
  prefix(code)
end
get_ref(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [String]

# File lib/gbbib/scrapper.rb, line 111
# Reads the standard reference text from the page.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String] text of the element matched by the reference XPath
def get_ref(doc)
  reference_nodes = doc.xpath('//dt[text()="标准号"]/following-sibling::dd[1]')
  reference_nodes.text
end
get_scope(doc) click to toggle source

@param doc [Nokogiri::HTML::Document] @return [String]

# File lib/gbbib/scrapper.rb, line 135
# Maps the scope label of the page to a scope name.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, nil] 'national' when the label is exactly "国家标准",
#   'sector' when it starts with "行业标准", nil when the label is missing or
#   anything else
def get_scope(doc)
  # doc.at yields nil when nothing matches; treat that like an unrecognised
  # label instead of failing on nil.text.
  scope = doc.at('.s-status.label-info')&.text.to_s
  if scope == '国家标准'
    'national'
  elsif scope =~ /^行业标准/
    'sector'
  end
end
prefix(pref) click to toggle source

@param pref [String] @return [Hash{String=>String}]

# File lib/gbbib/scrapper.rb, line 153
# Looks up one entry of the prefix table.
#
# The table is read from yaml/prefixes.yaml next to this file on first use
# and kept in @prefixes afterwards.
#
# @param pref [String] lookup key
# @return the table entry stored under +pref+, or nil when there is none
def prefix(pref)
  @prefixes ||= YAML.load_file(File.join(__dir__, 'yaml/prefixes.yaml'))
  @prefixes[pref]
end