module Gbbib::TScrapper

Social standard scraper.

Public Class Methods

scrape_doc(pid) click to toggle source

@param pid [String] standard's page path @return [Gbbib::GbBibliographicItem]

# File lib/gbbib/t_scrapper.rb, line 42
# Downloads a standard's page from ttbz.org.cn and builds a bibliographic
# item from the data scraped out of it.
#
# @param pid [String] standard's page path, appended to the site root
# @return [Gbbib::GbBibliographicItem, nil] the item, or nil (after a
#   warning on stderr) when any StandardError is raised along the way
def scrape_doc(pid)
  src = "http://www.ttbz.org.cn#{pid}"
  # Block form: open-uri closes the StringIO/Tempfile as soon as parsing is
  # done instead of leaving the handle open until garbage collection.
  doc = OpenURI.open_uri(src) do |io|
    Nokogiri::HTML(io, nil, Encoding::UTF_8.to_s)
  end
  GbBibliographicItem.new scrapped_data(doc, src: src)
rescue StandardError
  # Best effort by design: callers receive nil rather than an exception.
  warn "Cannot connect to #{src}"
end
scrape_page(text) click to toggle source

rubocop:disable Metrics/MethodLength, Metrics/AbcSize @param text [String] @return [Gbbib::HitCollection]

# File lib/gbbib/t_scrapper.rb, line 20
# Runs a search on ttbz.org.cn and wraps every result row in a Hit.
#
# @param text [String] reference or keyword to search for
# @return [Gbbib::HitCollection, nil] the hits, or nil (after a warning on
#   stderr) when any StandardError is raised along the way
def scrape_page(text)
  # ASCII hyphens are swapped for an em dash (U+2014) before escaping;
  # presumably that is how the site writes references - TODO confirm.
  query = CGI.escape(text.tr('-', [8212].pack('U')))
  url = "http://www.ttbz.org.cn/Home/Standard?searchType=2&key=#{query}"
  # Block form so open-uri closes the response handle once it is parsed.
  header = OpenURI.open_uri(url) { |io| Nokogiri::HTML(io) }
  xpath = '//table[contains(@class, "standard_list_table")]/tr/td/a'
  t_xpath = '../preceding-sibling::td[3]'
  hits = header.xpath(xpath).map do |h|
    # The pattern is the UTF-8 bytes of U+2014 (E2 80 94) read as Latin-1,
    # i.e. a mis-decoded em dash; it is turned back into a plain hyphen.
    title = h.at(t_xpath).text.gsub(/â\u0080\u0094/, '-')
    Hit.new pid: h[:href].sub(%r{\/$}, ''), title: title, scrapper: self
  end
  HitCollection.new hits
rescue StandardError
  # The message used to interpolate a bare URL (`#{http://...}`), which is
  # not valid Ruby; it is a plain literal now.
  warn 'Cannot connect to http://www.ttbz.org.cn/Home/Standard'
end

Private Class Methods

gbtype() click to toggle source
# File lib/gbbib/t_scrapper.rb, line 97
# Fixed classification attached to every standard scraped by this module.
#
# @return [Hash] scope, prefix and mandate of the standard type
def gbtype
  {
    scope: 'social-group',
    prefix: 'T',
    mandate: 'mandatory'
  }
end
get_ccs(doc) click to toggle source

# Matches the single non-whitespace character that directly follows a
# slash in +ref+.
#
# @param ref [String]
# @return [MatchData, nil] the match, or nil when there is none
def get_group_code(ref)
  after_slash = %r{(?<=\/)[^\s]}
  ref.match(after_slash)
end

# File lib/gbbib/t_scrapper.rb, line 105
# Reads the cell next to the "中国标准分类号" label and keeps its first
# whitespace-delimited token.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<String>] single-element array; the element is an empty
#   string when the cell has no text
def get_ccs(doc)
  xpath = '//td[contains(.,"中国标准分类号")]/following-sibling::td[1]'
  # Line breaks are dropped first, then the surrounding blanks.
  cell_text = doc.xpath(xpath).text.gsub(/[\r\n]/, '').strip
  code = cell_text.match(/^[^\s]+/).to_s
  [code]
end
get_committee(doc) click to toggle source

rubocop:enable Metrics/MethodLength

# File lib/gbbib/t_scrapper.rb, line 77
# Builds the committee entry from the cell next to the "团体名称" label.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] +:name+ (text of the cell) and +:type+ (always 'technical')
def get_committee(doc)
  name_cell = doc.xpath('//td[.="团体名称"]/following-sibling::td[1]')
  { name: name_cell.text, type: 'technical' }
end
get_dates(doc) click to toggle source
# File lib/gbbib/t_scrapper.rb, line 117
# Extracts the publication date from the span next to the "发布日期" label.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one +{ type: 'published', on: 'YYYY-MM-DD' }+ entry,
#   or an empty array when the cell holds no recognisable date
def get_dates(doc)
  xpath = '//td[contains(.,"发布日期")]/following-sibling::td[1]/span'
  # Year, month and day may be separated by any run of non-digits.
  d = doc.xpath(xpath).text
         .match(/(?<y>\d{4})[^\d]+(?<m>\d{2})[^\d]+(?<d>\d{2})/)
  # A missing or malformed date used to raise NoMethodError on nil here.
  return [] unless d

  [{ type: 'published', on: "#{d[:y]}-#{d[:m]}-#{d[:d]}" }]
end
get_ics(doc) click to toggle source
# File lib/gbbib/t_scrapper.rb, line 110
# Extracts the ICS code from the span next to the "国际标准分类号" label
# and splits it on dots into field, group and subgroup.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one +{ field:, group:, subgroup: }+ entry, or an
#   empty array when the cell holds no code; +group+ and +subgroup+ are nil
#   when the code does not include them
def get_ics(doc)
  xpath = '//td[contains(.,"国际标准分类号")]/following-sibling::td[1]/span'
  ics = doc.xpath(xpath).text.match(/^[^\s]+/).to_s
  field, group, subgroup = ics.split '.'
  # An empty cell used to fall through and raise NoMethodError on nil.
  return [] unless field

  # Short groups are right-padded with zeros to three characters; a
  # field-only code has no group to pad.
  group = group.ljust(3, '0') if group
  [{ field: field, group: group, subgroup: subgroup }]
end
get_titles(doc) click to toggle source
# File lib/gbbib/t_scrapper.rb, line 84
# Collects the titles of a standard: the Chinese one is always present in
# the result, the English one only when its cell has text.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] title hashes with +:title_main+, +:title_intro+,
#   +:language+ and +:script+
def get_titles(doc)
  zh_text = doc.xpath('//td[contains(.,"中文标题")]/following-sibling::td[1]').text
  zh = { title_main: zh_text, title_intro: nil, language: 'zh', script: 'Hans' }

  en_text = doc.xpath('//td[contains(.,"英文标题")]/following-sibling::td[1]').text
  return [zh] if en_text.empty?

  en = { title_main: en_text, title_intro: nil, language: 'en', script: 'Latn' }
  [zh, en]
end
scrapped_data(doc, src:) click to toggle source

rubocop:disable Metrics/MethodLength @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/gbbib/t_scrapper.rb, line 57
# Assembles the attribute hash for one standard from its parsed page.
#
# @param doc [Nokogiri::HTML::Document]
# @param src [String] URL the page was fetched from, stored as the 'src' link
# @return [Hash]
def scrapped_data(doc, src:)
  id_cell     = '//td[contains(.,"标准编号")]/following-sibling::td[1]'
  status_span = '//td[contains(.,"标准状态")]/following-sibling::td[1]/span'

  # Entries are added in the same order as the keys appear in the result.
  data = {}
  data[:committee] = get_committee(doc)
  data[:docid]     = get_docid(doc, id_cell)
  data[:titles]    = get_titles(doc)
  data[:type]      = 'standard'
  data[:docstatus] = get_status(doc, status_span)
  data[:gbtype]    = gbtype
  data[:ccs]       = get_ccs(doc)
  data[:ics]       = get_ics(doc)
  data[:link]      = [{ type: 'src', content: src }]
  data[:dates]     = get_dates(doc)
  data[:language]  = ['zh']
  data[:script]    = ['Hans']
  data
end