module Iecbib::Scrapper

Scrapper. rubocop:disable Metrics/ModuleLength

Constants

DOMAIN
TYPES

Public Class Methods

parse_page(hit_data) click to toggle source

Parse page. @param hit_data [Hash] @return [IsoBibItem::IsoBibliographicItem] rubocop:disable Metrics/AbcSize, Metrics/MethodLength

# File lib/iecbib/scrapper.rb, line 52
# Scrape one publication page and assemble a bibliographic item from it.
#
# @param hit_data [Hash] search hit; +:url+ and +:code+ are read here, and the
#   whole hash is handed to fetch_titles
# @return [IsoBibItem::IsoBibliographicItem]
def parse_page(hit_data)
  page_url = hit_data[:url]
  page = get_page page_url

  # The edition sits in the cell next to the "Edition" table header.
  edition_cell = page.at("//th[contains(., 'Edition')]/following-sibling::td/span")
  edition = edition_cell.text

  # Status and relations come from a separate request, made after the page
  # itself has been read.
  docstatus, relations = fetch_status_relations page_url

  attributes = {
    docid:        fetch_docid(page),
    edition:      edition,
    language:     ['en'],
    script:       ['Latn'],
    titles:       fetch_titles(hit_data),
    type:         fetch_type(page),
    docstatus:    docstatus,
    ics:          fetch_ics(page),
    dates:        fetch_dates(page),
    contributors: fetch_contributors(hit_data[:code]),
    workgroup:    fetch_workgroup(page),
    abstract:     fetch_abstract(page),
    copyright:    fetch_copyright(hit_data[:code], page),
    link:         fetch_link(page, page_url),
    relations:    relations
  }
  IsoBibItem::IsoBibliographicItem.new(**attributes)
end

Private Class Methods

fetch_abstract(doc) click to toggle source

Fetch abstracts. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/iecbib/scrapper.rb, line 125
# Read the abstract from the element marked itemprop="description".
#
# @param doc [Nokogiri::HTML::Document] parsed publication page
# @return [Array<Hash>] a single entry with +:content+, +:language+ and
#   +:script+ keys; language and script are always 'en' / 'Latn'
def fetch_abstract(doc)
  description = doc.at('//div[@itemprop="description"]')
  [{ content: description.text, language: 'en', script: 'Latn' }]
end
fetch_contributors(code) click to toggle source
# File lib/iecbib/scrapper.rb, line 341
# Derive the publishing organizations from a document code.
#
# Everything from the first whitespace onwards is dropped and the remaining
# prefix is split on "/"; each piece becomes one publisher entry.
#
# @param code [String] document code, e.g. "ISO/IEC 27001:2013"
# @return [Array<Hash>] one hash per abbreviation with +:entity+ and +:roles+;
#   abbreviations other than ISO and IEC get a nil name and url
def fetch_contributors(code)
  known_publishers = {
    'ISO' => ['International Organization for Standardization', 'www.iso.org'],
    'IEC' => ['International Electrotechnical Commission', 'www.iec.ch']
  }

  prefix = code.sub(/\s.*/, '')
  prefix.split('/').map do |abbrev|
    name, url = known_publishers[abbrev]
    {
      entity: { name: name, url: url, abbreviation: abbrev },
      roles: ['publisher']
    }
  end
end
fetch_dates(doc) click to toggle source

Fetch dates @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/iecbib/scrapper.rb, line 332
# Fetch the publication date from the element marked itemprop="releaseDate".
#
# @param doc [Nokogiri::HTML::Document] parsed publication page
# @return [Array<Hash>] a single +{ type: 'published', on: <text> }+ entry, or
#   an empty array when the element is missing or its text is empty
def fetch_dates(doc)
  # `at` yields nil when nothing matches; treat that like an empty date
  # instead of raising NoMethodError on nil.
  publish_date = doc.at("//span[@itemprop='releaseDate']")&.text.to_s
  return [] if publish_date.empty?

  [{ type: 'published', on: publish_date }]
end
fetch_docid(doc) click to toggle source

Fetch docid. @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/iecbib/scrapper.rb, line 172
# Build the document identifier from the element marked itemprop="productID".
#
# The project, part and subpart numbers are taken from the first run of
# digits that follows whitespace, e.g. "IEC 61000-4-2" gives "61000", "4"
# and "2". Missing part/subpart numbers come back as empty strings.
#
# @param doc [Nokogiri::HTML::Document] parsed publication page
# @return [Hash] identifier parts. When the element is absent a reduced
#   placeholder hash is returned; when the element is present but its text
#   holds no recognizable number, the number fields fall back to the same
#   placeholders ('?' / '') and +:id+ still carries the raw text.
def fetch_docid(doc)
  item_ref = doc.at("//span[@itemprop='productID']")
  unless item_ref
    return { project_number: '?', part_number: '', prefix: nil, id: '?' }
  end

  id = item_ref.text
  # `match` returns nil when the text has no whitespace-preceded number;
  # guard against it rather than failing on nil[:project].
  m = id.match(/(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/)
  {
    project_number: m ? m[:project] : '?',
    part_number: m ? m[:part] : '',
    subpart_number: m ? m[:subpart] : '',
    prefix: nil,
    type: 'IEC',
    id: id
  }
end
fetch_ics(doc) click to toggle source

Fetch ICS. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/iecbib/scrapper.rb, line 359
# Collect ICS classification codes from the links next to the "ICS" header.
#
# The first run of digits and dots in each link's text is split on "." into
# field, group and subgroup; pieces that are not present come back as nil.
#
# @param doc [Nokogiri::HTML::Document] parsed publication page
# @return [Array<Hash>] hashes with +:field+, +:group+ and +:subgroup+ keys
def fetch_ics(doc)
  links = doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a')
  links.map do |link|
    field, group, subgroup = link.text.match(/[\d\.]+/).to_s.split('.')
    { field: field, group: group, subgroup: subgroup }
  end
end
fetch_relations(doc) click to toggle source

Fetch relations. @param doc [Nokogiri::XML::Document] @return [Array<Hash>] rubocop:disable Metrics/MethodLength

# File lib/iecbib/scrapper.rb, line 226
# Build relation entries from the ROW elements of the status document.
#
# Only rows whose STATUS is neither "PREPARING" nor "PUBLISHED" are used.
# The lower-cased status is mapped to a relation type: "revised" and
# "replaced" become "updates", "withdrawn" becomes "obsoletes", and any other
# value is passed through unchanged.
#
# @param doc [Nokogiri::XML::Document] status document (fetch_status_relations
#   builds it with Nokogiri::XML)
# @return [Array<Hash>] hashes with +:type+, +:identifier+ and +:url+ keys
def fetch_relations(doc)
  relation_types = {
    'revised'   => 'updates',
    'replaced'  => 'updates',
    'withdrawn' => 'obsoletes'
  }

  rows = doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
  rows.map do |row|
    row_status = row.at('STATUS').text.downcase
    relation_type = relation_types.fetch(row_status, row_status)
    publication_url = "#{DOMAIN}/publication/#{row.at('PUB_ID').text}"
    {
      type: relation_type,
      identifier: row.at('FULL_NAME').text,
      url: publication_url
    }
  end
end
fetch_status(doc) click to toggle source

Fetch status. @param doc [Nokogiri::XML::Document] @return [Hash]

# File lib/iecbib/scrapper.rb, line 192
# Determine the document status from the status document.
#
# If there is no ROW whose STATUS is "PREPARING", the document is reported
# as published (stage 60, substage 60). Otherwise the row's STAGE text is
# looked up in statuses.yml, whose entry supplies the status label and a
# "stage.substage" string.
#
# @param doc [Nokogiri::XML::Document] status document (fetch_status_relations
#   builds it with Nokogiri::XML)
# @return [Hash] hash with +:status+, +:stage+ and +:substage+ keys
def fetch_status(doc)
  wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
  return { status: 'Published', stage: '60', substage: '60' } unless wip

  # Locate statuses.yml next to this source file (lib/iecbib/) rather than
  # through a path relative to the working directory, which only worked when
  # the process was started from the project root.
  statuses = YAML.load_file File.join(__dir__, 'statuses.yml')
  entry = statuses[wip.at('STAGE').text]
  stage, substage = entry['stage'].split '.'
  { status: entry['status'], stage: stage, substage: substage }
end
fetch_status_relations(url) click to toggle source
# File lib/iecbib/scrapper.rb, line 240
# Request the status document for a publication and derive both the
# document status and its relations from it.
#
# The publication id is the run of digits at the end of +url+ (an empty
# string when there is none); it is appended to the AjaxRequestXML endpoint.
#
# @param url [String] publication page URL
# @return [Array] two elements: the result of fetch_status followed by the
#   result of fetch_relations
def fetch_status_relations(url)
  publication_id = url[/\d+$/].to_s
  endpoint = "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?" \
             'Openagent&url=http://www.iec.ch/dyn/www/f?' \
             "p=103:390:::::P390_PUBLICATION_ID:#{publication_id}"
  response = Net::HTTP.get_response URI(endpoint)
  xml = Nokogiri::XML response.body
  [fetch_status(xml), fetch_relations(xml)]
  # Disabled HTML-based implementation, retained for reference:
  # doc.css('ul.steps li').inject([]) do |a, r|
  #   r_type = r.css('strong').text
  #   type = case r_type
  #          when 'Previously', 'Will be replaced by' then 'obsoletes'
  #          when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed'
  #            'updates'
  #          else r_type
  #          end
  #   if ['Now', 'Now under review'].include? type
  #     a
  #   else
  #     a + r.css('a').map do |id|
  #       { type: type, identifier: id.text, url: id['href'] }
  #     end
  #   end
  # end
end
fetch_titles(hit_data) click to toggle source

Fetch titles. @param hit_data [Hash] @return [Array<Hash>]

# File lib/iecbib/scrapper.rb, line 293
# Split a hit's title on " - " into intro, main and part titles.
#
# * no pieces: main is an empty string
# * one piece: it is the main title
# * two pieces: if the second starts with "Part N:" / "Partie N:" they are
#   main + part, otherwise intro + main
# * three or more: intro, main, and the remaining pieces joined with " -- "
#
# @param hit_data [Hash] search hit; only +:title+ is read
# @return [Array<Hash>] a single entry with +:title_intro+, +:title_main+,
#   +:title_part+, +:language+ ('en') and +:script+ ('Latn')
def fetch_titles(hit_data)
  pieces = hit_data[:title].split(' - ')

  intro, main, part =
    if pieces.empty?
      [nil, "", nil]
    elsif pieces.size == 1
      [nil, pieces[0], nil]
    elsif pieces.size == 2
      if pieces[1] =~ /^(Part|Partie) \d+:/
        [nil, pieces[0], pieces[1]]
      else
        [pieces[0], pieces[1], nil]
      end
    else
      [pieces[0], pieces[1], pieces.drop(2).join(" -- ")]
    end

  [{ title_intro: intro, title_main: main, title_part: part,
     language: 'en', script: 'Latn' }]
end
fetch_type(doc) click to toggle source

Fetch type. @param doc [Nokogiri::HTML::Document] @return [String]

# File lib/iecbib/scrapper.rb, line 272
# Read the publication type from the cell next to the "Publication type"
# header, lower-cased and with spaces replaced by hyphens.
#
# @param doc [Nokogiri::HTML::Document] parsed publication page
# @return [String] e.g. "international-standard" for "International Standard"
def fetch_type(doc)
  # Disabled title-based detection, retained for reference:
  # type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
  #   /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
  # #return "international-standard" if type_match.nil?
  # if TYPES[type_match[2]]
  #   TYPES[type_match[2]]
  # elsif type_match[1]
  # elsif type_match[1] == 'ISO'
  #   'international-standard'
  # elsif type_match[1] == 'IWA'
  #   'international-workshop-agreement'
  # end
  # # rescue => _e
  # #   puts 'Unknown document type: ' + title
  type_cell = doc.at('//th[contains(., "Publication type")]/following-sibling::td/span')
  label = type_cell.text
  label.downcase.tr(' ', '-')
end
fetch_workgroup(doc) click to toggle source

Fetch workgroup. @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/iecbib/scrapper.rb, line 210
# Describe the technical committee linked next to the "TC" header.
#
# @param doc [Nokogiri::HTML::Document] parsed publication page
# @return [Hash] organization details plus a +:technical_committee+ hash whose
#   +:name+ is the link text and whose +:number+ is the first run of digits in
#   that text as an Integer (nil when the text has no digits)
def fetch_workgroup(doc)
  committee = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
  digits = committee.match(/\d+/)
  committee_number = digits ? digits[0].to_i : nil

  technical_committee = {
    name:   committee,
    type:   'technicalCommittee',
    number: committee_number
  }
  {
    name:                'International Electrotechnical Commission',
    abbreviation:        'IEC',
    url:                 'webstore.iec.ch',
    technical_committee: technical_committee
  }
end
get_page(url) click to toggle source

rubocop:disable Metrics/AbcSize, Metrics/MethodLength Get page. @param url [String] page's URL @return [Nokogiri::HTML::Document]

# File lib/iecbib/scrapper.rb, line 151
# Download a page and parse it as HTML.
#
# A response with code '301' is followed once: the Location header value is
# appended to DOMAIN and that address is requested instead.
#
# @param url [String] address of the page
# @return [Nokogiri::HTML::Document] document parsed from the response body
def get_page(url)
  response = Net::HTTP.get_response(URI(url))
  if response.code == '301'
    redirect_url = DOMAIN + response['location']
    response = Net::HTTP.get_response(URI(redirect_url))
  end
  # Disabled retry loop, retained for reference:
  # n = 0
  # while resp.body !~ /<strong/ && n < 10
  #   resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
  #   n += 1
  # end
  Nokogiri::HTML(response.body)
end