module Isobib::Scrapper

Scrapper — rubocop:disable Metrics/ModuleLength

Constants

DOMAIN
TYPES

Public Class Methods

get(text) click to toggle source

@param text [String] @return [Array<Hash>]

# File lib/isobib/scrapper.rb, line 42
# Search for ISO documents matching +text+ and parse each hit.
# Spawns a pool of parser workers fed by the Algolia search workers;
# returns the parsed documents, or [] when iso.org is unreachable.
# @param text [String]
# @return [Array<Hash>]
def get(text)
  parsers = WorkersPool.new 4
  parsers.worker { |hit| iso_worker(hit, parsers) }
  searchers = start_algolia_search(text, parsers)
  parsed_docs = parsers.result
  searchers.end
  searchers.result
  parsed_docs
rescue
  warn "Could not connect to http://www.iso.org"
  []
end
parse_page(hit_data) click to toggle source

Parse page. @param hit [Hash] @return [Hash] rubocop:disable Metrics/AbcSize, Metrics/MethodLength

# File lib/isobib/scrapper.rb, line 61
# Parse a standard's detail page into a bibliographic item.
# @param hit_data [Hash] an Algolia search hit; reads 'path', 'title'
#   and 'status'
# @return [IsoBibItem::IsoBibliographicItem, nil] nil when the hit's
#   path does not end in a numeric document id
def parse_page(hit_data)
  # Guard: only paths ending in digits identify a standard's page.
  return unless hit_data['path'].match(/\d+$/)
  # The MatchData interpolates as just the matched digits.
  doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

  # Fetch edition: digits from the last child of the node whose
  # <strong> label contains 'Edition'; nil-safe at every step.
  edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")
  &.children&.last&.text&.match(/\d+/)&.to_s

  titles, abstract = fetch_titles_abstract(doc)

  IsoBibItem::IsoBibliographicItem.new(
    docid:        fetch_docid(doc),
    edition:      edition,
    language:     langs(doc).map { |l| l[:lang] },
    script:       langs(doc).map { |l| script(l[:lang]) }.uniq,
    titles:       titles,
    type:         fetch_type(hit_data['title']),
    docstatus:    fetch_status(doc, hit_data['status']),
    ics:          fetch_ics(doc),
    dates:        fetch_dates(doc),
    contributors: fetch_contributors(hit_data['title']),
    workgroup:    fetch_workgroup(doc),
    abstract:     abstract,
    copyright:    fetch_copyright(hit_data['title'], doc),
    link:       fetch_link(doc, url),
    relations:    fetch_relations(doc)
  )
end

Private Class Methods

algolia_worker(index, text, page, algolia_workers, iso_workers) click to toggle source

Fetch hits from the Algolia search service. @param index @param text [String] @param page [Integer] @param algolia_workers [Isobib::WorkersPool] @param iso_workers [Isobib::WorkersPool]

# File lib/isobib/scrapper.rb, line 122
# Run one page of an Algolia search, feeding hits to the ISO parser
# pool and queueing the next results page while more remain.
# @param index Algolia index to search
# @param text [String] search query
# @param page [Integer] zero-based results page
# @param algolia_workers [Isobib::WorkersPool]
# @param iso_workers [Isobib::WorkersPool]
def algolia_worker(index, text, page, algolia_workers, iso_workers)
  response = index.search text, facetFilters: ['category:standard'], page: page
  upcoming = response['page'] + 1
  more_pages = upcoming < response['nbPages']
  algolia_workers << upcoming if more_pages
  response['hits'].each do |hit|
    iso_workers.nb_hits = response['nbHits']
    iso_workers << hit
  end
  # Last page reached: signal the parser pool that no more hits come.
  iso_workers.end unless more_pages
end
fetch_contributors(title) click to toggle source

rubocop:disable Metrics/MethodLength

# File lib/isobib/scrapper.rb, line 331
# Derive publisher entities from the title's leading abbreviations
# (e.g. "ISO/IEC 2382" -> ISO and IEC publishers).
# @param title [String]
# @return [Array<Hash>] one publisher-role entity per abbreviation;
#   name/url are nil for abbreviations other than ISO and IEC
def fetch_contributors(title)
  known = {
    'ISO' => ['International Organization for Standardization', 'www.iso.org'],
    'IEC' => ['International Electrotechnical Commission', 'www.iec.ch']
  }
  title.sub(/\s.*/, '').split('/').map do |abbrev|
    name, url = known[abbrev]
    { entity: { name: name, url: url, abbreviation: abbrev },
      roles: ['publisher'] }
  end
end
fetch_dates(doc) click to toggle source

Fetch dates @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/isobib/scrapper.rb, line 321
# Fetch publication dates.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] a single published-date entry, or [] when the
#   page carries no releaseDate
def fetch_dates(doc)
  published = doc.xpath("//span[@itemprop='releaseDate']").text
  return [] if published.empty?
  [{ type: 'published', on: published }]
end
fetch_docid(doc) click to toggle source

Fetch docid. @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/isobib/scrapper.rb, line 200
# Fetch document identifier.
#
# BUG FIX: the original `doc.xpath(...) or return ...` never took the
# fallback branch — Nokogiri's xpath returns a NodeSet, which is truthy
# even when empty — so a page without an item reference crashed with
# NoMethodError when the regex match returned nil. Use at_xpath and
# guard both the missing element and a non-matching reference text.
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
def fetch_docid(doc)
  fallback = { project_number: "?", part_number: "", prefix: nil, id: "?" }
  item_ref = doc.at_xpath("//strong[@id='itemReference']")
  return fallback if item_ref.nil? || item_ref.text.empty?
  # Captures: [1] everything up to the last project digits,
  # [2] the part number after a hyphen (or "" when absent).
  m = item_ref.text.match(/^(.*?\d+)-?((?<=-)\d+|)/)
  return fallback if m.nil?
  { project_number: m[1], part_number: m[2], prefix: nil,
    id: item_ref.text, type: "ISO" }
end
fetch_ics(doc) click to toggle source

Fetch ICS. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/isobib/scrapper.rb, line 350
# Fetch ICS classification codes.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one {field, group, subgroup} hash per ICS link;
#   missing trailing segments come out as nil
def fetch_ics(doc)
  selector = "//strong[contains(text(), 'ICS')]/../following-sibling::dd/div/a"
  doc.xpath(selector).map do |node|
    field, group, subgroup = node.text.match(/[\d\.]+/).to_s.split('.')
    { field: field, group: group, subgroup: subgroup }
  end
end
fetch_relations(doc) click to toggle source

Fetch relations. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>] rubocop:disable Metrics/MethodLength

# File lib/isobib/scrapper.rb, line 239
# Fetch document relations from the lifecycle steps list.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] {type, identifier, url} per related document;
#   'Now'/'Now under review' rows are skipped entirely
def fetch_relations(doc)
  skipped = ['Now', 'Now under review']
  doc.css('ul.steps li').reduce([]) do |acc, row|
    label = row.css('strong').text
    type =
      case label
      when 'Previously', 'Will be replaced by' then 'obsoletes'
      when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed' then 'updates'
      else label
      end
    next acc if skipped.include?(type)
    acc + row.css('a').map do |link|
      { type: type, identifier: link.text, url: link['href'] }
    end
  end
end
fetch_status(doc, status) click to toggle source

Fetch status. @param doc [Nokogiri::HTML::Document] @param status [String] @return [Hash]

# File lib/isobib/scrapper.rb, line 212
# Fetch document status with stage/substage codes.
# @param doc [Nokogiri::HTML::Document]
# @param status [String] status label from the search hit
# @return [Hash]
def fetch_status(doc, status)
  stage_code = doc.css('li.dropdown.active span.stage-code > strong').text
  stage, substage = stage_code.split '.'
  { status: status, stage: stage, substage: substage }
end
fetch_title(doc, lang) click to toggle source

Fetch titles. @param doc [Nokogiri::HTML::Document] @param lang [String] @return [Hash]

# File lib/isobib/scrapper.rb, line 281
# Fetch the title for one language, split into intro/main/part.
# The page title uses " -- " as the segment separator; two segments are
# either intro+main or main+part, disambiguated by a "Part N:" prefix.
# @param doc [Nokogiri::HTML::Document]
# @param lang [String]
# @return [Hash]
def fetch_title(doc, lang)
  node = doc.at("//h3[@itemprop='description'] | //h2[@itemprop='description']")
  segments = node.text.split ' -- '
  intro = main = part = nil
  case segments.size
  when 0
    main = ""
  when 1
    main = segments[0]
  when 2
    if /^(Part|Partie) \d+:/ =~ segments[1]
      main, part = segments
    else
      intro, main = segments
    end
  when 3
    intro, main, part = segments
  else
    # More than three segments: fold the tail back into the part title.
    intro = segments[0]
    main = segments[1]
    part = segments[2..-1]&.join(" -- ")
  end
  {
    title_intro: intro,
    title_main:  main,
    title_part:  part,
    language:    lang,
    script:      script(lang)
  }
end
fetch_titles_abstract(doc) click to toggle source

Fetch titles and abstracts. @param doc [Nokogiri::HTML::Document] @return [Array<Array>] rubocop:disable Metrics/AbcSize, Metrics/MethodLength

# File lib/isobib/scrapper.rb, line 137
# Fetch titles and abstracts in every available language.
# @param doc [Nokogiri::HTML::Document] the English page, already loaded
# @return [Array<Array>] [titles, abstracts]
def fetch_titles_abstract(doc)
  titles    = []
  abstracts = []
  langs(doc).each do |lang|
    # English reuses the page we already have; other langs are fetched.
    page = lang[:path] ? get_page(lang[:path])[0] : doc

    # A help-block means the document is unavailable in this language.
    next if page.css('h5.help-block').any?
    titles << fetch_title(page, lang[:lang])

    content = page.css("div[itemprop='description'] p").text
    next if content.empty?
    abstracts << {
      content:  content,
      language: lang[:lang],
      script:   script(lang[:lang])
    }
  end
  [titles, abstracts]
end
fetch_type(title) click to toggle source

Fetch type. @param title [String] @return [String]

# File lib/isobib/scrapper.rb, line 262
# Determine the document type from the title's prefix tokens.
#
# BUG FIX: titles that do not match the pattern made `type_match[2]`
# raise NoMethodError on nil (the guard existed but was commented out).
# Return nil for unrecognised titles instead so callers can fall back.
# NOTE(review): `type_match[2]` is the (/IEC|/IEEE|/PRF|/NP) capture;
# the TS/TR/... token is capture 3 — verify against TYPES' keys, this
# looks like it may index the wrong group.
# @param title [String]
# @return [String, nil]
def fetch_type(title)
  type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
                                                /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
  return nil if type_match.nil?
  if TYPES[type_match[2]]
    TYPES[type_match[2]]
  elsif type_match[1] == 'ISO'
    'international-standard'
  elsif type_match[1] == 'IWA'
    'international-workshop-agreement'
  end
end
fetch_workgroup(doc) click to toggle source

Fetch workgroup. @param doc [Nokogiri::HTML::Document] @return [Hash]

# File lib/isobib/scrapper.rb, line 221
# Fetch the workgroup (technical committee) details.
# @param doc [Nokogiri::HTML::Document]
# @return [Hash]
def fetch_workgroup(doc)
  wg_link = doc.css('div.entry-name.entry-block a')[0]
  # wg_url = DOMAIN + wg_link['href']
  # e.g. "ISO/TC 184/SC 4" -> the TC number lives in the second segment.
  segments = wg_link.text.split '/'
  tc_number = segments[1]&.match(/\d+/)&.to_s&.to_i
  {
    name:                'International Organization for Standardization',
    abbreviation:        'ISO',
    url:                 'www.iso.org',
    technical_committee: {
      name:   wg_link.text + doc.css('div.entry-title')[0].text,
      type:   'TC',
      number: tc_number
    }
  }
end
get_page(path) click to toggle source

rubocop:disable Metrics/AbcSize, Metrics/MethodLength Get page. @param path [String] page's path @return [Array<Nokogiri::HTML::Document, String>]

# File lib/isobib/scrapper.rb, line 178
# Get and parse a page from the ISO site.
# @param path [String] page's path, appended to DOMAIN
# @return [Array<Nokogiri::HTML::Document, String>] parsed document and
#   the final URL it was fetched from
def get_page(path)
  url = DOMAIN + path
  uri = URI url
  resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
  # Follow a single permanent redirect.
  # NOTE(review): assumes the Location header is a path relative to
  # DOMAIN — an absolute Location would yield a malformed URL; confirm.
  if resp.code == '301'
    path = resp['location']
    url = DOMAIN + path
    uri = URI url
    resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
  end
  # Re-fetch up to 10 times while the body has no <strong> tag —
  # presumably a placeholder/incomplete response from the site; verify.
  n = 0
  while resp.body !~ /<strong/ && n < 10
    resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
    n += 1
  end
  [Nokogiri::HTML(resp.body), url]
end
iso_worker(hit, iso_workers) click to toggle source

Fetch ISO documents. @param hit [Hash] @param iso_workers [Isobib::WorkersPool]

# File lib/isobib/scrapper.rb, line 111
# Worker body: parse one search hit, printing progress on the same
# terminal line (the trailing \r rewinds the cursor).
# @param hit [Hash] an Algolia search hit
# @param iso_workers [Isobib::WorkersPool] pool, read for progress counts
def iso_worker(hit, iso_workers)
  print "Parse #{iso_workers.size} of #{iso_workers.nb_hits}  \r"
  parse_page hit
end
langs(doc) click to toggle source

Get langs. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/isobib/scrapper.rb, line 164
# Collect the languages the document page is offered in. English is
# always present (it is the page we start from); others are discovered
# from the language-switcher links — currently only French is matched.
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>]
def langs(doc)
  found = [{ lang: 'en' }]
  doc.css('ul#lang-switcher ul li a').each do |lang_link|
    href = lang_link.attr('href')
    m = href.match(%r{^\/(fr)\/})
    found << { lang: m[1], path: href } if m
  end
  found
end
script(lang) click to toggle source

Return ISO script code. @param lang [String] @return [String]

# File lib/isobib/scrapper.rb, line 312
# Return the ISO 15924 script code for a language.
# @param lang [String]
# @return [String, nil] 'Latn' for en/fr, nil for anything else
def script(lang)
  return 'Latn' if %w[en fr].include?(lang)
end