module RelatonItu::Scrapper

Scrapper.

Constants

ROMAN_MONTHS
TYPES

Public Class Methods

parse_page(hit, imp = false) click to toggle source

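Parse page. @param hit [RelatonItu::Hit] @param imp [Boolean] when true, follow the link found in the "tab_ig_uc_rec" span and parse that page instead @return [ItuBibliographicItem, nil] nil when the response code is not "200" or, with imp, when no such link is found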

# File lib/relaton_itu/scrapper.rb, line 30
def parse_page(hit, imp = false) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  doc = get_page hit
  return unless doc.code == "200"

  if imp
    a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
    return unless a

    doc = get_page hit, a[:href].to_s
  end

  # Fetch edition.
  edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text

  ItuBibliographicItem.new(
    fetched: Date.today.to_s,
    type: "standard",
    docid: fetch_docid(doc, hit.hit[:title]),
    edition: edition,
    language: ["en"],
    script: ["Latn"],
    title: fetch_titles(doc),
    doctype: hit.hit[:type],
    docstatus: fetch_status(doc),
    ics: [], # fetch_ics(doc),
    date: fetch_dates(doc),
    contributor: fetch_contributors(hit.hit[:code]),
    editorialgroup: fetch_workgroup(hit.hit[:code], doc),
    abstract: fetch_abstract(doc, hit),
    copyright: fetch_copyright(hit.hit[:code], doc),
    link: fetch_link(doc),
    relation: fetch_relations(doc),
    place: ["Geneva"]
  )
end

Private Class Methods

createdocid(text) click to toggle source

@param text [String] @return [RelatonBib::DocumentIdentifier]

# File lib/relaton_itu/scrapper.rb, line 120
def createdocid(text) # rubocop:disable Metrics/MethodLength
  %r{
    ^(?<code>((ITU-\w|ISO\/IEC)\s)?[^\(:]+)
    (\(((?<_month>\d{2})\/)?(?<_year>\d{4})\))?
    (:[^\(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
    (\s(?<corr>(Amd|Cor)\.\s?\d+))?
    # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
  }x =~ text.squeeze(" ")
  corr&.sub! /\.\s?/, " "
  id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
  id += " - #{buldate}" if buldate
  type = id.match(%r{^\w+}).to_s
  type = "ITU" if type == "G"
  RelatonBib::DocumentIdentifier.new(type: type, id: id)
end
fetch_abstract(doc, hit) click to toggle source

Fetch abstracts. @param doc [Mechanize::Page] @param hit [RelatonItu::Hit] @return [Array<Hash>]

# File lib/relaton_itu/scrapper.rb, line 72
def fetch_abstract(doc, hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  abstract_url = doc.at '//table/tr[td/span[.="In force"]]/td/span[contains(@id, "lbl_dms")]/div'
  content = if abstract_url
              url = abstract_url[:onclick].match(/https?[^']+/).to_s
              rsp = hit.hit_collection.agent.get url
              d = Nokogiri::HTML rsp.body.encode(undef: :replace, replace: "")
              d.css("p.MsoNormal").text.gsub(/\r\n/, "").squeeze(" ").gsub(/\u00a0/, "")
            elsif a = doc.at('//table/tr/td/span[contains(@class, "observation")]/text()')
              a.text.strip
            end
  return [] unless content

  [{
    content: content,
    language: "en",
    script: "Latn",
  }]
end
fetch_contributors(code) click to toggle source

Fetch contributors. @param code [String] @return [Array<Hash>]

# File lib/relaton_itu/scrapper.rb, line 244
def fetch_contributors(code)
  return [] unless code

  abbrev = code.sub(/-\w\s.*/, "")
  case abbrev
  when "ITU"
    name = "International Telecommunication Union"
    url = "www.itu.int"
  end
  [{ entity: { name: name, url: url, abbreviation: abbrev },
     role: [type: "publisher"] }]
end
fetch_dates(doc) click to toggle source

Fetch dates @param doc [Mechanize::Page] @return [Array<Hash>]

# File lib/relaton_itu/scrapper.rb, line 206
def fetch_dates(doc) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  dates = []
  date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
                "//p[contains(.,'Approved in')]")
  pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
  if pdate && !pdate&.empty?
    dates << { type: "published", on: pdate }
  elsif pdate = ob_date(doc)
    dates << { type: "published", on: pdate }
  end
  dates
end
fetch_docid(doc, title) click to toggle source

Fetch docid. @param doc [Mechanize::Page] @param title [String] fallback text used when the page yields no identifiers @return [Array<RelatonBib::DocumentIdentifier>]

# File lib/relaton_itu/scrapper.rb, line 108
def fetch_docid(doc, title)
  docids = doc.xpath(
    "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
    "//td[.='Identical standard:']/following-sibling::td",
    "//div/table[1]/tr[4]/td/strong"
  ).map { |c| createdocid c.text }
  docids << createdocid(title) unless docids.any?
  docids
end
fetch_relations(doc) click to toggle source

Fetch relations. @param doc [Mechanize::Page] @return [Array<Hash>]

# File lib/relaton_itu/scrapper.rb, line 181
def fetch_relations(doc)
  doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
    .map do |r|
    ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
    fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en",
                                        script: "Latn")
    bibitem = ItuBibliographicItem.new(formattedref: fref,
                                       type: "standard")
    { type: "complements", bibitem: bibitem }
  end
end
fetch_status(doc) click to toggle source

Fetch status. @param doc [Mechanize::Page] @return [RelatonBib::DocumentStatus, NilClass]

# File lib/relaton_itu/scrapper.rb, line 139
def fetch_status(doc)
  s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
             "//p[contains(.,'Status :')]")
  return unless s

  status = s.text.include?("In force") ? "Published" : "Withdrawal"
  RelatonBib::DocumentStatus.new(stage: status)
end
fetch_titles(doc) click to toggle source

Fetch titles. @param doc [Mechanize::Page] @return [RelatonBib::TypedTitleStringCollection]

# File lib/relaton_itu/scrapper.rb, line 196
def fetch_titles(doc)
  t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
  return [] unless t

  RelatonBib::TypedTitleString.from_string t.text, "en", "Latn"
end
fetch_workgroup(code, doc) click to toggle source

Fetch workgroup. @param code [String] @param doc [Mechanize::Page] @return [RelatonItu::EditorialGroup, NilClass]

# File lib/relaton_itu/scrapper.rb, line 152
def fetch_workgroup(code, doc)
  wg = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a')
  # return unless wg

  group = wg && itugroup(wg.text)
  EditorialGroup.new(
    bureau: code.match(/(?<=-)./).to_s, group: group
  )
end
get_page(hit, url = nil) click to toggle source

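Get page. @param hit [RelatonItu::Hit] @param url [String, nil] URL to fetch instead of the hit's own URL @return [Mechanize::Page] the page returned by the hit collection's agent @raise [RelatonBib::RequestError] when the request fails with a network or protocol error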

# File lib/relaton_itu/scrapper.rb, line 95
def get_page(hit, url = nil)
  uri = url || hit.hit[:url]
  hit.hit_collection.agent.get uri
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
       EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, OpenSSL::SSL::SSLError
  raise RelatonBib::RequestError, "Could not access #{uri}"
end
itugroup(name) click to toggle source

@param name [String] @return [RelatonItu::ItuGroup]

# File lib/relaton_itu/scrapper.rb, line 164
def itugroup(name) # rubocop:disable Metrics/MethodLength
  if name.include? "Study Group"
    type = "study-group"
    acronym = "SG"
  elsif name.include? "Telecommunication Standardization Advisory Group"
    type = "tsag"
    acronym = "TSAG"
  else
    type = "work-group"
    acronym = "WG"
  end
  ItuGroup.new name: name, type: type, acronym: acronym
end
ob_date(doc) click to toggle source

Scrape Operational Bulletin date. @param doc [Mechanize::Page] @return [String, nil] nil when the page has no "Year:" cell

# File lib/relaton_itu/scrapper.rb, line 222
def ob_date(doc)
  pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
  return unless pdate

  roman_to_arabic pdate.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}}).to_s
end
roman_to_arabic(date) click to toggle source

Convert roman month number in string date to arabic number @param date [String] @return [String]

# File lib/relaton_itu/scrapper.rb, line 232
def roman_to_arabic(date)
  %r{(?<rmonth>[IVX]+)} =~ date
  if ROMAN_MONTHS.index(rmonth)
    month = ROMAN_MONTHS.index(rmonth) + 1
    Date.parse(date.sub(%r{[IVX]+}, month.to_s)).to_s
  else date
  end
end