class NistBib::Scrapper

Constants

DOMAIN

Public Class Methods

parse_page(hit_data) click to toggle source

Parse page. @param hit_data [Hash] @return [NistBibliographicItem]

# File lib/nistbib/scrapper.rb, line 13
# Build a NistBibliographicItem for one search hit.
#
# The page at hit_data[:url] is downloaded once; the item is assembled from
# values scraped out of that page plus entries of hit_data itself.
#
# @param hit_data [Hash] read here for :url, :release_date, :code and
#   :status; also handed to fetch_titles
# @return [NistBibliographicItem]
def parse_page(hit_data)
  page = get_page hit_data[:url]

  attributes = {
    fetched: Date.today.to_s,
    type: "standard",
    titles: fetch_titles(hit_data),
    link: fetch_link(page),
    docid: fetch_docid(page),
    dates: fetch_dates(page, hit_data[:release_date]),
    contributors: fetch_contributors(page),
    edition: fetch_edition(hit_data[:code]),
    language: ["en"],
    script: ["Latn"],
    abstract: fetch_abstract(page),
    docstatus: fetch_status(page, hit_data[:status]),
    copyright: fetch_copyright(page),
    relations: fetch_relations(page),
    series: fetch_series(page),
    keyword: fetch_keywords(page),
    commentperiod: fetch_commentperiod(page),
  }

  NistBibliographicItem.new(**attributes)
end

Private Class Methods

contributors(doc, role) click to toggle source

rubocop:disable Metrics/CyclomaticComplexity

# File lib/nistbib/scrapper.rb, line 163
# Turn a comma-separated contributor list into contribution records.
#
# The node text is split on ", ". Each piece is read as "Name (ABBREV)",
# where the parenthesised abbreviation is optional. A piece becomes a
# RelatonBib::Person when it has an abbreviation, its name is two or three
# words long, and the name does not contain "task", "force" or "group";
# any other piece becomes a RelatonBib::Organization.
#
# @param doc [#text, nil] node holding the list; nil yields an empty array
# @param role [String] role stored on every contribution
# @return [Array<RelatonBib::ContributionInfo>]
def contributors(doc, role)
  return [] if doc.nil?

  doc.text.split(", ").map do |chunk|
    # Named captures `an` (name) and `abbrev` become local variables.
    /(?<an>.+?)(\s+\((?<abbrev>.+?)\))?$/ =~ chunk

    entity =
      if abbrev && an.downcase !~ /(task|force|group)/ && an.split.size.between?(2, 3)
        fullname = RelatonBib::FullName.new(
          completename: RelatonBib::LocalizedString.new(an, "en", "Latn"),
        )
        # Only NIST and MITRE get a spelled-out name and/or a URL; every
        # other abbreviation doubles as the organisation name.
        org_name, url =
          case abbrev
          when "NIST" then ["National Institute of Standards and Technology", "www.nist.gov"]
          when "MITRE" then [abbrev, "www.mitre.org"]
          else [abbrev, nil]
          end
        employer = RelatonBib::Organization.new name: org_name, url: url, abbreviation: abbrev
        membership = RelatonBib::Affilation.new employer
        RelatonBib::Person.new(
          name: fullname, affiliation: [membership], contacts: [],
        )
      else
        RelatonBib::Organization.new name: an, abbreviation: abbrev
      end

    RelatonBib::ContributionInfo.new entity: entity, role: [role]
  end
end
doc_relation(type, ref) click to toggle source
# File lib/nistbib/scrapper.rb, line 256
# Wrap a link node in a document relation of the given type.
#
# The link text becomes the plain-text formatted reference of the related
# item, and DOMAIN followed by the node's :href becomes its "src" link.
#
# @param type [String] relation type stored on the result
# @param ref [#text, #[]] link node; must answer ref[:href]
# @return [RelatonBib::DocumentRelation]
def doc_relation(type, ref)
  citation = RelatonBib::FormattedRef.new(
    content: ref.text, language: "en", script: "Latn", format: "text/plain",
  )
  source = RelatonBib::TypedUri.new(type: "src", content: DOMAIN + ref[:href])
  related = RelatonBib::BibliographicItem.new(formattedref: citation, link: [source])

  RelatonBib::DocumentRelation.new(type: type, bibitem: related)
end
fetch_abstract(doc) click to toggle source

Fetch abstracts. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]

# File lib/nistbib/scrapper.rb, line 205
# Collect the abstract text of the page.
#
# Takes the text of the <p> elements under the first <div> inside the
# "pub-abstract-callout" container and wraps it in a single-element array.
#
# @param doc [#xpath] parsed page
# @return [Array<Hash>] one hash with :content, :language, :script, :format
def fetch_abstract(doc)
  paragraphs = doc.xpath('//div[contains(@class, "pub-abstract-callout")]/div[1]/p')
  abstract = { content: paragraphs.text, language: "en", script: "Latn", format: "text/plain" }
  [abstract]
end
fetch_commentperiod(doc) click to toggle source
# File lib/nistbib/scrapper.rb, line 297
# Read the public comment period from the page.
#
# * `to` is the "pub-comments-due" date ("Month D, YYYY").
# * `from` is the "pub-release-date" value, which may be written either as
#   "Month D, YYYY" or as "Month YYYY"; it is passed on as an ISO string.
# * `extended` is the date found in the "comment closing date has been
#   extended to" notice, or nil when there is no such notice.
#
# @param doc [#at] parsed page
# @return [CommentPeriod, nil] nil when the page has no comments-due span
# @raise [ArgumentError] when a date string cannot be parsed by Date.strptime
def fetch_commentperiod(doc)
  due = doc.at "//span[@id='pub-comments-due']"
  return unless due

  to = Date.strptime due.text.strip, "%B %d, %Y"

  released = doc.at("//span[@id='pub-release-date']").text.strip
  # The release date can carry a day of the month; the "%B %Y" format does
  # not describe such a value, so pick the format from what is present.
  full_date = released[/\w+\s\d{1,2},\s\d{4}/]
  from = if full_date
           Date.strptime(full_date, "%B %d, %Y")
         else
           Date.strptime(released, "%B %Y")
         end.to_s

  notice = doc.at "//strong[contains(.,'The comment closing date has been extended to')]"
  # Accept one- or two-digit days so that "April 3, 2019" is not dropped.
  ext = notice&.text&.match(/\w+\s\d{1,2},\s\d{4}/).to_s
  extended = ext.empty? ? nil : Date.strptime(ext, "%B %d, %Y")
  CommentPeriod.new from, to, extended
end
fetch_contributors(doc) click to toggle source
# File lib/nistbib/scrapper.rb, line 146
# List the contributors of the publication.
#
# The result always starts with NIST as "publisher". It is followed by the
# entries built by `contributors` from the paragraph after the "Author(s)"
# heading (role "author") and then from the paragraph after the "Editor(s)"
# heading (role "editor").
#
# @param doc [#at] parsed page
# @return [Array<RelatonBib::ContributionInfo>]
def fetch_contributors(doc)
  nist = RelatonBib::Organization.new(
    name: "National Institute of Standards and Technology",
    url: "www.nist.gov",
    abbreviation: "NIST",
  )
  publisher = RelatonBib::ContributionInfo.new(entity: nist, role: ["publisher"])

  sections = [
    ['//h4[.="Author(s)"]/following-sibling::p', "author"],
    ['//h4[.="Editor(s)"]/following-sibling::p', "editor"],
  ]
  people = sections.flat_map do |query, role|
    contributors(doc.at(query), role)
  end

  [publisher] + people
end
fetch_dates(doc, release_date) click to toggle source

Fetch dates. @param doc [Nokogiri::HTML::Document] @param release_date [#to_s] @return [Array<Hash>]

# File lib/nistbib/scrapper.rb, line 132
# Collect the dates of the publication.
#
# The first entry is always "published", taken from release_date. An
# "issued" entry is added when the "pub-release-date" span holds a date
# written as "Month YYYY" or "Month D, YYYY". When the span is missing or
# its text matches neither form, no "issued" entry is produced (rather than
# raising or emitting an entry with an empty date).
#
# @param doc [#at] parsed page
# @param release_date [#to_s] value stored (as a string) on the
#   "published" entry
# @return [Array<Hash>] hashes with :type and :on (ISO date string)
# @raise [ArgumentError] when a matched date cannot be parsed by Date.strptime
def fetch_dates(doc, release_date)
  dates = [{ type: "published", on: release_date.to_s }]

  # `&.` yields nil for a missing span; `to_s` then turns that into "".
  released = doc.at("//span[@id='pub-release-date']")&.text.to_s.strip
  issued = if (month_year = released[/\w+\s\d{4}/])
             Date.strptime(month_year, "%B %Y")
           elsif (full_date = released[/\w+\s\d{1,2},\s\d{4}/])
             Date.strptime(full_date, "%B %d, %Y")
           end
  dates << { type: "issued", on: issued.to_s } if issued

  dates
end
fetch_docid(doc) click to toggle source

Fetch docid. @param doc [Nokogiri::HTML::Document] @return [Array<RelatonBib::DocumentIdentifier>]

# File lib/nistbib/scrapper.rb, line 53
# Read the document identifier from the page heading.
#
# The identifier is the stripped text of the <h3> inside the
# "publications-detail" container. When that heading is missing or blank
# the placeholder id "?" is used, so the fallback is reachable instead of
# the lookup raising on a nil node.
#
# @param doc [#at] parsed page
# @return [Array<RelatonBib::DocumentIdentifier>] a single "nist" identifier
def fetch_docid(doc)
  heading = doc.at("//div[contains(@class, 'publications-detail')]/h3")
  # `&.` yields nil for a missing heading; `to_s` then turns that into "".
  item_ref = heading&.text.to_s.strip
  item_ref = "?" if item_ref.empty?

  [RelatonBib::DocumentIdentifier.new(id: item_ref, type: "nist")]
end
fetch_edition(code) click to toggle source

rubocop:enable Metrics/CyclomaticComplexity

# File lib/nistbib/scrapper.rb, line 196
# Derive the edition label from a publication code.
#
# Looks for digits that directly follow "Rev." and one whitespace character.
#
# @param code [String, nil] publication code
# @return [String, nil] "Revision <n>", or nil when the code has no such part
def fetch_edition(code)
  found = /(?<=Rev\.\s)(?<rev>\d+)/.match(code)
  found && "Revision #{found[:rev]}"
end
fetch_keywords(doc) click to toggle source
# File lib/nistbib/scrapper.rb, line 292
# Collect the keywords listed on the page.
#
# Every <span> directly inside the "pub-keywords-container" span yields one
# Keyword built from the span's text.
#
# @param doc [#xpath] parsed page
# @return [Array<Keyword>]
def fetch_keywords(doc)
  doc.xpath("//span[@id='pub-keywords-container']/span").map do |node|
    Keyword.new(node.text)
  end
end
fetch_relations(doc) click to toggle source

Fetch relations. @param doc [Nokogiri::HTML::Document] @return [Array<RelatonBib::DocumentRelation>]

# File lib/nistbib/scrapper.rb, line 242
# Collect relations to other documents linked from the page.
#
# Links are read from three containers, in this order, and each link is
# converted with doc_relation:
#   "pub-supersedes-container" -> "supersedes"
#   "pub-part-container"       -> "partOf"
#   "pub-related-container"    -> "updates"
#
# @param doc [#xpath] parsed page
# @return [Array] the doc_relation results, in the order listed above
def fetch_relations(doc)
  sources = [
    ['//span[@id="pub-supersedes-container"]/a', "supersedes"],
    ['//span[@id="pub-part-container"]/a', "partOf"],
    ['//span[@id="pub-related-container"]/a', "updates"],
  ]

  sources.flat_map do |query, type|
    doc.xpath(query).map { |anchor| doc_relation type, anchor }
  end
end
fetch_series(doc) click to toggle source
# File lib/nistbib/scrapper.rb, line 268
# Build series entries from the publication-history links.
#
# Walks the <a> and <span> children of "pub-history-container" in document
# order. <span> nodes are skipped but still count towards the position.
# For each link, the text before the first "(" is taken; when it starts
# with the word "Draft" the reference is the remainder followed by
# " (<iter>PD)", where <iter> is "I" for position 0 and position + 1
# otherwise. For any other link the reference content is nil.
#
# @param doc [#xpath] parsed page
# @return [Array<RelatonBib::Series>] one entry per <a> node
def fetch_series(doc)
  history = doc.xpath "//span[@id='pub-history-container']/a"\
    "|//span[@id='pub-history-container']/span"

  history.each_with_index.each_with_object([]) do |(node, position), collected|
    next if node.name == "span"

    iter = position.zero? ? "I" : position + 1

    content = node.text.match(/^[^\(]+/).to_s.strip.gsub "  ", " "

    ref = if content.match(/\w+/).to_s == "Draft"
            content.match(/(?<=Draft\s).+/).to_s + " (#{iter}PD)"
          end

    fref = RelatonBib::FormattedRef.new(
      content: ref, language: "en", script: "Latn", format: "text/plain",
    )
    collected << RelatonBib::Series.new(formattedref: fref)
  end
end
fetch_status(doc, status) click to toggle source

Fetch status. @param doc [Nokogiri::HTML::Document] @param status [String] @return [NistBib::DocumentStatus]

# File lib/nistbib/scrapper.rb, line 73
# Translate a search-result status into a document status.
#
# Four status strings map to fixed stage/substage pairs; any other value is
# used as the stage itself with substage "active". The iteration is only
# set for stages containing "draft": it starts at 1 and, when the first
# non-link node of "pub-history-container" sits at a position greater than
# zero, becomes that position + 1.
#
# @param doc [#xpath] parsed page; only consulted for draft stages
# @param status [String] status reported for the hit
# @return [NistBib::DocumentStatus]
def fetch_status(doc, status)
  known = {
    "draft (withdrawn)" => %w[draft-public withdrawn],
    "retired draft" => %w[draft-public retired],
    "withdrawn" => %w[final withdrawn],
    "draft" => %w[draft-public active],
  }
  stage, subst = known.fetch(status) { [status, "active"] }

  iter = nil
  if stage.include? "draft"
    history = doc.xpath("//span[@id='pub-history-container']/a"\
      "|//span[@id='pub-history-container']/span")
    # Position of the first node that is not a link, if there is one.
    marker = history.find_index { |entry| entry.name != "a" }
    iter = (marker && marker.positive?) ? marker + 1 : 1
  end

  NistBib::DocumentStatus.new stage: stage, substage: subst, iteration: iter
end
fetch_titles(hit_data) click to toggle source

Fetch titles. @param hit_data [Hash] @return [Array<Hash>]

# File lib/nistbib/scrapper.rb, line 125
# Wrap the hit's title as a single English, plain-text title entry.
#
# @param hit_data [Hash] read for :title
# @return [Array<Hash>] one hash with :content, :language, :script, :format
def fetch_titles(hit_data)
  title = {
    content: hit_data[:title],
    language: "en",
    script: "Latn",
    format: "text/plain",
  }
  [title]
end
get_page(url) click to toggle source

Get page. @param url [String] page's URL @return [Nokogiri::HTML::Document]

# File lib/nistbib/scrapper.rb, line 44
# Download a page and parse it as HTML.
#
# Performs a single GET request for the URL and feeds the response body to
# Nokogiri::HTML.
#
# @param url [String] address of the page
# @return [Nokogiri::HTML::Document]
def get_page(url)
  response = Net::HTTP.get_response(URI(url))
  Nokogiri::HTML(response.body)
end