module RelatonOmg::Scrapper

Constants

URL_PATTERN

Public Class Methods

scrape_page(ref) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 8
def scrape_page(ref)
  %r{OMG (?<acronym>[^\s]+)\s?(?<version>.*)} =~ ref
  return unless acronym

  url = URL_PATTERN + acronym
  url += "/" + version if version
  doc = Nokogiri::HTML OpenURI.open_uri(URI(url))
  OmgBibliographicItem.new **item(doc, acronym)
rescue OpenURI::HTTPError, URI::InvalidURIError => e
  if e.is_a?(URI::InvalidURIError) || e.io.status[0] == "404"
    warn %{[relaton-omg] no document found for "#{ref}" reference.}
    return
  end

  raise RelatonBib::RequestError, "Unable acces #{url} (#{e.io.status.join(" ")}"
end

Private Class Methods

fetch_abstract(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 62
# Extract the abstract from the first paragraph of the "document-metadata"
# section.
#
# @param doc [#at] parsed specification page
# @return [Array<Hash>] one hash with :content, :language and :script, or an
#   empty array when the page has no such paragraph
def fetch_abstract(doc)
  paragraph = doc.at('//section[@id="document-metadata"]/div/div/p')
  # `at` yields nil when nothing matches; a missing abstract should not
  # abort the whole scrape with a NoMethodError.
  return [] unless paragraph

  [{ content: paragraph.text, language: "en", script: "Latn" }]
end
fetch_date(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 75
# Build the date list for the document.
#
# @param doc [Object] parsed specification page, passed on to pub_date
# @return [Array<Hash>] a single "published" entry whose :on value is the
#   string form of pub_date(doc)
def fetch_date(doc)
  published_on = pub_date(doc).to_s
  [{ type: "published", on: published_on }]
end
fetch_docid(doc, acronym) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 54
# Build the document identifier list.
#
# The identifier is the acronym, followed by a space and the version when
# version(doc) returns a truthy value.
#
# @param doc [Object] parsed specification page, passed on to version
# @param acronym [String] specification acronym
# @return [Array<RelatonBib::DocumentIdentifier>] one identifier of type "OMG"
def fetch_docid(doc, acronym)
  ver = version(doc)
  id = ver ? "#{acronym} #{ver}" : acronym.to_s
  [RelatonBib::DocumentIdentifier.new(id: id, type: "OMG")]
end
fetch_id(doc, acronym) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 44
# Build the item id by appending the page's version directly to the acronym
# (no separator).
#
# @param doc [Object] parsed specification page, passed on to version
# @param acronym [String] specification acronym
# @return [String] acronym immediately followed by the version
def fetch_id(doc, acronym)
  ver = version(doc)
  acronym + ver
end
fetch_keyword(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 113
# Collect the keywords listed under the "Categories:" entry of the page.
#
# @param doc [#xpath] parsed specification page
# @return [Array<String>] text of every matching <em> node
def fetch_keyword(doc)
  category_nodes = doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  category_nodes.map(&:text)
end
fetch_license(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 117
# Collect the license names shown next to the "IPR Mode" entry of the page.
#
# For each matching node only the first run of word characters, whitespace
# and hyphens is kept, with surrounding whitespace removed.
#
# @param doc [#xpath] parsed specification page
# @return [Array<String>] one (possibly empty) string per matching node
def fetch_license(doc)
  license_nodes = doc.xpath(
    '//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span'
  )
  license_nodes.map do |node|
    name = node.text[/[\w\s-]+/]
    name.to_s.strip
  end
end
fetch_relation(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 98
# Build "obsoletes" relations from the rows of the page's "History" table.
#
# Every row whose first cell differs from the current version becomes a
# relation to an item referenced as "OMG <acronym> <version>". The acronym
# is taken from the fifth "/"-separated segment of the link in the third
# cell.
#
# @param doc [#xpath] parsed specification page
# @return [Array<Hash>] hashes with :type ("obsoletes") and :bibitem
def fetch_relation(doc)
  current = version(doc)
  rows = doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  rows.each_with_object([]) do |row, relations|
    row_version = row.at("td").text
    next if row_version == current

    acronym = row.at('td[3]/a')[:href].split("/")[4]
    fref = RelatonBib::FormattedRef.new(content: "OMG #{acronym} #{row_version}")
    relations << { type: "obsoletes", bibitem: OmgBibliographicItem.new(formattedref: fref) }
  end
end
fetch_status(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 83
# Read the document status from the "Document Status:" entry of the page.
#
# The stage is the first run of word characters in the stripped entry text
# (an empty string when there is none).
#
# @param doc [#at] parsed specification page
# @return [RelatonBib::DocumentStatus]
def fetch_status(doc)
  status_text = doc.at('//dt[.="Document Status:"]/following-sibling::dd').text
  stage = status_text.strip[/\w+/].to_s
  RelatonBib::DocumentStatus.new(stage: stage)
end
fetch_title(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 48
# Read the document title from the "Title:" entry of the page.
#
# @param doc [#at] parsed specification page
# @return [Array<RelatonBib::TypedTitleString>] a single title of type
#   "main", with language "en" and script "Latn"
def fetch_title(doc)
  title_text = doc.at('//dt[.="Title:"]/following-sibling::dd').text
  formatted = RelatonBib::FormattedString.new(
    content: title_text, language: "en", script: "Latn"
  )
  [RelatonBib::TypedTitleString.new(type: "main", title: formatted)]
end
fetch_version(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 67
# Build the version object from the page's publication date and version.
#
# @param doc [Object] parsed specification page, passed on to pub_date and
#   version
# @return [RelatonBib::BibliographicItem::Version] constructed from
#   pub_date(doc) and a one-element array holding version(doc)
def fetch_version(doc)
  published = pub_date(doc)
  versions = [version(doc)]
  RelatonBib::BibliographicItem::Version.new(published, versions)
end
item(doc, acronym) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 27
# Assemble the attribute hash for a bibliographic item.
#
# Entries are filled in a fixed order: id, fetched, docid, title, abstract,
# version, date, docstatus, link, relation, keyword, license.
#
# @param doc [Object] parsed specification page
# @param acronym [String] specification acronym
# @return [Hash{Symbol => Object}] attributes keyed as listed above;
#   :fetched is today's date as a string
def item(doc, acronym)
  attrs = {}
  attrs[:id] = fetch_id(doc, acronym)
  attrs[:fetched] = Date.today.to_s
  attrs[:docid] = fetch_docid(doc, acronym)
  attrs[:title] = fetch_title(doc)
  attrs[:abstract] = fetch_abstract(doc)
  attrs[:version] = fetch_version(doc)
  attrs[:date] = fetch_date(doc)
  attrs[:docstatus] = fetch_status(doc)
  attrs[:link] = fetch_link(doc)
  attrs[:relation] = fetch_relation(doc)
  attrs[:keyword] = fetch_keyword(doc)
  attrs[:license] = fetch_license(doc)
  attrs
end
pub_date(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 79
# Parse the publication date from the "Publication Date:" entry of the page.
#
# @param doc [#at] parsed specification page
# @return [Date] result of Date.parse on the stripped entry text
def pub_date(doc)
  raw = doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text
  Date.parse(raw.strip)
end
version(doc) click to toggle source
# File lib/relaton_omg/scrapper.rb, line 71
# Read the version from the "Version:" entry of the page.
#
# @param doc [#at] parsed specification page
# @return [String] text of the matching <span> node
def version(doc)
  version_node = doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')
  version_node.text
end