module RelatonCalconnect::Scrapper

Constants

DOMAIN
HOST

Public Class Methods

parse_page(hit) click to toggle source

@param hit [Hash] @return [RelatonCalconnect::CcBibliographicItem]

# File lib/relaton_calconnect/scrapper.rb, line 10
# Build a bibliographic item from a single hit.
#
# When the hit carries a link whose "type" is "rxl", the item is fetched via
# fetch_bib_xml and its links are refreshed with update_links; otherwise the
# item is built directly from the hit hash.
#
# @param hit [Hash] hit data; its "link" entry may be a single value, an Array or absent
# @return [Object] the bibliographic item, with scheme/host filled in on links lacking a host
def parse_page(hit)
  links = array(hit["link"])
  rxl_link = links.find { |candidate| candidate["type"] == "rxl" }

  bib =
    if rxl_link
      item = fetch_bib_xml(rxl_link["content"])
      update_links(item, links)
      item
    else
      RelatonCalconnect::CcBibliographicItem.from_hash(doc_to_hash(hit))
    end

  # Links without a host get the default scheme and host.
  bib.link.each do |entry|
    next if entry.content.host

    entry.content.merge!(scheme: SCHEME, host: HOST)
  end
  bib
end

Private Class Methods

array(content) click to toggle source

Wrap into Array if not Array

@param [Array, Hash, String, nil] content

@return [Array<Hash, String>]

# File lib/relaton_calconnect/scrapper.rb, line 87
# Coerce a value into an Array.
#
# An Array is returned as-is (same object), nil becomes an empty Array, and
# anything else is wrapped in a one-element Array.
#
# @param content [Array, Hash, String, nil]
# @return [Array<Hash, String>]
def array(content)
  return content if content.is_a?(Array)

  content.nil? ? [] : [content]
end
doc_to_hash(doc) click to toggle source

Fix editorial group

@param [Hash] doc

@return [Hash]

# File lib/relaton_calconnect/scrapper.rb, line 64
# Flatten the "technical_committee" entry of every editorial group.
#
# For each group under doc["editorialgroup"], the "technical_committee" key is
# removed and, when it held a value, that value is merged into the group
# itself. The document is modified in place.
#
# @param doc [Hash]
# @return [Hash] the same doc object
def doc_to_hash(doc)
  array(doc["editorialgroup"]).each do |group|
    committee = group.delete("technical_committee")
    next unless committee

    group.merge!(committee)
  end
  doc
end
fetch_bib_xml(url) click to toggle source

@param url [String] @return [RelatonCalconnect::CcBibliographicItem] item parsed from the fetched XML

# File lib/relaton_calconnect/scrapper.rb, line 28
# Fetch an RXL document and parse it into a bibliographic item.
#
# If the fetched document contains a <uri type="rxl"> element, the document it
# points to is fetched instead, and every <uri> element of the first document
# is inserted before the second document's <docidentifier>.
#
# @param url [String] path of the RXL document (passed to get_rxl)
# @return [Object] result of RelatonCalconnect::XMLParser.from_xml
def fetch_bib_xml(url)
  rxl = get_rxl url
  uri_rxl = rxl.at("uri[@type='rxl']")
  if uri_rxl
    uri_xml = rxl.xpath("//uri").to_xml
    rxl = get_rxl uri_rxl.text
    docid = rxl.at "//docidentifier"
    docid.add_previous_sibling uri_xml
  end
  # Non-destructive gsub: gsub! returns nil when there is nothing to replace,
  # which would hand nil to the parser for documents without the tag.
  xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
  RelatonCalconnect::XMLParser.from_xml xml
end
get_rxl(path) click to toggle source

@param path [String] @return [Nokogiri::XML::Document]

# File lib/relaton_calconnect/scrapper.rb, line 52
# Download a document relative to DOMAIN and parse the response body as XML.
#
# @param path [String] path appended to DOMAIN
# @return [Nokogiri::XML::Document]
def get_rxl(path)
  response = Faraday.get(DOMAIN + path)
  Nokogiri::XML(response.body)
end