module RelatonCalconnect::Scrapper
Constants
- DOMAIN
- HOST
Public Class Methods
parse_page(hit)
click to toggle source
@param hit [Hash] @return [RelatonCalconnect::CcBibliographicItem]
# File lib/relaton_calconnect/scrapper.rb, line 10
# Build a bibliographic item for a single hit.
#
# If the hit has a link whose "type" is "rxl", the item is obtained through
# fetch_bib_xml from that link's "content" and the hit's links are added to
# it with update_links. Otherwise the item is built straight from the hit
# hash (after doc_to_hash has adjusted it).
#
# @param hit [Hash]
# @return the bibliographic item; links whose URI had no host have been
#   given SCHEME and HOST
def parse_page(hit)
  links = array(hit["link"])
  rxl_link = links.find { |candidate| candidate["type"] == "rxl" }

  bib =
    if rxl_link
      fetched = fetch_bib_xml(rxl_link["content"])
      update_links(fetched, links)
      fetched
    else
      RelatonCalconnect::CcBibliographicItem.from_hash(doc_to_hash(hit))
    end

  # Links without a host get the site's scheme and host filled in.
  bib.link.each do |typed_uri|
    next if typed_uri.content.host

    typed_uri.content.merge!(scheme: SCHEME, host: HOST)
  end
  bib
end
Private Class Methods
array(content)
click to toggle source
Wrap into Array if not Array
@param [Array, Hash, String, nil] content
@return [Array<Hash, String>]
# File lib/relaton_calconnect/scrapper.rb, line 87
# Coerce a value into an Array without taking it apart.
#
# An Array is returned as-is (same object), nil becomes an empty Array, and
# anything else - including a Hash - is wrapped in a one-element Array.
#
# @param content [Array, Hash, String, nil]
# @return [Array]
def array(content)
  return content if content.is_a?(Array)
  return [] if content.nil?

  [content]
end
doc_to_hash(doc)
click to toggle source
Fix editorial group
@param [Hash] doc
@return [Hash]
# File lib/relaton_calconnect/scrapper.rb, line 64
# Flatten the "technical_committee" entry of each editorial group.
#
# For every element of doc["editorialgroup"] the "technical_committee" key
# is removed and, when it held a value, that value is merged into the
# element itself. The hash is modified in place.
#
# @param doc [Hash]
# @return [Hash] the same doc object
def doc_to_hash(doc)
  array(doc["editorialgroup"]).each do |group|
    committee = group.delete("technical_committee")
    next unless committee

    group.merge!(committee)
  end
  doc
end
fetch_bib_xml(url)
click to toggle source
@param url [String] @return the bibliographic item parsed by RelatonCalconnect::XMLParser.from_xml
# File lib/relaton_calconnect/scrapper.rb, line 28
# Fetch an RXL document and parse it into a bibliographic item.
#
# The document at +url+ is loaded with get_rxl. If it contains a
# <uri type="rxl"> element, the document that element points to is loaded
# instead, and the <uri> elements of the first document are inserted before
# its docidentifier. <technical-committee> tags are renamed to <committee>
# before the XML is handed to the parser.
#
# @param url [String] path passed to get_rxl
# @return the value returned by RelatonCalconnect::XMLParser.from_xml
def fetch_bib_xml(url)
  rxl = get_rxl url
  uri_rxl = rxl.at("uri[@type='rxl']")
  if uri_rxl
    # Keep the URIs of the first document and carry them over into the
    # document it refers to.
    uri_xml = rxl.xpath("//uri").to_xml
    rxl = get_rxl uri_rxl.text
    docid = rxl.at "//docidentifier"
    docid.add_previous_sibling uri_xml
  end
  # Use non-destructive gsub: gsub! returns nil when nothing is replaced,
  # which would pass nil to the parser for documents that have no
  # technical-committee element.
  xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
  RelatonCalconnect::XMLParser.from_xml xml
end
get_rxl(path)
click to toggle source
@param path [String] @return [Nokogiri::XML::Document]
# File lib/relaton_calconnect/scrapper.rb, line 52
# Download a document located under DOMAIN and parse its body as XML.
#
# @param path [String] appended to DOMAIN to form the request URL
# @return [Nokogiri::XML::Document]
def get_rxl(path)
  target = DOMAIN + path
  Nokogiri::XML(Faraday.get(target).body)
end
update_links(bib, links)
click to toggle source
# File lib/relaton_calconnect/scrapper.rb, line 72
# Add the given links to the item, skipping types it already has.
#
# Each link hash is turned into a RelatonBib::TypedUri (its keys converted
# to symbols and passed as keyword arguments) and appended to bib.link,
# unless bib.url already returns something for that link's "type".
#
# @param bib the bibliographic item to extend
# @param links [Array<Hash>] link hashes with string keys
# @return the same bib object
def update_links(bib, links)
  links.each do |entry|
    attrs = entry.transform_keys(&:to_sym)
    next if bib.url(entry["type"])

    bib.link << RelatonBib::TypedUri.new(**attrs)
  end
  bib
end