class Terrier::HtmlData

Constants

AUTHOR_META_TAGS
DOI_META_TAGS
ISSN_TAGS
LICENSING_TAGS
PUBLICATION_DATE_META_TAGS
PUBLICATION_META_TAGS
TITLE_META_TAGS

Attributes

url[R]

Public Class Methods

new(url) click to toggle source
# File lib/terrier/html_data.rb, line 13
# Validates the given URL, fetches it and parses the response as HTML.
#
# url - the address to fetch; must pass the uri? check (http/https scheme).
#
# Raises Terrier::UrlError when uri?(url) is false. The check runs before
# anything is fetched or stored.
def initialize(url)
  unless uri?(url)
    raise Terrier::UrlError, "bad url given"
  end

  @url = url
  # Keep the unparsed response: zenodo_pdf searches it directly.
  @raw = self.class.get(url)
  @parsed_html = Nokogiri::HTML(@raw)
end

Public Instance Methods

data() click to toggle source
# File lib/terrier/html_data.rb, line 20
# Returns the metadata hash extracted from the parsed page, building it on
# the first call and returning the memoized hash afterwards.
#
# Keys: :url, :journal, :title, :authors, :publication_date, :doi,
# :issn (always nil here), :zenodo_pdf and :bibliography.
#
# The memoized hash now includes :bibliography. Previously only the first
# call returned it, because the merged hash was never stored.
def data
  return @_data if @_data

  # @_data must be assigned BEFORE the bibliography is built:
  # bibliography_reference calls #data again and relies on the memoized
  # value to avoid recursing forever.
  @_data = {
    url: url,
    journal: collect_meta_data(@parsed_html, PUBLICATION_META_TAGS).first,
    title: collect_meta_data(@parsed_html, TITLE_META_TAGS).first,
    authors: collect_meta_data(@parsed_html, AUTHOR_META_TAGS).uniq,
    publication_date: collect_meta_data(@parsed_html, PUBLICATION_DATE_META_TAGS).first,
    doi: collect_meta_data(@parsed_html, DOI_META_TAGS).first,
    issn: nil,
    zenodo_pdf: zenodo_pdf
  }

  @_data = @_data.merge(bibliography: bibliography(@_data))
end

Private Instance Methods

bibliography(parsed_data) click to toggle source
# File lib/terrier/html_data.rb, line 42
# Formats a one-line citation string from already-extracted metadata.
#
# parsed_data - hash with :authors (must respond to join),
#               :publication_date, :title and :journal.
#
# Returns "<authors>. (<date>). <title>. <journal>. <reference>", where the
# reference comes from bibliography_reference. Nil fields render as empty
# strings.
def bibliography(parsed_data)
  author_list = parsed_data[:authors].join(', ')
  published   = parsed_data[:publication_date]
  title       = parsed_data[:title]
  journal     = parsed_data[:journal]

  format("%s. (%s). %s. %s. %s", author_list, published, title, journal, bibliography_reference)
end
bibliography_reference() click to toggle source
# File lib/terrier/html_data.rb, line 64
# Returns the trailing reference used in the bibliography string: an HTML
# link to doi.org when the page data has a DOI, otherwise the page URL.
def bibliography_reference
  doi = data[:doi]
  return data[:url] unless doi

  "<a href='https://doi.org/#{doi}'>DOI: #{doi}</a>"
end
citation_header() click to toggle source
# File lib/terrier/html_data.rb, line 72
# Returns a single-entry hash with an "Accept" key whose value names the
# CSL JSON media type.
def citation_header
  csl_json = "application/vnd.citationstyles.csl+json;q=1.0"
  { "Accept" => csl_json }
end
collect_meta_data(parsed_html, meta_names_array) click to toggle source
# File lib/terrier/html_data.rb, line 55
# Looks up <meta name="..."> content values, trying each candidate name in
# order and stopping at the first one that yields any values.
#
# parsed_html      - document object responding to xpath.
# meta_names_array - meta tag names to try, in priority order.
#
# Returns the list of content values for the first matching name, or an
# empty array when no name matches (or no names were given).
def collect_meta_data(parsed_html, meta_names_array)
  # Lazy evaluation keeps the original short-circuit: names after the first
  # hit are never queried.
  first_hit = meta_names_array.lazy.map do |meta_name|
    parsed_html.xpath("//meta[@name='#{meta_name}']/@content").map(&:value)
  end.reject(&:empty?).first

  first_hit || []
end
uri?(string) click to toggle source
# File lib/terrier/html_data.rb, line 46
# Reports whether the given string parses as a URI with an http or https
# scheme.
#
# Returns true or false. Strings that URI.parse rejects with
# URI::BadURIError or URI::InvalidURIError yield false instead of raising.
def uri?(string)
  scheme = URI.parse(string).scheme
  scheme == "http" || scheme == "https"
rescue URI::BadURIError, URI::InvalidURIError
  false
end
zenodo_pdf() click to toggle source
# File lib/terrier/html_data.rb, line 38
# Returns the first https://zenodo.org/...pdf link found in the raw page
# body, or "" when there is none. The result is memoized.
#
# The dot in "zenodo.org" is escaped so only that literal host prefix
# matches. Unescaped, it matched any character (e.g. "zenodoXorg").
def zenodo_pdf
  @zenodo_pdf ||= %r{\bhttps://zenodo\.org\S*pdf\b}.match(@raw).to_s
end