module Iecbib::Scrapper
Scrapper
rubocop:disable Metrics/ModuleLength
Constants
- DOMAIN
- TYPES
Public Class Methods
Parse page. @param hit_data [Hash] @return [IsoBibItem::IsoBibliographicItem] rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# File lib/iecbib/scrapper.rb, line 52 def parse_page(hit_data) doc = get_page hit_data[:url] # Fetch edition. edition = doc.at("//th[contains(., 'Edition')]/following-sibling::td/span").text status, relations = fetch_status_relations hit_data[:url] IsoBibItem::IsoBibliographicItem.new( docid: fetch_docid(doc), edition: edition, language: ['en'], script: ['Latn'], titles: fetch_titles(hit_data), type: fetch_type(doc), docstatus: status, ics: fetch_ics(doc), dates: fetch_dates(doc), contributors: fetch_contributors(hit_data[:code]), workgroup: fetch_workgroup(doc), abstract: fetch_abstract(doc), copyright: fetch_copyright(hit_data[:code], doc), link: fetch_link(doc, hit_data[:url]), relations: relations ) end
Private Class Methods
Fetch abstracts. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
# File lib/iecbib/scrapper.rb, line 125 def fetch_abstract(doc) abstract_content = doc.at('//div[@itemprop="description"]').text [{ content: abstract_content, language: 'en', script: 'Latn' }] end
# File lib/iecbib/scrapper.rb, line 341 def fetch_contributors(code) code.sub(/\s.*/, '').split('/').map do |abbrev| case abbrev when 'ISO' name = 'International Organization for Standardization' url = 'www.iso.org' when 'IEC' name = 'International Electrotechnical Commission' url = 'www.iec.ch' end { entity: { name: name, url: url, abbreviation: abbrev }, roles: ['publisher'] } end end
Fetch copyright. @param code [String] @param doc [Nokogiri::HTML::Document] @return [Hash]
# File lib/iecbib/scrapper.rb, line 380 def fetch_copyright(code, doc) abbreviation = code.match(/.*?(?=\s)/).to_s case abbreviation when 'IEC' name = 'International Electrotechnical Commission' url = 'www.iec.ch' end from = code.match(/(?<=:)\d{4}/).to_s if from.empty? from = doc.xpath("//span[@itemprop='releaseDate']").text .match(/\d{4}/).to_s end { owner: { name: name, abbreviation: abbreviation, url: url }, from: from } end
Fetch dates @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
# File lib/iecbib/scrapper.rb, line 332 def fetch_dates(doc) dates = [] publish_date = doc.at("//span[@itemprop='releaseDate']").text unless publish_date.empty? dates << { type: 'published', on: publish_date } end dates end
Fetch docid. @param doc [Nokogiri::HTML::Document] @return [Hash]
# File lib/iecbib/scrapper.rb, line 172 def fetch_docid(doc) item_ref = doc.at("//span[@itemprop='productID']") unless item_ref return { project_number: '?', part_number: '', prefix: nil, id: '?' } end m = item_ref.text.match(/(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/) { project_number: m[:project], part_number: m[:part], subpart_number: m[:subpart], prefix: nil, type: 'IEC', id: item_ref.text } end
Fetch ICS. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>]
# File lib/iecbib/scrapper.rb, line 359 def fetch_ics(doc) doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i| code = i.text.match(/[\d\.]+/).to_s.split '.' { field: code[0], group: code[1], subgroup: code[2] } end end
Fetch links. @param doc [Nokogiri::HTML::Document] @param url [String] @return [Array<Hash>]
# File lib/iecbib/scrapper.rb, line 370 def fetch_link(doc, url) links = [{ type: 'src', content: url }] obp_elms = doc.at_css('p.btn-preview a') links << { type: 'obp', content: obp_elms[:href] } if obp_elms links end
Fetch relations. @param doc [Nokogiri::HTML::Document] @return [Array<Hash>] rubocop:disable Metrics/MethodLength
# File lib/iecbib/scrapper.rb, line 226 def fetch_relations(doc) doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]').map do |r| r_type = r.at('STATUS').text.downcase type = case r_type # when 'published' then 'obsoletes' # Valid when 'revised', 'replaced' then 'updates' when 'withdrawn' then 'obsoletes' else r_type end url = DOMAIN + '/publication/' + r.at('PUB_ID').text { type: type, identifier: r.at('FULL_NAME').text, url: url } end end
Fetch status. @param doc [Nokogiri::XML::Document] @return [Hash]
# File lib/iecbib/scrapper.rb, line 192 def fetch_status(doc) wip = doc.at('//ROW[STATUS[.="PREPARING"]]') if wip statuses = YAML.load_file 'lib/iecbib/statuses.yml' s = wip.at('STAGE').text stage, substage = statuses[s]['stage'].split '.' status = statuses[s]['status'] else status = 'Published' stage = '60' substage = '60' end { status: status, stage: stage, substage: substage } end
# File lib/iecbib/scrapper.rb, line 240 def fetch_status_relations(url) pubid = url.match(/\d+$/).to_s uri = URI DOMAIN + '/webstore/webstore.nsf/AjaxRequestXML?'\ 'Openagent&url=http://www.iec.ch/dyn/www/f?'\ 'p=103:390:::::P390_PUBLICATION_ID:' + pubid resp = Net::HTTP.get_response uri doc = Nokogiri::XML resp.body status = fetch_status doc relations = fetch_relations doc [status, relations] # doc.css('ul.steps li').inject([]) do |a, r| # r_type = r.css('strong').text # type = case r_type # when 'Previously', 'Will be replaced by' then 'obsoletes' # when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed' # 'updates' # else r_type # end # if ['Now', 'Now under review'].include? type # a # else # a + r.css('a').map do |id| # { type: type, identifier: id.text, url: id['href'] } # end # end # end end
Fetch titles. @param hit_data [Hash] @return [Array<Hash>]
# File lib/iecbib/scrapper.rb, line 293 def fetch_titles(hit_data) titles = hit_data[:title].split ' - ' case titles.size when 0 intro, main, part = nil, "", nil when 1 intro, main, part = nil, titles[0], nil when 2 if /^(Part|Partie) \d+:/ =~ titles[1] intro, main, part = nil, titles[0], titles[1] else intro, main, part = titles[0], titles[1], nil end when 3 intro, main, part = titles[0], titles[1], titles[2] else intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ") end [{ title_intro: intro, title_main: main, title_part: part, language: 'en', script: 'Latn' }] end
Fetch type. @param doc [Nokogiri::HTML::Document] @return [String]
# File lib/iecbib/scrapper.rb, line 272 def fetch_type(doc) doc.at('//th[contains(., "Publication type")]/following-sibling::td/span') .text.downcase.tr ' ', '-' # type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF| # /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x) # #return "international-standard" if type_match.nil? # if TYPES[type_match[2]] # TYPES[type_match[2]] # elsif type_match[1] # elsif type_match[1] == 'ISO' # 'international-standard' # elsif type_match[1] == 'IWA' # 'international-workshop-agreement' # end # # rescue => _e # # puts 'Unknown document type: ' + title end
Fetch workgroup. @param doc [Nokogiri::HTML::Document] @return [Hash]
# File lib/iecbib/scrapper.rb, line 210 def fetch_workgroup(doc) wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text { name: 'International Electrotechnical Commission', abbreviation: 'IEC', url: 'webstore.iec.ch', technical_committee: { name: wg, type: 'technicalCommittee', number: wg.match(/\d+/)&.to_s&.to_i } } end
Get page. @param url [String] full page URL @return [Nokogiri::HTML::Document] rubocop:disable Metrics/AbcSize, Metrics/MethodLength
# File lib/iecbib/scrapper.rb, line 151 def get_page(url) uri = URI url resp = Net::HTTP.get_response(uri)#.encode("UTF-8") if resp.code == '301' path = resp['location'] url = DOMAIN + path uri = URI url resp = Net::HTTP.get_response(uri)#.encode("UTF-8") end # n = 0 # while resp.body !~ /<strong/ && n < 10 # resp = Net::HTTP.get_response(uri)#.encode("UTF-8") # n += 1 # end Nokogiri::HTML(resp.body) end