module Gbbib::GbScrapper
National standard scrapper.
Public Class Methods
get_committee(doc)
click to toggle source
@param doc [Nokogiri::HTML] @return [Hash]
* :type [String] * :name [String]
# File lib/gbbib/gb_scrapper.rb, line 51
# Extracts the technical committee of a standard from its detail page.
#
# The committee name is the text enclosed in the first pair of parentheses
# found in the text nodes that follow the first link of a paragraph.
#
# NOTE(review): the pattern as rendered in this listing, /(?<=()[^)]+/, is not
# a valid regexp (the look-behind is never closed). It looks like full-width
# parentheses were flattened to ASCII somewhere along the way - confirm against
# the real lib/gbbib/gb_scrapper.rb. Both ASCII and full-width parentheses are
# accepted here, written as \u escapes so they survive re-encoding.
#
# @param doc [Nokogiri::HTML] parsed detail page
# @return [Hash]
#   * :type [String] always 'technical'
#   * :name [String] committee name, '' when no parenthesised text is found
def get_committee(doc)
  sibling_text = doc.xpath('//p/a[1]/following-sibling::text()').text
  # String#[] yields nil when nothing matches; to_s turns that into ''.
  name = sibling_text[/(?<=[(\uFF08])[^)\uFF09]+/].to_s
  { type: 'technical', name: name }
end
scrape_doc(pid)
click to toggle source
@param pid [String] standard's page id @return [Gbbib::GbBibliographicItem]
# File lib/gbbib/gb_scrapper.rb, line 37
# Fetches the detail page of a standard and builds a bibliographic item from
# the data scraped out of it.
#
# @param pid [String] standard's page id
# @return [Gbbib::GbBibliographicItem, nil] nil (after printing a warning)
#   when fetching or processing the page raises a StandardError
def scrape_doc(pid)
  # Built outside the rescued section, so a non-String pid still raises here.
  src = 'http://www.std.gov.cn/gb/search/gbDetailed?id=' + pid
  begin
    doc = Nokogiri::HTML(OpenURI.open_uri(src))
    GbBibliographicItem.new(scrapped_data(doc, src: src))
  rescue StandardError
    # Name the URL that was actually requested; the previous message pointed
    # at the search endpoint, which this method never touches.
    warn "Cannot access #{src}"
  end
end
scrape_page(text)
click to toggle source
@param text [String] code of the standard to search for @return [Gbbib::HitCollection]
# File lib/gbbib/gb_scrapper.rb, line 20
# Runs a search on std.gov.cn and wraps every result link in a Hit.
#
# @param text [String] code of the standard to search for, as plain text
#   (not URL-encoded; it is escaped here)
# @return [Gbbib::HitCollection, nil] nil (after printing a warning) when
#   fetching or processing the search page raises a StandardError
def scrape_page(text)
  # Escape the search text before it goes into the query string: a space or a
  # non-ASCII character in the raw text makes URI parsing fail, which used to
  # surface only as the generic "Cannot access" warning below.
  query = URI.encode_www_form_component(text)
  search_html = OpenURI.open_uri("http://www.std.gov.cn/search/stdPage?q=#{query}")
  result = Nokogiri::HTML(search_html)
  hits = result.css('.s-title a').map do |link|
    Hit.new pid: link[:pid], title: link.text, scrapper: self
  end
  HitCollection.new hits
rescue StandardError
  warn "Cannot access http://www.std.gov.cn/search/stdPage"
end