module TottoriOpenDataCatalog::Parser
Public Class Methods
parse_index(html)
click to toggle source
# File lib/tottori-opendata-catalog/parser.rb, line 6
#
# Parses the catalog index page.
#
# The page title becomes the catalog name, and every anchor whose href
# contains 'forweb_bunrui' is reported as a category.
#
# @param html [String] HTML of the index page (decoded as Shift_JIS)
# @return [Hash] { name: String, categories: [{ name: String, link: String }] }
def parse_index(html)
  doc = Nokogiri::HTML(html, nil, 'Shift_JIS')

  categories = doc.xpath('//a').each_with_object([]) do |anchor, list|
    href = anchor[:href]
    # Anchors without an href attribute (e.g. named anchors) are skipped
    # instead of raising NoMethodError on nil.
    next unless href && href.include?('forweb_bunrui')

    list << { name: anchor.text.strip, link: href.strip }
  end

  { name: doc.xpath('//title').text.strip, categories: categories }
end
parse_list(html)
click to toggle source
# File lib/tottori-opendata-catalog/parser.rb, line 23
#
# Parses a dataset list page.
#
# Every row of the table with id "contentslist" except the first one is
# turned into a Hash. Cells are read by position: name, formats, download
# link, department, division.
#
# @param html [String] HTML of the list page (decoded as Shift_JIS)
# @return [Array<Hash>] one Hash per row with the keys :name, :link,
#   :formats (upper-cased, split on whitespace), :url, :department, :division
def parse_list(html)
  doc = Nokogiri::HTML(html, nil, 'Shift_JIS')

  # href of the first child node of a cell; nil when the cell has no children
  # (the lookup used to raise NoMethodError on an empty cell).
  first_href = lambda do |cell|
    first_child = cell.children[0]
    first_child && first_child[:href]
  end

  doc.xpath('//table[@id="contentslist"]/tr[position() > 1]').map do |row|
    # Only element children are considered, so whitespace text nodes between
    # cells cannot shift the column indexes.
    cells = row.element_children

    {
      name: cells[0].text.strip,
      link: first_href.call(cells[0]),
      formats: cells[1].text.strip.upcase.split,
      url: first_href.call(cells[2]),
      department: cells[3].text.strip,
      division: cells[4].text.strip
    }
  end
end
parse_record(string)
click to toggle source
# File lib/tottori-opendata-catalog/parser.rb, line 47
#
# Parses a dataset detail page into a Hash of metadata.
#
# The text of every child node of <div id="all"> (directly inside a form) is
# matched against a list of "label: value" patterns. The first pattern that
# matches decides the key; when a key occurs more than once the last value
# wins.
#
# Post-processing:
# * :redistribution_allowed / :commercial_use_allowed become true for '○'
#   and false for '×'; any other value is left untouched.
# * The update field ("<year>.<month>.<day> <repeat rule>") is removed and
#   replaced by :year, :month, :day (Integer or nil) and :repeat_rule
#   ({ frequency:, interval:, description: }).
# * :tags, when present, is split on whitespace.
#
# @param string [String] HTML of the detail page (decoded as Shift_JIS)
# @return [Hash] the extracted metadata
# @raise [RuntimeError] when a repeat-rule description is present but does
#   not correspond to any known frequency
def parse_record(string)
  doc = Nokogiri::HTML(string, nil, 'Shift_JIS')

  # Checked in order; the first match wins. The label separator may be an
  # ASCII colon or a full-width colon (U+FF1A).
  # NOTE(review): the 'テータ提供' label may be a typo for 'データ提供' -
  # confirm against the actual page before changing it.
  field_patterns = [
    [:updated_at, /データ年次.*?[:\uFF1A](.*)$/],
    [:tags, /情報.*?[:\uFF1A](.*)$/],
    [:redistribution_allowed, /再配布可否.*?[:\uFF1A](.*)$/],
    [:commercial_use_allowed, /商用利用可否.*?[:\uFF1A](.*)$/],
    [:provider, /テータ提供.*?[:\uFF1A](.*)$/],
    [:tel, /問い合わせ先電話番号.*?[:\uFF1A](.*)$/],
    [:description, /内容.*?[:\uFF1A](.*)$/],
    [:comment, /コメント.*?[:\uFF1A](.*)$/]
  ]

  # collect
  # Starting from an empty Hash keeps a page without any recognizable field
  # from producing nil.
  item = {}
  doc.xpath('//form/div[@id="all"]').children.each do |node|
    text = node.text.strip
    field_patterns.each do |key, pattern|
      match = pattern.match(text)
      next unless match

      item[key] = match[1].strip
      break
    end
  end

  # trim
  marks = { '○' => true, '×' => false }
  [:redistribution_allowed, :commercial_use_allowed].each do |key|
    item[key] = marks[item[key]] if marks.key?(item[key])
  end

  # Normalize the full-width space (U+3000) and full-width digits
  # (U+FF10..U+FF19) to ASCII. A missing field is treated as an empty string.
  updated_at = item.delete(:updated_at).to_s.tr("\u3000\uFF10-\uFF19", ' 0-9')
  date, repeat_rule_description = updated_at.split
  year, month, day = date.to_s.split('.')

  item[:year] = year && year.to_i
  item[:month] = month && month.to_i
  item[:day] = day && day.to_i

  # "every ..." rules have no interval; "<n> ... each" rules carry one.
  # \d+ keeps multi-digit intervals intact (a single \d truncated "10" to 0).
  frequency, interval =
    case repeat_rule_description
    when /毎年度/ then ['fiscal_yearly', nil]
    when /毎年/ then ['yearly', nil]
    when /毎月/ then ['monthly', nil]
    when /毎週/ then ['weekly', nil]
    when /(\d+)年度毎/ then ['fiscal_yearly', Regexp.last_match(1).to_i]
    when /(\d+)年毎/ then ['yearly', Regexp.last_match(1).to_i]
    when /(\d+)月毎/, /(\d+)ヶ月毎/ then ['monthly', Regexp.last_match(1).to_i]
    when /(\d+)週毎/ then ['weekly', Regexp.last_match(1).to_i]
    when /随時/ then ['as_needed', nil]
    when /不定期/ then ['unscheduled', nil]
    when /予算編成時/ then ['budgeting', nil]
    end

  # Raise when a repeat-rule description is present but no frequency could
  # be derived from it.
  if repeat_rule_description && frequency.nil?
    raise "unrecognized repeat rule: #{repeat_rule_description}"
  end

  item[:repeat_rule] = {
    frequency: frequency,
    interval: interval,
    description: repeat_rule_description
  }

  item[:tags] = item[:tags].split if item.key?(:tags)
  item
end