require 'json' require 'gov_scooper' require 'fileutils' # require 'pry'
namespace :gov_scooper do
  # Returns the directory named by the DATA_DIR environment variable.
  # Raises RuntimeError with a usage hint when the variable is unset.
  # (ENV.fetch without a default would raise KeyError first, so the hint
  # would never be shown.)
  data_dir = lambda do
    ENV['DATA_DIR'] || raise('Please provide environment variable DATA_DIR' \
                             ' for opengeometdata directory location')
  end

  # Builds the path of the layers.json index stored under +root+.
  layers_index = ->(root) { File.join(root, 'pairtree_root', 'layers.json') }

  # Copies the files of GS_SAMPLE (default 1) randomly chosen layers from
  # DATA_DIR into GS_OUTPUT, then invokes create_layers_json and
  # download_data with DATA_DIR temporarily pointing at GS_OUTPUT.
  desc 'Sample OGM data and move it to a new directory'
  task :sample do
    # Integer(nil) raises, so the default must be supplied before conversion.
    number = Integer(ENV.fetch('GS_SAMPLE', '1'))
    ogm_loc = data_dir.call
    sample_output = ENV['GS_OUTPUT'] || raise('Please provide output directory GS_OUTPUT')

    layers = JSON.parse(File.read(layers_index.call(ogm_loc)))
    puts "#{layers.length} layers found"
    random_layers = layers.to_a.sample(number)
    puts "Sampling #{random_layers.length} layers"

    random_layers.each do |_id, relative_path|
      output = File.join(sample_output, 'pairtree_root', relative_path)
      FileUtils.mkdir_p output
      Dir[File.join(ogm_loc, 'pairtree_root', relative_path, '*')].each do |file_name|
        # Only the layer's own files are copied; nested directories are skipped.
        next if File.directory? file_name

        puts "Copying #{file_name}"
        FileUtils.cp file_name, output
      end
    end

    # The follow-up tasks read DATA_DIR themselves, so it is redirected to the
    # sample directory and restored afterwards even if one of them fails.
    ENV['DATA_DIR'] = sample_output
    begin
      Rake::Task['gov_scooper:create_layers_json'].invoke
      Rake::Task['gov_scooper:download_data'].invoke
    ensure
      ENV['DATA_DIR'] = ogm_loc
    end
  end

  # Reads layers.json under DATA_DIR and calls #download on every resource of
  # every dataset listed there, printing the total number of resources.
  desc 'Download data for layers in a given directory - Be careful with this'
  task :download_data do
    ogm_loc = data_dir.call
    layers = JSON.parse(File.read(layers_index.call(ogm_loc)))
    puts "#{layers.length} layers found"

    resource_count = 0
    layers.each do |dataset_id, _relative_path|
      dataset = DataGov::Dataset.from_id(dataset_id)
      puts "Downloading from dataset #{dataset.id}"
      resources = dataset.resources
      resource_count += resources.length
      resources.each(&:download)
    end
    puts "#{resource_count} total resources"
  end

  # Scans DATA_DIR/pairtree_root for ckan.json files and writes layers.json,
  # mapping each dataset id to the directory (relative to pairtree_root, with
  # trailing slash) that holds its ckan.json.
  desc 'Create layers.json'
  task :create_layers_json do
    ogm_loc = data_dir.call
    ckan_files = Dir[File.join(ogm_loc, 'pairtree_root', '**', 'ckan.json')]

    # Accumulating into a fresh hash yields {} (not nil) when nothing is found,
    # so the written index is always a JSON object.
    index = ckan_files.each_with_object({}) do |path, acc|
      dataset = DataGov::Dataset.new(JSON.parse(File.read(path)))
      acc[dataset.id] = path.sub(%r{.*pairtree_root/}, '').sub('ckan.json', '')
    end

    puts "layers.json created for #{ckan_files.count} files"
    File.write(layers_index.call(ogm_loc), JSON.pretty_generate(index))
  end
end