require 'net/http'
require 'open-uri'

namespace :load_maps do

  # Rebuilds the MARC geographic-area-code translation map by scraping the
  # Library of Congress code list page and writing one YAML entry per code.
  #
  # Output path: ENV["OUTPUT_TO"] if set, otherwise
  # ../../translation_maps/marc_geographic.yaml relative to this file.
  #
  # Requires the nokogiri gem; exits with status 1 if it cannot be loaded.
  desc "Load MARC geo codes by screen-scraping LC"
  task :marc_geographic do
    begin
      require 'nokogiri'
    rescue LoadError
      $stderr.puts "\n  load_maps:marc_geographic task requires nokogiri"
      $stderr.puts "  Try `gem install nokogiri` and try again. Exiting...\n\n"
      exit 1
    end

    source_url = "http://www.loc.gov/marc/geoareas/gacs_code.html"
    filename = ENV["OUTPUT_TO"] || File.expand_path("../../translation_maps/marc_geographic.yaml", __FILE__)

    # Wrap a value in YAML single quotes; YAML escapes an embedded single
    # quote by doubling it.
    yaml_quote = lambda { |text| "'#{text.gsub("'", "''")}'" }

    # Download and parse BEFORE opening the output file, so a failed fetch
    # does not truncate an existing map. URI.open (Ruby >= 2.5) replaces the
    # bare Kernel#open-on-a-URL form that open-uri dropped in Ruby 3.0; the
    # block form closes the downloaded stream once it has been read.
    html = Nokogiri::HTML(URI.open(source_url, &:read))

    $stderr.puts "Writing to `#{filename}` ..."

    # Block form guarantees the file is flushed and closed, even on error.
    File.open(filename, "w:utf-8") do |file|
      file.puts "# Translation map for marc geographic codes constructed by `rake load_maps:marc_geographic` task"
      file.puts "# Scraped from #{source_url} at #{Time.now}"
      file.puts "# Intentionally includes discontinued codes."
      file.puts "\n"

      html.css("tr").each do |row|
        code = row.css("td.code").inner_text.strip
        # Rows with no (or an empty) td.code cell carry no mapping; skip them.
        next if code.empty?

        # Drop the leading '-' so a discontinued code is treated like any other.
        code = code.gsub(/^\-/, '')

        # The page sometimes wraps labels across lines; collapse each newline
        # plus its following indentation into a single space.
        label = row.css("td[2]").inner_text.strip.gsub(/\n */, ' ')

        file.puts "#{yaml_quote.call(code)}: #{yaml_quote.call(label)}"
      end
    end

    $stderr.puts "Done."
  end

end