module CanvasDataClient::Helpers::CsvHelper
Public Instance Methods
download_latest_to_csv_file(table:, path:)
click to toggle source
# File lib/canvas_data_client/helpers/csv_helper.rb, line 35
# Writes the given table from the most recent dump to a CSV file.
#
# Looks up the latest dump via +latest+ and hands its 'dumpId' to
# +download_to_csv_file+ together with the table and path.
#
# @param table [String] name of the table to export
# @param path [String] destination path of the CSV file
# @return whatever +download_to_csv_file+ returns
def download_latest_to_csv_file(table:, path:)
  download_to_csv_file(dump_id: latest['dumpId'], table: table, path: path)
end
download_to_csv_file(dump_id:, table:, path:)
click to toggle source
# File lib/canvas_data_client/helpers/csv_helper.rb, line 8
# Writes every file of one table in a dump into a single CSV at +path+.
#
# The first CSV row holds the column names taken from the schema referenced by
# the dump's 'schemaVersion'. Each downloaded file is read line by line as
# tab-separated text; rows shorter than the header are padded with nil and the
# literal marker \N is written as nil.
#
# @param dump_id identifier passed to +dump+ and +renew_urls+
# @param table [String] key looked up in the dump's 'artifactsByTable'
# @param path [String] destination path of the CSV file
# @raise [TableNotPresentError] when the dump has no entry for +table+
def download_to_csv_file(dump_id:, table:, path:)
  dump_definition = dump(dump_id)
  schema_definition = schema(dump_definition['schemaVersion'])
  table_artifacts = dump_definition['artifactsByTable'][table]
  raise TableNotPresentError, "Table #{table} not present in dump #{dump_id}" unless table_artifacts

  csv = CSV.open(path, 'w')
  columns = table_headers(schema_definition, table)
  csv << columns
  Dir.mktmpdir do |dir|
    file_mappings = table_artifacts['files']
    file_mappings.each do |file_mapping|
      # renew_urls rewrites the 'url' of these same mapping hashes in place,
      # so file_mapping carries the fresh URL afterwards.
      renew_urls(dump_id, table, file_mappings) if url_expired?(file_mapping['url'])
      logger.info("Downloading table file: #{file_mapping['filename']}")
      file_path = download_raw_file(file_mapping, dir)
      logger.info("Processing table file: #{file_mapping['filename']}")
      File.foreach(file_path) do |row|
        fields = row.delete("\n").split("\t")
        missing = columns.length - fields.length
        # Pad short rows with nil so every row is as wide as the header.
        fields.fill(nil, fields.length...columns.length) if missing > 0
        csv << fields.map { |field| field == '\\N' ? nil : field }
      end
      FileUtils.rm_f file_path
    end
  end
ensure
  csv.close if csv
end
Private Instance Methods
download_raw_file(file_mapping, dir)
click to toggle source
# File lib/canvas_data_client/helpers/csv_helper.rb, line 45
# Downloads one gzipped table file and decompresses it inside +dir+.
#
# The compressed payload is first saved as <dir>/<name>, where <name> is the
# mapping's 'filename' without a trailing ".gz", and then inflated into
# <dir>/<name>.csv. The compressed intermediate is removed afterwards, also
# when the download or the decompression raises.
#
# The URL is opened through URI#open (provided by open-uri for http/https/ftp
# URIs) rather than Kernel#open: Kernel#open runs a subprocess for values that
# start with "|", and since Ruby 3.0 it no longer opens URLs at all.
#
# @param file_mapping [Hash] entry providing the 'url' and 'filename' keys
# @param dir [String] directory the files are written to
# @return [String] path of the decompressed file
def download_raw_file(file_mapping, dir)
  base_name = File.basename(file_mapping['filename'], '.gz')
  file_path = "#{dir}/#{base_name}"
  csv_file_path = "#{file_path}.csv"

  # Block form closes the response (StringIO or Tempfile) as soon as it has
  # been copied; IO.copy_stream handles both kinds of response alike.
  URI.parse(file_mapping['url']).open do |resp|
    File.open(file_path, 'wb') { |file| IO.copy_stream(resp, file) }
  end

  File.open(csv_file_path, 'wb') do |file|
    Zlib::GzipReader.open(file_path) do |gz|
      # Inflate in bounded chunks so large files are never held in memory.
      file.write(gz.readpartial(50_000)) until gz.eof?
    end
  end

  csv_file_path
ensure
  FileUtils.rm_f(file_path) if file_path
end
renew_urls(dump_id, table, mappings)
click to toggle source
# File lib/canvas_data_client/helpers/csv_helper.rb, line 71
# Replaces the 'url' of every entry in +mappings+ with a freshly issued one.
#
# Fetches the dump again and copies the new URLs over by position, i.e. the
# n-th refreshed file entry updates the n-th element of +mappings+. The
# mapping hashes are modified in place.
#
# @param dump_id identifier passed to +dump+
# @param table [String] key looked up in the dump's 'artifactsByTable'
# @param mappings [Array<Hash>] file mappings whose 'url' is overwritten
def renew_urls(dump_id, table, mappings)
  logger.info("Download URLs have expired. Pulling dump again to get a fresh set")
  refreshed_files = dump(dump_id)['artifactsByTable'][table]['files']
  refreshed_files.each_with_index do |fresh_mapping, position|
    mappings[position]['url'] = fresh_mapping['url']
  end
end
table_headers(schema_definition, table)
click to toggle source
# File lib/canvas_data_client/helpers/csv_helper.rb, line 41
# Returns the column names of +table+ as listed in the schema definition.
#
# Searches the entries under the 'schema' key for the one whose 'tableName'
# equals +table+ and collects the 'name' of each of its 'columns'.
#
# @param schema_definition [Hash] schema document with a 'schema' key
# @param table [String] table name to look up
# @return [Array] column names in schema order
# @raise [TableNotPresentError] when no schema entry matches +table+
#   (previously this surfaced as a NoMethodError on nil)
def table_headers(schema_definition, table)
  _key, table_schema = schema_definition['schema'].find do |_name, definition|
    definition['tableName'] == table
  end
  raise TableNotPresentError, "Table #{table} not present in schema" unless table_schema

  table_schema['columns'].map { |column| column['name'] }
end
url_expired?(url)
click to toggle source
# File lib/canvas_data_client/helpers/csv_helper.rb, line 65
# Tells whether a download URL has expired or will do so within 10 minutes.
#
# Reads the 'Expires' query parameter as an integer and compares it with the
# current epoch time plus a 600 second safety margin. A URL without a query
# string or without an 'Expires' parameter counts as expired (the value is
# treated as 0) instead of raising on a nil query.
#
# @param url [String] URL to inspect
# @return [Boolean] true when the URL should be renewed before use
def url_expired?(url)
  query = URI.parse(url).query.to_s
  # assoc yields the first [name, value] pair, or nil when the key is absent.
  _name, expires = URI.decode_www_form(query).assoc('Expires')
  safety_margin = 600
  expires.to_i < (Time.now.to_i + safety_margin)
end