class Mvp::Bigquery
Public Class Methods
new(options = {})
click to toggle source
# File lib/mvp/bigquery.rb, line 7
#
# Opens the BigQuery dataset named in the configuration and makes sure the
# +forge_itemized+ and +github_puppetfile_usage+ tables exist, creating them
# with their schemas when the lookup returns nothing.
#
# @param options [Hash] settings; reads +:cachedir+ and, under +:gcloud+,
#   the +:project+, +:keyfile+ and +:dataset+ entries.
# @raise [RuntimeError] when the configured dataset lookup returns nil.
def initialize(options = {})
  @options  = options
  @cachedir = options[:cachedir]

  gcloud    = options[:gcloud]
  @bigquery = Google::Cloud::Bigquery.new(
    project_id:  gcloud[:project],
    credentials: Google::Cloud::Bigquery::Credentials.new(gcloud[:keyfile])
  )
  @dataset = @bigquery.dataset(gcloud[:dataset])

  if @dataset.nil?
    raise "\nThere is a problem with the gCloud configuration: \n #{JSON.pretty_generate(options)}"
  end

  # Reuse the existing table when there is one; otherwise create it.
  @itemized = @dataset.table('forge_itemized')
  @itemized ||= @dataset.create_table('forge_itemized') do |tbl|
    tbl.name        = 'Itemized dependencies between modules'
    tbl.description = 'A list of all types/classes/functions used by each module and where they come from'
    tbl.schema do |col|
      col.string  'module',  mode: :required
      col.string  'version', mode: :required
      col.string  'source'
      col.string  'kind',    mode: :required
      col.string  'element', mode: :required
      col.integer 'count',   mode: :required
    end
  end

  @puppetfile_usage = @dataset.table('github_puppetfile_usage')
  @puppetfile_usage ||= @dataset.create_table('github_puppetfile_usage') do |tbl|
    tbl.name        = 'Puppetfile Module Usage'
    tbl.description = 'A list of all modules referenced in public Puppetfiles'
    tbl.schema do |col|
      col.string 'repo_name', mode: :required
      col.string 'module',    mode: :required
      col.string 'type',      mode: :required
      col.string 'source'
      col.string 'version'
      col.string 'md5',       mode: :required
    end
  end
end
Public Instance Methods
delete(entity, field, match, suite = 'forge')
click to toggle source
# File lib/mvp/bigquery.rb, line 225
#
# Deletes every row of the "<suite>_<entity>" table whose +field+ equals
# +match+.
#
# The value is sent as a named query parameter rather than spliced into the
# SQL text, so quotes or SQL fragments inside +match+ cannot change the
# statement. It is bound as a string, mirroring the quoted literal the
# statement used before.
#
# NOTE(review): +suite+, +entity+ and +field+ are identifiers and cannot be
# bound as parameters; they are still interpolated and must come from
# trusted code, never from external input.
#
# @param entity [#to_s] table name without the suite prefix
# @param field  [#to_s] column to compare
# @param match  [#to_s] value the column must equal
# @param suite  [#to_s] table-name prefix (defaults to 'forge')
# @return whatever the dataset's +query+ call returns
def delete(entity, field, match, suite = 'forge')
  @dataset.query(
    "DELETE FROM #{suite}_#{entity} WHERE #{field} = @match",
    params: { match: match.to_s }
  )
end
get(entity, fields, suite = 'forge')
click to toggle source
# File lib/mvp/bigquery.rb, line 229
#
# Selects the given columns from the "<suite>_<entity>" table.
#
# An empty column list is rejected up front: it would otherwise build the
# malformed statement "SELECT  FROM ..." and only fail once it had been sent
# to the query service.
#
# @param entity [#to_s] table name without the suite prefix
# @param fields [Array<#to_s>] column expressions, joined with ", "
# @param suite  [#to_s] table-name prefix (defaults to 'forge')
# @return whatever the dataset's +query+ call returns
# @raise [RuntimeError] if +fields+ is not an Array or is empty
def get(entity, fields, suite = 'forge')
  raise 'pass fields as an array' unless fields.is_a?(Array)
  raise 'pass at least one field' if fields.empty?

  columns = fields.join(', ')
  @dataset.query("SELECT #{columns} FROM #{suite}_#{entity}")
end
insert(entity, data, suite = 'forge')
click to toggle source
# File lib/mvp/bigquery.rb, line 209
#
# Inserts rows into the "<suite>_<entity>" table and logs any rows that were
# rejected.
#
# Does nothing when the +:noop+ option is set or when there is nothing to
# insert. A nil +data+ is treated like an empty batch instead of raising
# NoMethodError on +nil.empty?+.
#
# @param entity [#to_s] table name without the suite prefix
# @param data   [Array<Hash>, nil] rows to insert
# @param suite  [#to_s] table-name prefix (defaults to 'forge')
# @return [nil] when skipped or when every row was accepted; otherwise the
#   value returned by iterating the response's insert errors
def insert(entity, data, suite = 'forge')
  return if @options[:noop]
  return if data.nil? || data.empty?

  table    = @dataset.table("#{suite}_#{entity}")
  response = table.insert(data)
  return if response.success?

  $logger.error '========================================================================='
  response.insert_errors.each do |err|
    # The 'metadata' column is left out of the debug dump of the failed row.
    $logger.debug JSON.pretty_generate(err.row.reject { |key, _value| key == 'metadata' })
    $logger.error JSON.pretty_generate(err.errors)
  end
end
mirror_table(entity)
click to toggle source
# File lib/mvp/bigquery.rb, line 185
#
# Materialises a query as either a view or a table in the dataset.
#
# For +:view+ the existing object of that name is deleted (failures of that
# delete are ignored) and the view is created from the query. For +:table+
# a query job writes into the named table with write mode 'truncate' and
# the call waits for the job to finish. Any other type is logged as an
# error. Exceptions raised while doing this are logged, not re-raised.
#
# Does nothing when the +:noop+ option is set.
#
# @param entity [Hash] reads +:type+, +:name+ and +:query+
def mirror_table(entity)
  return if @options[:noop]

  begin
    kind = entity[:type]

    if kind == :view
      @dataset.table(entity[:name]).delete rescue nil # delete if exists
      @dataset.create_view(entity[:name], entity[:query])
    elsif kind == :table
      sql         = entity[:query]
      destination = @dataset.table(entity[:name], skip_lookup: true)
      job         = @dataset.query_job(sql, write: 'truncate', table: destination)
      job.wait_until_done!
    else
      $logger.error "Unknown mirror type: #{kind}"
    end
  rescue => err
    $logger.error("(Google Cloud error: #{err.message})")
    $logger.debug err.backtrace.join("\n")
  end
end
module_sources()
click to toggle source
# File lib/mvp/bigquery.rb, line 234
#
# Fetches the +slug+ and +source+ columns of the modules table by delegating
# to #get with the default suite.
#
# @return whatever #get returns for that query
def module_sources
  wanted = %w[slug source]
  get('modules', wanted)
end
puppetfiles()
click to toggle source
# File lib/mvp/bigquery.rb, line 238
#
# Queries Puppetfile contents whose md5 has no matching row for the same
# repository in +github_puppetfile_usage+, excluding repositories whose
# lower-cased name contains "boxen".
#
# @return whatever the dataset's +query+ call returns
def puppetfiles
  statement =
    'SELECT f.repo_name, f.path, c.content, c.md5 ' \
    'FROM github_puppetfile_files AS f ' \
    'JOIN github_puppetfile_contents AS c ON c.id = f.id ' \
    'WHERE c.md5 NOT IN ( ' \
    'SELECT u.md5 FROM github_puppetfile_usage AS u WHERE u.repo_name = f.repo_name ' \
    ') AND LOWER(repo_name) NOT LIKE "%boxen%"'

  @dataset.query(statement)
end
retrieve(entity)
click to toggle source
# File lib/mvp/bigquery.rb, line 181
#
# Fetches every column of an entity's table by delegating to #get with a
# single "*" field and the default suite.
#
# @param entity [#to_s] table name without the suite prefix
# @return whatever #get returns for that query
def retrieve(entity)
  every_column = ['*']
  get(entity, every_column)
end
test()
click to toggle source
# File lib/mvp/bigquery.rb, line 270
#
# Debugging hook: loads pry on demand and opens a pry session bound to this
# method's scope.
def test
  require 'pry' # loaded here, inside the method, rather than at file load
  binding.pry
end
truncate(entity)
click to toggle source
# File lib/mvp/bigquery.rb, line 45
#
# Drops and recreates the Forge table(s) behind an entity so they start out
# empty with the schema declared here.
#
# * +:authors+  recreates +forge_authors+
# * +:modules+  recreates +forge_modules+ and +forge_validations+
# * +:releases+ recreates +forge_releases+
#
# Failures of the delete step are ignored; any other exception is logged
# rather than re-raised. Does nothing when the +:noop+ option is set.
#
# @param entity [Symbol] one of +:authors+, +:modules+, +:releases+; any
#   other value matches no branch
def truncate(entity)
  return if @options[:noop]

  begin
    case entity
    when :authors
      @dataset.table('forge_authors').delete rescue nil
      @dataset.create_table('forge_authors') do |tbl|
        tbl.name        = 'Forge Authors'
        tbl.description = 'A list of all authors (users) on the Forge'
        tbl.schema do |col|
          col.integer   'module_count',  mode: :required
          col.integer   'release_count', mode: :required
          col.timestamp 'created_at',    mode: :required
          col.string    'display_name',  mode: :required
          col.string    'username',      mode: :required
          col.timestamp 'updated_at',    mode: :required
          col.string    'gravatar_id',   mode: :required
          col.string    'slug',          mode: :required
          col.string    'uri',           mode: :required
        end
      end

    when :modules
      # This entity covers two tables: modules and their validation scores.
      @dataset.table('forge_modules').delete rescue nil
      @dataset.create_table('forge_modules') do |tbl|
        tbl.name        = 'Forge Modules'
        tbl.description = 'All modules and their metadata on the Forge'
        tbl.schema do |col|
          col.string    'name',             mode: :required
          col.string    'owner',            mode: :required
          col.string    'version',          mode: :required
          col.string    'slug',             mode: :required
          col.string    'uri',              mode: :required
          col.timestamp 'created_at',       mode: :required
          col.timestamp 'updated_at',       mode: :required
          col.string    'tasks',            mode: :repeated
          col.string    'plans',            mode: :repeated
          col.string    'homepage_url'
          col.string    'project_page'
          col.string    'issues_url'
          col.string    'source'
          col.boolean   'supported'
          col.string    'endorsement'
          col.string    'module_group'
          col.boolean   'pdk'
          col.string    'operatingsystem',  mode: :repeated
          col.integer   'release_count',    mode: :required
          col.integer   'downloads',        mode: :required
          col.integer   'feedback_score'
          col.integer   'validation_score'
          col.string    'releases',         mode: :repeated
          col.string    'puppet_range'
          col.boolean   'puppet_2x'
          col.boolean   'puppet_3x'
          col.boolean   'puppet_4x'
          col.boolean   'puppet_5x'
          col.boolean   'puppet_6x'
          col.boolean   'puppet_99x'
          col.string    'superseded_by'
          col.string    'deprecated_for'
          col.timestamp 'deprecated_at'
          col.timestamp 'deleted_at'
          col.string    'dependencies',     mode: :repeated
          col.string    'license'
          col.string    'metadata',         mode: :required
        end
      end

      @dataset.table('forge_validations').delete rescue nil
      @dataset.create_table('forge_validations') do |tbl|
        tbl.name        = 'Forge Module Validations'
        tbl.description = 'Validation scores for all the modules on the Forge'
        tbl.schema do |col|
          col.integer 'total'
          col.integer 'parser'
          col.integer 'metadata'
          col.integer 'lint'
          col.string  'name', mode: :required
        end
      end

    when :releases
      @dataset.table('forge_releases').delete rescue nil
      @dataset.create_table('forge_releases') do |tbl|
        tbl.name        = 'Forge Releases'
        tbl.description = 'Releases of all modules on the Forge'
        tbl.schema do |col|
          col.string    'name',             mode: :required
          col.string    'owner',            mode: :required
          col.string    'version',          mode: :required
          col.string    'slug',             mode: :required
          col.string    'uri',              mode: :required
          col.timestamp 'created_at',       mode: :required
          col.timestamp 'updated_at',       mode: :required
          col.timestamp 'deleted_at'
          col.string    'deleted_for'
          col.string    'tasks',            mode: :repeated
          col.string    'plans',            mode: :repeated
          col.string    'project_page'
          col.string    'issues_url'
          col.string    'source'
          col.boolean   'supported'
          col.boolean   'pdk'
          col.string    'tags',             mode: :repeated
          col.string    'operatingsystem',  mode: :repeated
          col.integer   'downloads',        mode: :required
          col.integer   'feedback_score'
          col.integer   'validation_score'
          col.string    'puppet_range'
          col.boolean   'puppet_2x'
          col.boolean   'puppet_3x'
          col.boolean   'puppet_4x'
          col.boolean   'puppet_5x'
          col.boolean   'puppet_6x'
          col.boolean   'puppet_99x'
          col.string    'dependencies',     mode: :repeated
          col.string    'file_uri',         mode: :required
          col.string    'file_md5'
          col.string    'file_sha256'
          col.integer   'file_size',        mode: :required
          col.string    'license'
          col.string    'metadata',         mode: :required
        end
      end

      # Pause so BigQuery has time to flush the schema changes.
      # NOTE(review): as written this pause belongs to the :releases branch
      # only - confirm whether the other entities were meant to wait too.
      sleep 5
    end
  rescue => err
    $logger.error err.message
    $logger.debug err.backtrace.join("\n")
    # NOTE(review): the failure path also assigns @channels from the
    # slack_channels table; the reason is not visible from here - confirm.
    @channels = @dataset.table('slack_channels')
  end
end
unitemized()
click to toggle source
# File lib/mvp/bigquery.rb, line 252
#
# Queries the name, slug, version and dependencies of every module whose
# version has no row in +forge_itemized+ for that module's slug.
#
# @return whatever the dataset's +query+ call returns
def unitemized
  statement =
    'SELECT m.name, m.slug, m.version, m.dependencies ' \
    'FROM forge_modules AS m ' \
    'WHERE m.version NOT IN ( ' \
    'SELECT i.version FROM forge_itemized AS i WHERE module = m.slug ' \
    ')'

  @dataset.query(statement)
end
version_itemized?(mod, version)
click to toggle source
# File lib/mvp/bigquery.rb, line 263
#
# Tells whether +forge_itemized+ already holds rows for the given module at
# the given version.
#
# The module name is bound as a named query parameter instead of being
# spliced into the SQL text, so a name containing a quote can neither break
# nor alter the statement. It is bound as a string, mirroring the quoted
# literal the statement used before.
#
# @param mod     [#to_s] value compared against the +module+ column
# @param version [Object] version looked for among the returned rows
# @return [Boolean] true when +version+ is among the stored versions; false
#   otherwise, including when the query itself fails
def version_itemized?(mod, version)
  sql = 'SELECT DISTINCT version FROM forge_itemized WHERE module = @mod'

  known_versions =
    begin
      @dataset.query(sql, params: { mod: mod.to_s }).map { |row| row[:version] }
    rescue StandardError
      # A failed lookup is treated as "nothing itemized yet", as before.
      []
    end

  known_versions.include?(version)
end