class WorkflowManager::FGCZCluster

Public Instance Methods

cluster_nodes() click to toggle source
# File lib/workflow_manager/cluster.rb, line 218
# Returns the hard-coded table of cluster nodes.
#
# Every key is a display label of the form "<host>: <spec text>" and every
# value is the bare host name that the label starts with.
#
# @return [Hash{String => String}] display label => host name, in table order
def cluster_nodes
  # [host name, spec text]. The spec text, including its column padding, is
  # part of the label and is kept exactly as written.
  table = [
    ['fgcz-c-043', 'cpu 24,mem  23 GB,scr  11T'],
    ['fgcz-c-044', 'cpu 16,mem 128 GB,scr 500G'],
    ['fgcz-c-045', 'cpu 64,mem 504 GB,scr  15T'],
    ['fgcz-c-046', 'cpu 64,mem 504 GB,scr  11T'],
    ['fgcz-c-047', 'cpu 32,mem   1 TB,scr  28T'],
    ['fgcz-c-048', 'cpu 48,mem 252 GB,scr 3.5T'],
    ['fgcz-c-049', 'cpu  8,mem  63 GB,scr 1.7T'],
    ['fgcz-c-051', 'cpu  8,mem  31 GB,scr 800G'],
    ['fgcz-c-052', 'cpu  8,mem  31 GB,scr 800G'],
    ['fgcz-c-053', 'cpu  8,mem  31 GB,scr 800G'],
    ['fgcz-c-054', 'cpu  8,mem  31 GB,scr 800G'],
    ['fgcz-c-055', 'cpu  8,mem  31 GB,scr 800G'],
    ['fgcz-c-057', 'cpu  8,mem  31 GB,scr 200G'],
    ['fgcz-c-058', 'cpu  8,mem  31 GB,scr 200G'],
    ['fgcz-c-059', 'cpu  8,mem  31 GB,scr 200G'],
    ['fgcz-c-061', 'cpu  8,mem  31 GB,scr 200G'],
    ['fgcz-c-063', 'cpu 12,mem  70 GB,scr 450G'],
    ['fgcz-c-065', 'cpu 24,mem  70 GB,scr 197G'],
    ['fgcz-h-004', 'cpu 8,mem  30 GB,scr 400G'],
    ['fgcz-h-009', 'cpu 8,mem  30 GB,scr 500G'],
    ['fgcz-h-010', 'cpu 8,mem  30 GB,scr 400G']
  ]

  table.each_with_object({}) do |(host, spec), labelled|
    labelled["#{host}: #{spec}"] = host
  end
end
copy_commands(org_dir, dest_parent_dir, now=nil) click to toggle source
# File lib/workflow_manager/cluster.rb, line 202
# Builds the g-req command line(s) that copy +org_dir+ into +dest_parent_dir+.
#
# The mode is chosen by +now+:
# * "force"         -> "g-req copynow -f ..."
# * any other truthy value -> "g-req copynow ..."
# * nil / false     -> "g-req -w copy ..."
#
# Both paths are interpolated into the command text as-is (no shell quoting).
#
# @param org_dir [#to_s] source directory
# @param dest_parent_dir [#to_s] directory the source is copied into
# @param now [String, Boolean, nil] copy mode selector, see above
# @return [Array<String>] a one-element array holding the command
def copy_commands(org_dir, dest_parent_dir, now=nil)
  command =
    if now == "force"
      "g-req copynow -f #{org_dir} #{dest_parent_dir}"
    elsif now
      "g-req copynow #{org_dir} #{dest_parent_dir}"
    else
      "g-req -w copy #{org_dir} #{dest_parent_dir}"
    end
  [command]
end
delete_command(target) click to toggle source
# File lib/workflow_manager/cluster.rb, line 215
# Builds the g-req command line that removes +target+.
#
# @param target [#to_s] path handed to "g-req remove" (interpolated as-is)
# @return [String] the command text
def delete_command(target)
  "g-req remove #{target}"
end
job_ends?(log_file) click to toggle source
# File lib/workflow_manager/cluster.rb, line 177
# Tells whether the end-of-script marker appears in the tail of a job log.
#
# Runs `tail -n 10` on +log_file+ and looks for the text "__SCRIPT END__" in
# its output. tail's own error output is discarded, so a missing or unreadable
# file simply yields false.
#
# @param log_file [#to_s] path of the log file to inspect
# @return [Boolean] true if one of the last 10 lines contains the marker
def job_ends?(log_file)
  # The argv form bypasses the shell: paths containing spaces or shell
  # metacharacters reach tail as one literal argument, and "--" keeps a
  # leading "-" in the path from being parsed as an option.
  tail_command = ['tail', '-n', '10', '--', log_file.to_s]
  IO.popen(tail_command, err: File::NULL) do |io|
    io.each_line.any? { |line| line.include?('__SCRIPT END__') }
  end
end
job_pending?(job_id) click to toggle source
# File lib/workflow_manager/cluster.rb, line 189
# Tells whether `qstat -u "*"` lists the given job in a state containing "qw".
#
# Each output line is split on whitespace; the first field is compared with
# +job_id+ and the fifth field is taken as the job state. Lines with fewer
# fields (blank lines, separators) never match.
#
# @param job_id [String, Integer] job id to look for; compared as text
# @return [Boolean] true if a matching line has a state containing "qw"
def job_pending?(job_id)
  wanted = job_id.to_s.strip
  IO.popen('qstat -u "*"') do |io|
    io.each_line.any? do |line|
      jobid, _prior, _name, _user, state = line.split
      # jobid/state are nil on short lines; both comparisons then fail safely.
      jobid == wanted && state.to_s.include?('qw')
    end
  end
end
job_running?(job_id) click to toggle source
# File lib/workflow_manager/cluster.rb, line 164
# Tells whether `qstat -u "*"` lists the given job with state exactly "r".
#
# Each output line is split on whitespace; the first field is compared with
# +job_id+ and the fifth field is taken as the job state. Lines with fewer
# fields (blank lines, separators) never match.
#
# @param job_id [String, Integer] job id to look for; compared as text
# @return [Boolean] true if a matching line has state "r"
def job_running?(job_id)
  wanted = job_id.to_s.strip
  IO.popen('qstat -u "*"') do |io|
    io.each_line.any? do |line|
      jobid, _prior, _name, _user, state = line.split
      # jobid/state are nil on short lines; both comparisons then fail safely.
      jobid == wanted && state == 'r'
    end
  end
end
kill_command(job_id) click to toggle source
# File lib/workflow_manager/cluster.rb, line 212
# Builds the qdel command line for +job_id+.
#
# @param job_id [#to_s] job id handed to qdel (interpolated as-is)
# @return [String] the command text
def kill_command(job_id)
  "qdel #{job_id}"
end
node_list() click to toggle source
# File lib/workflow_manager/cluster.rb, line 243
# Builds a label => host name table from live `qhost` output.
#
# Two commands are run:
# * `qhost -F scratch` - for every line whose first field contains "fgcz",
#   the next non-host line supplies a scratch figure: the text after the last
#   "=" of that line's last field, converted with to_i.
# * `qhost -q` - a host line supplies the memory total (5th field, "G"
#   removed, truncated with to_i); the first following line whose first field
#   is "GT" and whose last field contains neither "d" nor "u" supplies the
#   cores text (that last field).
#
# Hosts matching /fgcz-c-047/ are left out of the result.
#
# @return [Hash{String => String}]
#   "<host>: cores <cores>, ram <n>G, scr <scratch>" => host name
def node_list
  # Pass 1: scratch figure per host.
  scratch_by_host = {}
  pending_host = nil
  IO.popen("qhost -F scratch") do |pipe|
    pipe.each_line do |row|
      fields = row.split
      if fields.first =~ /fgcz/
        pending_host = fields.first
      else
        resource = fields.last
        amount = resource && resource.split('=').last
        next unless amount
        # Lines seen while no host is pending are filed under the nil key,
        # which is never looked up afterwards.
        scratch_by_host[pending_host] = amount.to_i
        pending_host = nil
      end
    end
  end

  # Pass 2: cores and memory per host, combined with the scratch figure.
  specs_by_host = {}
  current = nil
  IO.popen('qhost -q') do |pipe|
    pipe.each_line do |row|
      # HOSTNAME                ARCH         NCPU  LOAD  MEMTOT  MEMUSE  SWAPTO  SWAPUS
      fields = row.split
      label = fields[0]
      if label =~ /fgcz/
        ram_gb = fields[4].gsub(/G/, '').to_i
        current = { host: label, ram: "#{ram_gb}G" }
      elsif label == "GT" && current
        slots = fields.last
        # A "d" or "u" in the last field disqualifies this queue line; the
        # host stays pending for a later "GT" line.
        next if slots =~ /[du]/

        scratch = scratch_by_host[current[:host]]
        scratch_text =
          if scratch && scratch >= 1000
            format("%.1f", scratch.to_f / 1000) << "T"
          else
            # An unknown scratch figure (nil) renders as a bare "G".
            scratch.to_s + "G"
          end
        specs_by_host[current[:host]] = [slots, current[:ram], scratch_text]
        current = nil
      end
    end
  end

  # Pass 3: turn the collected specs into display labels.
  specs_by_host.each_with_object({}) do |(host, (cores, ram, scr)), nodes|
    # 20190823 masa tentatively off use f47
    next if host =~ /fgcz-c-047/

    nodes["#{host}: cores #{cores}, ram #{ram}, scr #{scr}"] = host
  end
end
submit_job(script_file, script_content, option='') click to toggle source
# File lib/workflow_manager/cluster.rb, line 147
# Submits a job script through g-sub and returns its job id.
#
# The base name of +script_file+ must contain ".sh"; anything after the first
# ".sh" is dropped to form the script name. The script is materialised with
# generate_new_job_script, and stdout/stderr log paths are placed in @log_dir
# as "<script>_o.log" / "<script>_e.log".
#
# @param script_file [String] path or name of the job script
# @param script_content [String] script text passed to generate_new_job_script
# @param option [String] extra text inserted into the g-sub command line
# @return [Array(String, String, String)] job id, stdout log path, command
# @raise [RuntimeError] if the script name does not contain ".sh", or if the
#   g-sub output does not contain "Your job <digits> ("
def submit_job(script_file, script_content, option='')
  script_name = File.basename(script_file)
  unless script_name =~ /\.sh/
    err_msg = "FGCZCluster#submit_job, ERROR: script_name is not *.sh: #{File.basename(script_file)}"
    warn err_msg
    raise err_msg
  end

  # Keep everything up to and including the first ".sh".
  script_name = script_name.partition('.sh').first + ".sh"
  new_job_script = generate_new_job_script(script_name, script_content)
  new_job_script_base = File.basename(new_job_script)
  log_file = File.join(@log_dir, new_job_script_base + "_o.log")
  err_file = File.join(@log_dir, new_job_script_base + "_e.log")
  command = "g-sub -o #{log_file} -e #{err_file} #{option} #{new_job_script}"
  output = `#{command}`

  job_id = output[/Your job (\d+) \(/, 1]
  unless job_id
    # Without this check a failed submission surfaced as NoMethodError on nil.
    err_msg = "FGCZCluster#submit_job, ERROR: no job id in g-sub output #{output.inspect} (command: #{command})"
    warn err_msg
    raise err_msg
  end
  [job_id, log_file, command]
end