obmon  1.3.1
 All Classes Functions Variables Typedefs Enumerations Groups Pages
ObSensorGpu.cpp
1 #include <ObSensorGpu.h>
2 
3 ObSensorGpu::ObSensorGpu(std::string name) : ObSensor(name) {
7 }
12  nvmlShutdown();
13 }
14 
19 
20  if (type() == SensorType::VALUE || type() == SensorType::SPEED) {
21  _logger->info("Searching for GPU ...");
22 
23  if (nvmlInit() == NVML_SUCCESS) {
24  nvmlDeviceGetCount(&_count);
25  _logger->info("Found {} GPU cards", _count);
26 
27  nvmlDevice_t dev;
28  devStat_t stat;
29  char *devName = static_cast<char *>(malloc(64 * sizeof(char)));
30 
31  for (unsigned int i = 0; i < _count; i++) {
32  nvmlDeviceGetHandleByIndex(i, &dev);
33  nvmlDeviceGetName(dev, devName, 64);
34  _logger->debug("GPU {} : {}", i, devName);
35  stat.name = devName;
36  fDevices.push_back(dev);
37  fStatus.push_back(stat);
38  }
39 
40  free(devName);
41 
42  fComputeProc.resize(_count);
43  fGraphicsProc.resize(_count);
44  } else {
45  _logger->warn("No GPU cards found");
46  enabled(false);
47  return false;
48  }
49  } else {
50  _first = new ObSensorGpu("value");
51  _first->type(SensorType::VALUE);
52  _second = new ObSensorGpu("value");
53  _second->type(SensorType::VALUE);
54  _change = new ObSensorGpu("change");
55  _change->type(SensorType::IGNORE);
56  bool rc = _first->init();
57  rc &= _second->init();
58  enabled(rc);
59  }
60 
61  return true;
62 }
63 
68 
69  _logger->trace("ObSensorGpu::Doing process '{}' with count {} ...", _name,
70  _count);
71 
72  for (unsigned int i = 0; i < _count; i++) {
73  _logger->trace("ObSensorGpu::Getting info from GPU {}", i);
74  nvmlDeviceGetUtilizationRates(fDevices[i], &fStatus[i].util);
75  nvmlDeviceGetMemoryInfo(fDevices[i], &fStatus[i].mem);
76 
77  unsigned int count = 0;
78  nvmlDeviceGetComputeRunningProcesses(fDevices[i], &count, nullptr);
79  fComputeProc[i].resize(count);
80  nvmlDeviceGetComputeRunningProcesses(fDevices[i], &count,
81  fComputeProc[i].data());
82 
83  nvmlDeviceGetGraphicsRunningProcesses(fDevices[i], &count, nullptr);
84  fGraphicsProc[i].resize(count);
85  nvmlDeviceGetGraphicsRunningProcesses(fDevices[i], &count,
86  fGraphicsProc[i].data());
87  }
88 }
89 
90 void ObSensorGpu::speed(ObSensor * /*s1*/, ObSensor * /*s2*/,
91  unsigned int /*timeout*/) {
95 
96  return;
97 }
98 
99 std::string ObSensorGpu::json(const std::string name) const {
103 
104  if (type() == SensorType::SPEED || !_count)
105  return "";
106 
107  using namespace fmt::literals; // Using for _a suffix
108 
109  std::string json = "";
110  _logger->trace("ObSensorGpu::JSON name={} count={}...", name, _count);
111 
112  // start ROOT
113  // ROOT starts here ---------vv ARRAY -v
114  json += fmt::format(R"("{}": {{ "gpus" : [)", name);
115 
116  for (unsigned int i = 0; i < _count; i++) {
117  auto &stat = fStatus[i];
118  // ---------- BEGIN device JSON object ----------
119  json += "{";
120 
121  json += fmt::format(
122  R"("id": {id}, "name": "{name}",)"
123  R"("load": {load},)"
124  R"("mem": {{)"
125  R"("free": {{ "value": {freeV}, "alpha": {freeA:.2f} }},)"
126  R"("used": {{ "value": {usedV}, "alpha": {usedA:.2f} }},)"
127  R"("total": {{ "value": {totalV}, "alpha": 1.0 }})"
128  R"(}},)",
129 
130  // ----- Variables ----- //
131  "id"_a = i, "name"_a = stat.name, "load"_a = stat.util.gpu,
132  // Memory status
133  "freeV"_a = stat.mem.free,
134  "freeA"_a = (static_cast<double>(stat.mem.free) / stat.mem.total),
135  "usedV"_a = stat.mem.used,
136  "usedA"_a = (static_cast<double>(stat.mem.used) / stat.mem.total),
137  "totalV"_a = stat.mem.total //, "totalA"_a = 1.0f
138  // ----- Clang-format force newline ----- //
139  );
140 
141  auto &compute = fComputeProc[i];
142  auto &graphics = fGraphicsProc[i];
143 
144  // ---------- BEGIN processes JSON array ----------
145  json += R"("pids": [)";
146  if (compute.size() + graphics.size() > 0) {
147 
148  // JSON-ify compute processes
149  for (unsigned int j = 0; j < compute.size(); j++) {
150  auto &proc = compute[j];
151  json +=
152  fmt::format(R"({{)"
153  R"("type": "proc", "pid": {pid},)"
154  R"("user": "{user}", "mem": {mem})"
155  R"(}},)",
156  // ----- Variables ----- //
157  "pid"_a = proc.pid, "user"_a = getProcUser(proc.pid),
158  "mem"_a = proc.usedGpuMemory
159  // ----- Clang-format force newline ----- //
160  );
161  }
162 
163  if (json.back() == ',')
164  json.pop_back();
165 
166  // JSON-ify graphical processes
167  for (unsigned int j = 0; j < fGraphicsProc[i].size(); j++) {
168  auto &proc = graphics[j];
169  json +=
170  fmt::format(R"({{)"
171  R"("type": "graphics", "pid": {pid},)"
172  R"("user": "{user}", "mem": {mem})"
173  R"(}},)",
174  // ----- Variables ----- //
175  "pid"_a = proc.pid, "user"_a = getProcUser(proc.pid),
176  "mem"_a = proc.usedGpuMemory
177  // ----- Clang-format force newline ----- //
178  );
179  }
180  if (json.back() == ',')
181  json.pop_back();
182  }
183  json += "]" // ---------- END processes JSON array ----------
184  "}" // ---------- END device JSON object ----------
185  ",";
186  }
187  if (json.back() == ',')
188  json.pop_back();
189 
190  // end ROOT
191  json += "]}";
192  return json;
193 }
194 
195 std::string ObSensorGpu::getProcUser(unsigned int pid) const {
199 
200  redi::ipstream in("ps -p" + std::to_string(pid) + " -ouser=");
201  std::string tmp;
202  std::getline(in, tmp);
203  in.close();
204  return tmp;
205 }
ObSensorGpu(std::string _name={"gpu"})
Definition: ObSensorGpu.cpp:3
std::shared_ptr< spdlog::logger > _logger
Pointer to the spdlog logger.
Definition: ObSensor.h:56
std::vector< std::vector< nvmlProcessInfo_t > > fComputeProc
Compute processes (ex. CUDA Applications)
Definition: ObSensorGpu.h:53
void process() override
Process function.
Definition: ObSensorGpu.cpp:64
Structure for device stat simplification.
Definition: ObSensorGpu.h:37
bool init() override
Definition: ObSensorGpu.cpp:15
ObSensor * _first
Pointer to first sensor.
Definition: ObSensor.h:60
virtual bool init()
Definition: ObSensor.cpp:16
ObSensor * _change
Pointer to change sensor.
Definition: ObSensor.h:62
std::string getProcUser(unsigned int) const
ObSensor * _second
Pointer to second sensor.
Definition: ObSensor.h:61
std::string name
Device name reported by NVML.
Definition: ObSensorGpu.h:38
std::string json(const std::string name={"static"}) const override
Definition: ObSensorGpu.cpp:99
unsigned int _count
Count of NVIDIA devices.
Definition: ObSensorGpu.h:44
void speed(ObSensor *s1, ObSensor *s2, unsigned int timeout=1000) override
Calculate time change (speed)
Definition: ObSensorGpu.cpp:90
Base Obmon sensor class.
Definition: ObSensor.h:19
bool enabled() const
Returns flag if sensor is enabled.
Definition: ObSensor.h:50
virtual ~ObSensorGpu() override
Definition: ObSensorGpu.cpp:8
std::vector< nvmlDevice_t > fDevices
NVML device handles.
Definition: ObSensorGpu.h:47
SensorType type() const
Returns sensor type.
Definition: ObSensor.h:44
std::vector< devStat_t > fStatus
Device statistics.
Definition: ObSensorGpu.h:50
std::string _name
Sensor name.
Definition: ObSensor.h:57
void type(SensorType t)
Sets sensor type.
Definition: ObSensor.h:41
std::vector< std::vector< nvmlProcessInfo_t > > fGraphicsProc
Graphics processes (ex. desktop env)
Definition: ObSensorGpu.h:56