artdaq_utilities  v1_07_01
SystemMetricCollector.cc
1 #include "trace.h"
2 #define TRACE_NAME "SystemMetricCollector"
3 
4 #include <chrono>
5 #include <fstream>
6 #include "SystemMetricCollector.hh"
7 #include "sys/sysinfo.h"
8 #include "sys/types.h"
9 #include "unistd.h"
10 
11 #define MLEVEL_PROCESS 6
12 #define MLEVEL_CPU 7
13 #define MLEVEL_RAM 8
14 #define MLEVEL_NETWORK 9
15 
16 artdaq::SystemMetricCollector::SystemMetricCollector(bool processMetrics, bool systemMetrics)
17  : cpuCount_(GetCPUCount_())
18  , nonIdleCPUPercent_(0)
19  , userCPUPercent_(0)
20  , systemCPUPercent_(0)
21  , idleCPUPercent_(0)
22  , iowaitCPUPercent_(0)
23  , irqCPUPercent_(0)
24  , lastCPU_()
25  , lastProcessCPUTimes_()
26  , lastProcessCPUTime_(0)
27  , sendProcessMetrics_(processMetrics)
28  , sendSystemMetrics_(systemMetrics)
29 {
30  lastCPU_ = ReadProcStat_();
31  lastProcessCPUTime_ = times(&lastProcessCPUTimes_);
32  thisNetStat_ = ReadProcNetDev_();
33  lastNetStat_ = thisNetStat_;
34 }
35 
37 {
38  auto thisCPU = ReadProcStat_();
39  auto total = static_cast<double>(thisCPU.total - lastCPU_.total);
40 
41  if (total == 0)
42  {
43  nonIdleCPUPercent_ = 0;
44  userCPUPercent_ = 0;
45  systemCPUPercent_ = 0;
46  idleCPUPercent_ = 0;
47  iowaitCPUPercent_ = 0;
48  irqCPUPercent_ = 0;
49  return;
50  }
51 
52  nonIdleCPUPercent_ = (thisCPU.totalUsage - lastCPU_.totalUsage) * 100.0 * cpuCount_ / total;
53  userCPUPercent_ = (thisCPU.user + thisCPU.nice - lastCPU_.user - lastCPU_.nice) * 100.0 * cpuCount_ / total;
54  systemCPUPercent_ = (thisCPU.system - lastCPU_.system) * 100.0 * cpuCount_ / total;
55  idleCPUPercent_ = (thisCPU.idle - lastCPU_.idle) * 100.0 * cpuCount_ / total;
56  iowaitCPUPercent_ = (thisCPU.iowait - lastCPU_.iowait) * 100.0 * cpuCount_ / total;
57  irqCPUPercent_ = (thisCPU.irq + thisCPU.softirq - lastCPU_.irq - lastCPU_.softirq) * 100.0 * cpuCount_ / total;
58 
59  lastCPU_ = thisCPU;
60 }
61 
63 {
64  struct tms this_times;
65  auto now = times(&this_times);
66 
67  if (now < 0)
68  {
69  return 0.0;
70  }
71  auto delta_t = now - lastProcessCPUTime_;
72  if (delta_t == 0) return 0;
73 
74  auto utime = this_times.tms_utime - lastProcessCPUTimes_.tms_utime;
75  auto stime = this_times.tms_stime - lastProcessCPUTimes_.tms_stime;
76 
77  lastProcessCPUTime_ = now;
78  lastProcessCPUTimes_ = this_times;
79 
80  return (utime + stime) * 100.0 / static_cast<double>(delta_t);
81 }
82 
84 {
85  struct sysinfo meminfo;
86  auto err = sysinfo(&meminfo);
87  if (err == 0)
88  {
89  return meminfo.freeram * meminfo.mem_unit;
90  }
91  return 0;
92 }
93 
95 {
96  struct sysinfo meminfo;
97  auto err = sysinfo(&meminfo);
98  if (err == 0)
99  {
100  return meminfo.bufferram * meminfo.mem_unit;
101  }
102  return 0;
103 }
104 
106 {
107  struct sysinfo meminfo;
108  auto err = sysinfo(&meminfo);
109  if (err == 0)
110  {
111  return meminfo.totalram * meminfo.mem_unit;
112  }
113  return 0;
114 }
115 
117 {
118  struct sysinfo meminfo;
119  auto err = sysinfo(&meminfo);
120  if (err == 0 && meminfo.totalram > 0)
121  {
122  auto available = meminfo.freeram + (buffers ? meminfo.bufferram : 0);
123  return available * 100.0 / static_cast<double>(meminfo.totalram);
124  }
125  return 0.0;
126 }
127 
129 {
130  auto filp = fopen("/proc/self/statm", "r");
131  uint64_t mem;
132  fscanf(filp, "%*u %lu", &mem); // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
133  fclose(filp);
134  return mem * sysconf(_SC_PAGESIZE);
135 }
136 
138 {
139  auto proc = GetProcessMemUsage();
140  auto total = GetTotalRAM();
141  if (total == 0) return 0;
142  return proc * 100.0 / static_cast<double>(total);
143 }
144 
146 {
147  UpdateNetstat_();
148  return thisNetStat_.stats[ifname].recv_bytes - lastNetStat_.stats[ifname].recv_bytes;
149 }
150 
152 {
153  UpdateNetstat_();
154  return thisNetStat_.stats[ifname].send_bytes - lastNetStat_.stats[ifname].send_bytes;
155 }
156 
158 {
159  UpdateNetstat_();
160  return thisNetStat_.stats[ifname].recv_errs - lastNetStat_.stats[ifname].recv_errs;
161 }
162 
164 {
165  auto filp = fopen("/proc/net/snmp", "r");
166 #define BFSZ_ 200
167  char tcp_lbls[BFSZ_];
168  char tcp_data[BFSZ_];
169  char* bufptr = tcp_lbls;
170 
171  // find the Tcp line token
172 #define TCP_LINE_TKN_ "Tcp:"
173 #define TCP_RETRANSSEGS_TKN_ "RetransSegs"
174  uint64_t retranssegs = 0;
175  while (fgets(bufptr, BFSZ_ - 1, filp) != nullptr)
176  if (strstr(bufptr, TCP_LINE_TKN_))
177  {
178  char *tokn_name, *tokn_data, *tokn_save, *data_save;
179  fgets(tcp_data, BFSZ_ - 1, filp);
180  tokn_name = strtok_r(tcp_lbls, " ", &tokn_save);
181  tokn_data = strtok_r(tcp_data, " ", &data_save);
182  while (tokn_name != NULL && strcmp(tokn_name, TCP_RETRANSSEGS_TKN_) != 0)
183  {
184  tokn_name = strtok_r(NULL, " ", &tokn_save);
185  tokn_data = strtok_r(NULL, " ", &data_save);
186  }
187  if (tokn_name) retranssegs = strtoull(tokn_data, 0, 0);
188  break;
189  }
190  TRACE(TLVL_DEBUG + 10, "retranssegs=%lu", retranssegs);
191  fclose(filp);
192  return 0;
193 }
194 
196 {
197  UpdateNetstat_();
198  return thisNetStat_.stats[ifname].send_errs - lastNetStat_.stats[ifname].send_errs;
199 }
200 
202 {
203  std::list<std::string> output;
204  for (auto& i : thisNetStat_.stats)
205  {
206  output.push_back(i.first);
207  }
208  return output;
209 }
210 
211 std::list<std::unique_ptr<artdaq::MetricData>> artdaq::SystemMetricCollector::SendMetrics()
212 {
213  auto start_time = std::chrono::steady_clock::now();
214  std::list<std::unique_ptr<MetricData>> output;
215  if (sendProcessMetrics_)
216  {
217  output.emplace_back(new MetricData("Process CPU Usage", GetProcessCPUUsagePercent(), "%", MLEVEL_PROCESS, MetricMode::Average, "", false));
218  output.emplace_back(new MetricData("Process RAM Usage", GetProcessMemUsage(), "B", MLEVEL_PROCESS, MetricMode::LastPoint, "", false));
219  }
220  if (sendSystemMetrics_)
221  {
222  GetSystemCPUUsage();
223  output.emplace_back(new MetricData("System CPU Usage", nonIdleCPUPercent_, "%", MLEVEL_CPU, MetricMode::Average, "", false));
224  output.emplace_back(new MetricData("System CPU User", userCPUPercent_, "%", MLEVEL_CPU, MetricMode::Average, "", false));
225  output.emplace_back(new MetricData("System CPU System", systemCPUPercent_, "%", MLEVEL_CPU, MetricMode::Average, "", false));
226  output.emplace_back(new MetricData("System CPU Idle", idleCPUPercent_, "%", MLEVEL_CPU, MetricMode::Average, "", false));
227  output.emplace_back(new MetricData("System CPU IOWait", iowaitCPUPercent_, "%", MLEVEL_CPU, MetricMode::Average, "", false));
228  output.emplace_back(new MetricData("System CPU IRQ", irqCPUPercent_, "%", MLEVEL_CPU, MetricMode::Average, "", false));
229 
230  output.emplace_back(new MetricData("Free RAM", GetAvailableRAM(), "B", MLEVEL_RAM, MetricMode::LastPoint, "", false));
231  output.emplace_back(new MetricData("Total RAM", GetTotalRAM(), "B", MLEVEL_RAM, MetricMode::LastPoint, "", false));
232  output.emplace_back(new MetricData("Available RAM", GetAvailableRAMPercent(true), "%", MLEVEL_RAM, MetricMode::LastPoint, "", false));
233 
234  for (auto& ifname : GetNetworkInterfaceNames())
235  {
236  output.emplace_back(new MetricData(ifname + " Network Receive Rate", GetNetworkReceiveBytes(ifname), "B", MLEVEL_NETWORK, MetricMode::Rate, "", false));
237  output.emplace_back(new MetricData(ifname + " Network Send Rate", GetNetworkSendBytes(ifname), "B", MLEVEL_NETWORK, MetricMode::Rate, "", false));
238  output.emplace_back(new MetricData(ifname + " Network Send Errors", GetNetworkSendErrors(ifname), "Errors", MLEVEL_NETWORK, MetricMode::Accumulate, "", false));
239  output.emplace_back(new MetricData(ifname + " Network Receive Errors", GetNetworkReceiveErrors(ifname), "Errors", MLEVEL_NETWORK, MetricMode::Accumulate, "", false));
240  }
241  output.emplace_back(new MetricData("Network TCP RetransSegs", GetNetworkTCPRetransSegs(), "Segs", MLEVEL_NETWORK, MetricMode::Rate, "", false));
242  }
243 
244  TLOG(10)
245  << "Time to collect system metrics: "
246  << std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start_time).count()
247  << " us.";
248  return output;
249 }
250 
251 artdaq::SystemMetricCollector::cpustat artdaq::SystemMetricCollector::ReadProcStat_()
252 {
253  auto filp = fopen("/proc/stat", "r");
254  cpustat this_cpu;
255 
256  fscanf(filp, "cpu %lu %lu %lu %lu %lu %lu %lu", &this_cpu.user, &this_cpu.nice, &this_cpu.system, // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
257  &this_cpu.idle, &this_cpu.iowait, &this_cpu.irq, &this_cpu.softirq);
258  fclose(filp);
259 
260  // Reset iowait if it decreases
261  if (this_cpu.iowait < lastCPU_.iowait)
262  {
263  auto diff = lastCPU_.iowait - this_cpu.iowait;
264  lastCPU_.iowait = this_cpu.iowait;
265  lastCPU_.total -= diff;
266  lastCPU_.totalUsage -= diff;
267  }
268 
269  this_cpu.totalUsage =
270  this_cpu.user + this_cpu.nice + this_cpu.system + this_cpu.iowait + this_cpu.irq + this_cpu.softirq;
271  this_cpu.total = this_cpu.totalUsage + this_cpu.idle;
272 
273  return this_cpu;
274 }
275 
276 size_t artdaq::SystemMetricCollector::GetCPUCount_()
277 {
278  size_t count = 0;
279  std::ifstream file("/proc/stat");
280  std::string line;
281  bool first = true;
282  while (std::getline(file, line))
283  {
284  if (first)
285  {
286  first = false;
287  continue;
288  }
289  if (line.find("cpu") == 0)
290  {
291  count++;
292  }
293  else
294  {
295  break;
296  }
297  }
298  return count;
299 }
300 
301 artdaq::SystemMetricCollector::netstats artdaq::SystemMetricCollector::ReadProcNetDev_()
302 {
303  auto filp = fopen("/proc/net/dev", "r");
304  char buf[200], ifname_c[20];
305  auto start_time = std::chrono::steady_clock::now();
306  netstats output;
307 
308  // skip first two lines
309  for (int i = 0; i < 2; i++)
310  {
311  fgets(buf, 200, filp);
312  }
313 
314  uint64_t rbytes, rerrs, rdrop, rfifo, rframe, tbytes, terrs, tdrop, tfifo, tcolls, tcarrier;
315 
316  while (fgets(buf, 200, filp) != nullptr)
317  {
318  sscanf(buf, " %[^:]: %lu %*u %lu %lu %lu %lu %*u %*u %lu %*u %lu %lu %lu %lu %lu", ifname_c, &rbytes, &rerrs, // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
319  &rdrop, &rfifo, &rframe, &tbytes, &terrs, &tdrop, &tfifo, &tcolls, &tcarrier);
320 
321  std::string ifname(ifname_c);
322  netstat stat;
323 
324  auto total_rerrs = rerrs + rdrop + rfifo + rframe;
325  auto total_terrs = terrs + tdrop + tfifo + tcolls + tcarrier;
326  stat.recv_bytes = rbytes;
327  stat.send_bytes = tbytes;
328  stat.send_errs = total_terrs;
329  stat.recv_errs = total_rerrs;
330 
331  output.stats[ifname] = stat;
332  }
333  output.collectionTime = start_time;
334  fclose(filp);
335 
336  return output;
337 }
338 
339 void artdaq::SystemMetricCollector::UpdateNetstat_()
340 {
341  auto start_time = std::chrono::steady_clock::now();
342  // Only collect network stats once per second
343  if (std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(start_time - thisNetStat_.collectionTime)
344  .count() > 1.0)
345  {
346  auto output = ReadProcNetDev_();
347  lastNetStat_ = thisNetStat_;
348  thisNetStat_ = output;
349  }
350 }
uint64_t GetNetworkTCPRetransSegs()
Return the current total number of TCP segments retransmitted
Report the sum of all values. Use for counters to report accurate results.
uint64_t GetTotalRAM()
Get the total amount of RAM in the system
uint64_t GetNetworkReceiveErrors(std::string ifname)
Get the number of network receive errors in the last network collection interval (1.0 s)
uint64_t GetNetworkSendBytes(std::string ifname)
Get the amount of data sent to the network in the last network collection interval (1...
uint64_t GetProcessMemUsage()
Get the amount of RAM being used by this process
over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
std::list< std::unique_ptr< MetricData > > SendMetrics()
Send the configured metrics
uint64_t GetNetworkSendErrors(std::string ifname)
Get the number of network send errors in the last network collection interval (1.0 s) ...
double GetAvailableRAMPercent(bool buffers)
Get the percentage of available RAM
std::list< std::string > GetNetworkInterfaceNames()
Get the names of the local network interfaces.
void GetSystemCPUUsage()
Calculate the system CPU usage percentages
uint64_t GetAvailableRAM()
Get the amount of available RAM in the system
Small structure used to hold a metric data point before sending to the metric plugins ...
Definition: MetricData.hh:65
double GetProcessCPUUsagePercent()
Return the current amount of CPU usage for the current process, %
Report the average of all values. Use for rates to report accurate results.
SystemMetricCollector(bool processMetrics, bool systemMetrics)
SystemMetricCollector Constructor
double GetProcessMemUsagePercent()
Get the percentage of total RAM being used by this process
uint64_t GetNetworkReceiveBytes(std::string ifname)
Get the amount of data received from the network in the last network collection interval (1...
uint64_t GetBufferedRAM()
Get the amount of RAM currently being used for buffers