artdaq_utilities  v1_05_10
SystemMetricCollector.cc
1 #include "trace.h"
2 #define TRACE_NAME "SystemMetricCollector"
3 
4 #include <chrono>
5 #include "SystemMetricCollector.hh"
6 #include "sys/sysinfo.h"
7 #include "sys/types.h"
8 #include "unistd.h"
9 
10 #define MLEVEL_PROCESS 6
11 #define MLEVEL_CPU 7
12 #define MLEVEL_RAM 8
13 #define MLEVEL_NETWORK 9
14 
15 artdaq::SystemMetricCollector::SystemMetricCollector(bool processMetrics, bool systemMetrics)
16  : lastProcessCPUTimes_(), lastProcessCPUTime_(0), sendProcessMetrics_(processMetrics), sendSystemMetrics_(systemMetrics)
17 {
18  lastCPU_ = ReadProcStat_();
19  lastProcessCPUTime_ = times(&lastProcessCPUTimes_);
20  thisNetStat_ = ReadProcNetDev_();
21  lastNetStat_ = thisNetStat_;
22 }
23 
25 {
26  auto thisCPU = ReadProcStat_();
27  auto totalUsage = thisCPU.totalUsage - lastCPU_.totalUsage;
28  auto total = thisCPU.total - lastCPU_.total;
29  lastCPU_ = thisCPU;
30  return totalUsage * 100.0 / static_cast<double>(total);
31 }
32 
34 {
35  struct tms this_times;
36  auto now = times(&this_times);
37 
38  if (now < 0)
39  {
40  return 0.0;
41  }
42  auto delta_t = now - lastProcessCPUTime_;
43  auto utime = this_times.tms_utime - lastProcessCPUTimes_.tms_utime;
44  auto stime = this_times.tms_stime - lastProcessCPUTimes_.tms_stime;
45 
46  lastProcessCPUTime_ = now;
47  lastProcessCPUTimes_ = this_times;
48 
49  return utime + stime * 100.0 / static_cast<double>(delta_t);
50 }
51 
53 {
54  struct sysinfo meminfo;
55  auto err = sysinfo(&meminfo);
56  if (err == 0)
57  {
58  return meminfo.freeram * meminfo.mem_unit;
59  }
60  return 0;
61 }
62 
64 {
65  struct sysinfo meminfo;
66  auto err = sysinfo(&meminfo);
67  if (err == 0)
68  {
69  return meminfo.bufferram * meminfo.mem_unit;
70  }
71  return 0;
72 }
73 
75 {
76  struct sysinfo meminfo;
77  auto err = sysinfo(&meminfo);
78  if (err == 0)
79  {
80  return meminfo.totalram * meminfo.mem_unit;
81  }
82  return 0;
83 }
84 
86 {
87  struct sysinfo meminfo;
88  auto err = sysinfo(&meminfo);
89  if (err == 0)
90  {
91  auto available = meminfo.freeram + (buffers ? meminfo.bufferram : 0);
92  return available * 100.0 / static_cast<double>(meminfo.totalram);
93  }
94  return 0.0;
95 }
96 
98 {
99  auto filp = fopen("/proc/self/statm", "r");
100  uint64_t mem;
101  fscanf(filp, "%*u %lu", &mem); // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
102  fclose(filp);
103  return mem * sysconf(_SC_PAGESIZE);
104 }
105 
107 {
108  auto proc = GetProcessMemUsage();
109  auto total = GetTotalRAM();
110  return proc * 100.0 / static_cast<double>(total);
111 }
112 
114 {
115  UpdateNetstat_();
116  return thisNetStat_.recv_bytes - lastNetStat_.recv_bytes;
117 }
118 
120 {
121  UpdateNetstat_();
122  return thisNetStat_.send_bytes - lastNetStat_.send_bytes;
123 }
124 
126 {
127  UpdateNetstat_();
128  return thisNetStat_.recv_errs - lastNetStat_.recv_errs;
129 }
130 
132 {
133  UpdateNetstat_();
134  return thisNetStat_.send_errs - lastNetStat_.send_errs;
135 }
136 
137 std::list<std::unique_ptr<artdaq::MetricData>> artdaq::SystemMetricCollector::SendMetrics()
138 {
139  auto start_time = std::chrono::steady_clock::now();
140  std::list<std::unique_ptr<MetricData>> output;
141  if (sendProcessMetrics_)
142  {
143  output.emplace_back(new MetricData("Process CPU Usage", GetProcessCPUUsagePercent(), "%", MLEVEL_PROCESS, MetricMode::Average, "", false));
144  output.emplace_back(new MetricData("Process RAM Usage", GetProcessMemUsage(), "B", MLEVEL_PROCESS, MetricMode::LastPoint, "", false));
145  }
146  if (sendSystemMetrics_)
147  {
148  output.emplace_back(new MetricData("System CPU Usage", GetSystemCPUUsagePercent(), "%", MLEVEL_CPU, MetricMode::Average, "", false));
149 
150  output.emplace_back(new MetricData("Free RAM", GetAvailableRAM(), "B", MLEVEL_RAM, MetricMode::LastPoint, "", false));
151  output.emplace_back(new MetricData("Total RAM", GetTotalRAM(), "B", MLEVEL_RAM, MetricMode::LastPoint, "", false));
152  output.emplace_back(new MetricData("Available RAM", GetAvailableRAMPercent(true), "%", MLEVEL_RAM, MetricMode::LastPoint, "", false));
153 
154  output.emplace_back(new MetricData("Network Receive Rate", GetNetworkReceiveBytes(), "B", MLEVEL_NETWORK, MetricMode::Rate, "", false));
155  output.emplace_back(new MetricData("Network Send Rate", GetNetworkSendBytes(), "B", MLEVEL_NETWORK, MetricMode::Rate, "", false));
156  output.emplace_back(new MetricData("Network Send Errors", GetNetworkSendErrors(), "Errors", MLEVEL_NETWORK, MetricMode::Accumulate, "", false));
157  output.emplace_back(new MetricData("Network Receive Errors", GetNetworkReceiveErrors(), "Errors", MLEVEL_NETWORK, MetricMode::Accumulate, "", false));
158  }
159 
160  TLOG(10)
161  << "Time to collect system metrics: "
162  << std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start_time).count()
163  << " us.";
164  return output;
165 }
166 
167 artdaq::SystemMetricCollector::cpustat artdaq::SystemMetricCollector::ReadProcStat_()
168 {
169  auto filp = fopen("/proc/stat", "r");
170  cpustat this_cpu;
171 
172  fscanf(filp, "cpu %lu %lu %lu %lu %lu %lu %lu", &this_cpu.user, &this_cpu.nice, &this_cpu.system, // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
173  &this_cpu.idle, &this_cpu.iowait, &this_cpu.irq, &this_cpu.softirq);
174  fclose(filp);
175 
176  this_cpu.totalUsage =
177  this_cpu.user + this_cpu.nice + this_cpu.system + this_cpu.iowait + this_cpu.irq + this_cpu.softirq;
178  this_cpu.total = this_cpu.totalUsage + this_cpu.idle;
179 
180  return this_cpu;
181 }
182 
183 artdaq::SystemMetricCollector::netstat artdaq::SystemMetricCollector::ReadProcNetDev_()
184 {
185  auto filp = fopen("/proc/net/dev", "r");
186  char buf[200], ifname[20];
187  netstat output;
188  auto start_time = std::chrono::steady_clock::now();
189 
190  // skip first two lines
191  for (int i = 0; i < 2; i++)
192  {
193  fgets(buf, 200, filp);
194  }
195 
196  uint64_t rbytes, rerrs, rdrop, rfifo, rframe, tbytes, terrs, tdrop, tfifo, tcolls, tcarrier;
197 
198  while (fgets(buf, 200, filp) != nullptr)
199  {
200  sscanf(buf, "%[^:]: %lu %*u %lu %lu %lu %lu %*u %*u %lu %*u %lu %lu %lu %lu %lu", ifname, &rbytes, &rerrs, // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
201  &rdrop, &rfifo, &rframe, &tbytes, &terrs, &tdrop, &tfifo, &tcolls, &tcarrier);
202 
203  if (ifname[0] == 'e')
204  {
205  auto total_rerrs = rerrs + rdrop + rfifo + rframe;
206  auto total_terrs = terrs + tdrop + tfifo + tcolls + tcarrier;
207  output.recv_bytes += rbytes;
208  output.send_bytes += tbytes;
209  output.send_errs += total_terrs;
210  output.recv_errs += total_rerrs;
211  }
212  }
213  output.collectionTime = start_time;
214  fclose(filp);
215 
216  return output;
217 }
218 
219 void artdaq::SystemMetricCollector::UpdateNetstat_()
220 {
221  auto start_time = std::chrono::steady_clock::now();
222  // Only collect network stats once per second
223  if (std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(start_time - thisNetStat_.collectionTime)
224  .count() > 1.0)
225  {
226  auto output = ReadProcNetDev_();
227  lastNetStat_ = thisNetStat_;
228  thisNetStat_ = output;
229  }
230 }
uint64_t GetNetworkSendBytes()
Get the amount of data sent to the network in the last network collection interval (1.0 s)
uint64_t GetNetworkSendErrors()
Get the number of network send errors in the last network collection interval (1.0 s) ...
Report the sum of all values. Use for counters to report accurate results.
uint64_t GetNetworkReceiveBytes()
Get the amount of data received from the network in the last network collection interval (1.0 s)
uint64_t GetTotalRAM()
Get the total amount of RAM in the system
uint64_t GetProcessMemUsage()
Get the amount of RAM being used by this process
over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
std::list< std::unique_ptr< MetricData > > SendMetrics()
Send the configured metrics
double GetAvailableRAMPercent(bool buffers)
Get the percentage of available RAM
uint64_t GetAvailableRAM()
Get the amount of available RAM in the system
Small structure used to hold a metric data point before sending to the metric plugins ...
Definition: MetricData.hh:65
double GetProcessCPUUsagePercent()
Return the current amount of CPU usage for the current process, %
Report the average of all values. Use for rates to report accurate results.
SystemMetricCollector(bool processMetrics, bool systemMetrics)
SystemMetricCollector Constructor
double GetProcessMemUsagePercent()
Get the percentage of total system RAM being used by this process
double GetSystemCPUUsagePercent()
Return the current overall system CPU usage in %
uint64_t GetNetworkReceiveErrors()
Get the number of network receive errors in the last network collection interval (1.0 s)
uint64_t GetBufferedRAM()
Get the amount of RAM currently being used for cache