artdaq_utilities  v1_05_06
SystemMetricCollector.cc
1 #include "trace.h"
2 
3 #include <chrono>
4 #include "SystemMetricCollector.hh"
5 #include "sys/sysinfo.h"
6 #include "sys/types.h"
7 #include "unistd.h"
8 
9 #define MLEVEL_PROCESS 6
10 #define MLEVEL_CPU 7
11 #define MLEVEL_RAM 8
12 #define MLEVEL_NETWORK 9
13 
14 artdaq::SystemMetricCollector::SystemMetricCollector(bool processMetrics, bool systemMetrics)
15  : lastProcessCPUTimes_(), lastProcessCPUTime_(0), sendProcessMetrics_(processMetrics), sendSystemMetrics_(systemMetrics)
16 {
17  lastCPU_ = ReadProcStat_();
18  lastProcessCPUTime_ = times(&lastProcessCPUTimes_);
19  thisNetStat_ = ReadProcNetDev_();
20  lastNetStat_ = thisNetStat_;
21 }
22 
24 {
25  auto thisCPU = ReadProcStat_();
26  auto totalUsage = thisCPU.totalUsage - lastCPU_.totalUsage;
27  auto total = thisCPU.total - lastCPU_.total;
28  lastCPU_ = thisCPU;
29  return totalUsage * 100.0 / static_cast<double>(total);
30 }
31 
33 {
34  struct tms this_times;
35  auto now = times(&this_times);
36 
37  if (now < 0)
38  {
39  return 0.0;
40  }
41  auto delta_t = now - lastProcessCPUTime_;
42  auto utime = this_times.tms_utime - lastProcessCPUTimes_.tms_utime;
43  auto stime = this_times.tms_stime - lastProcessCPUTimes_.tms_stime;
44 
45  lastProcessCPUTime_ = now;
46  lastProcessCPUTimes_ = this_times;
47 
48  return utime + stime * 100.0 / static_cast<double>(delta_t);
49 }
50 
52 {
53  struct sysinfo meminfo;
54  auto err = sysinfo(&meminfo);
55  if (err == 0)
56  {
57  return meminfo.freeram * meminfo.mem_unit;
58  }
59  return 0;
60 }
61 
63 {
64  struct sysinfo meminfo;
65  auto err = sysinfo(&meminfo);
66  if (err == 0)
67  {
68  return meminfo.bufferram * meminfo.mem_unit;
69  }
70  return 0;
71 }
72 
74 {
75  struct sysinfo meminfo;
76  auto err = sysinfo(&meminfo);
77  if (err == 0)
78  {
79  return meminfo.totalram * meminfo.mem_unit;
80  }
81  return 0;
82 }
83 
85 {
86  struct sysinfo meminfo;
87  auto err = sysinfo(&meminfo);
88  if (err == 0)
89  {
90  auto available = meminfo.freeram + (buffers ? meminfo.bufferram : 0);
91  return available * 100.0 / static_cast<double>(meminfo.totalram);
92  }
93  return 0.0;
94 }
95 
97 {
98  auto filp = fopen("/proc/self/statm", "r");
99  uint64_t mem;
100  fscanf(filp, "%*u %lu", &mem); // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
101  fclose(filp);
102  return mem * sysconf(_SC_PAGESIZE);
103 }
104 
106 {
107  auto proc = GetProcessMemUsage();
108  auto total = GetTotalRAM();
109  return proc * 100.0 / static_cast<double>(total);
110 }
111 
113 {
114  UpdateNetstat_();
115  return thisNetStat_.recv_bytes - lastNetStat_.recv_bytes;
116 }
117 
119 {
120  UpdateNetstat_();
121  return thisNetStat_.send_bytes - lastNetStat_.send_bytes;
122 }
123 
125 {
126  UpdateNetstat_();
127  return thisNetStat_.recv_errs - lastNetStat_.recv_errs;
128 }
129 
131 {
132  UpdateNetstat_();
133  return thisNetStat_.send_errs - lastNetStat_.send_errs;
134 }
135 
136 std::list<std::unique_ptr<artdaq::MetricData>> artdaq::SystemMetricCollector::SendMetrics()
137 {
138  auto start_time = std::chrono::steady_clock::now();
139  std::list<std::unique_ptr<MetricData>> output;
140  if (sendProcessMetrics_)
141  {
142  output.emplace_back(new MetricData("Process CPU Usage", GetProcessCPUUsagePercent(), "%", MLEVEL_PROCESS, MetricMode::Average, "", false));
143  output.emplace_back(new MetricData("Process RAM Usage", GetProcessMemUsage(), "B", MLEVEL_PROCESS, MetricMode::LastPoint, "", false));
144  }
145  if (sendSystemMetrics_)
146  {
147  output.emplace_back(new MetricData("System CPU Usage", GetSystemCPUUsagePercent(), "%", MLEVEL_CPU, MetricMode::Average, "", false));
148 
149  output.emplace_back(new MetricData("Free RAM", GetAvailableRAM(), "B", MLEVEL_RAM, MetricMode::LastPoint, "", false));
150  output.emplace_back(new MetricData("Total RAM", GetTotalRAM(), "B", MLEVEL_RAM, MetricMode::LastPoint, "", false));
151  output.emplace_back(new MetricData("Available RAM", GetAvailableRAMPercent(true), "%", MLEVEL_RAM, MetricMode::LastPoint, "", false));
152 
153  output.emplace_back(new MetricData("Network Receive Rate", GetNetworkReceiveBytes(), "B", MLEVEL_NETWORK, MetricMode::Rate, "", false));
154  output.emplace_back(new MetricData("Network Send Rate", GetNetworkSendBytes(), "B", MLEVEL_NETWORK, MetricMode::Rate, "", false));
155  output.emplace_back(new MetricData("Network Send Errors", GetNetworkSendErrors(), "Errors", MLEVEL_NETWORK, MetricMode::Accumulate, "", false));
156  output.emplace_back(new MetricData("Network Receive Errors", GetNetworkReceiveErrors(), "Errors", MLEVEL_NETWORK, MetricMode::Accumulate, "", false));
157  }
158 
159  TLOG(TLVL_DEBUG)
160  << "Time to collect system metrics: "
161  << std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start_time).count()
162  << " us.";
163  return output;
164 }
165 
166 artdaq::SystemMetricCollector::cpustat artdaq::SystemMetricCollector::ReadProcStat_()
167 {
168  auto filp = fopen("/proc/stat", "r");
169  cpustat this_cpu;
170 
171  fscanf(filp, "cpu %lu %lu %lu %lu %lu %lu %lu", &this_cpu.user, &this_cpu.nice, &this_cpu.system, // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
172  &this_cpu.idle, &this_cpu.iowait, &this_cpu.irq, &this_cpu.softirq);
173  fclose(filp);
174 
175  this_cpu.totalUsage =
176  this_cpu.user + this_cpu.nice + this_cpu.system + this_cpu.iowait + this_cpu.irq + this_cpu.softirq;
177  this_cpu.total = this_cpu.totalUsage + this_cpu.idle;
178 
179  return this_cpu;
180 }
181 
182 artdaq::SystemMetricCollector::netstat artdaq::SystemMetricCollector::ReadProcNetDev_()
183 {
184  auto filp = fopen("/proc/net/dev", "r");
185  char buf[200], ifname[20];
186  netstat output;
187  auto start_time = std::chrono::steady_clock::now();
188 
189  // skip first two lines
190  for (int i = 0; i < 2; i++)
191  {
192  fgets(buf, 200, filp);
193  }
194 
195  uint64_t rbytes, rerrs, rdrop, rfifo, rframe, tbytes, terrs, tdrop, tfifo, tcolls, tcarrier;
196 
197  while (fgets(buf, 200, filp) != nullptr)
198  {
199  sscanf(buf, "%[^:]: %lu %*u %lu %lu %lu %lu %*u %*u %lu %*u %lu %lu %lu %lu %lu", ifname, &rbytes, &rerrs, // NOLINT(cert-err34-c) Proc files are defined by the kernel API, and will not have unexpected values
200  &rdrop, &rfifo, &rframe, &tbytes, &terrs, &tdrop, &tfifo, &tcolls, &tcarrier);
201 
202  if (ifname[0] == 'e')
203  {
204  auto total_rerrs = rerrs + rdrop + rfifo + rframe;
205  auto total_terrs = terrs + tdrop + tfifo + tcolls + tcarrier;
206  output.recv_bytes += rbytes;
207  output.send_bytes += tbytes;
208  output.send_errs += total_terrs;
209  output.recv_errs += total_rerrs;
210  }
211  }
212  output.collectionTime = start_time;
213  fclose(filp);
214 
215  return output;
216 }
217 
218 void artdaq::SystemMetricCollector::UpdateNetstat_()
219 {
220  auto start_time = std::chrono::steady_clock::now();
221  // Only collect network stats once per second
222  if (std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(start_time - thisNetStat_.collectionTime)
223  .count() > 1.0)
224  {
225  auto output = ReadProcNetDev_();
226  lastNetStat_ = thisNetStat_;
227  thisNetStat_ = output;
228  }
229 }
uint64_t GetNetworkSendBytes()
Get the amount of data sent to the network in the last network collection interval (1...
uint64_t GetNetworkSendErrors()
Get the number of network send errors in the last network collection interval (1.0 s) ...
Report the sum of all values. Use for counters to report accurate results.
uint64_t GetNetworkReceiveBytes()
Get the amount of data received from the network in the last network collection interval (1...
uint64_t GetTotalRAM()
Get the total amount of RAM in the system
uint64_t GetProcessMemUsage()
Get the amount of RAM being used by this process
over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
std::list< std::unique_ptr< MetricData > > SendMetrics()
Send the configured metrics
double GetAvailableRAMPercent(bool buffers)
Get the percentage of available RAM
uint64_t GetAvailableRAM()
Get the amount of available RAM in the system
Small structure used to hold a metric data point before sending to the metric plugins ...
Definition: MetricData.hh:65
double GetProcessCPUUsagePercent()
Return the current amount of CPU usage for the current process, %
Report the average of all values. Use for rates to report accurate results.
SystemMetricCollector(bool processMetrics, bool systemMetrics)
SystemMetricCollector Constructor
double GetProcessMemUsagePercent()
Get the percentage of total system RAM being used by this process
double GetSystemCPUUsagePercent()
Return the current overall system CPU usage in %
uint64_t GetNetworkReceiveErrors()
Get the number of network receive errors in the last network collection interval (1.0 s)
uint64_t GetBufferedRAM()
Get the amount of RAM currently being used for cache