artdaq_utilities  v1_04_09
MetricManager.cc
1 // MetricManager.cc: MetricManager class implementation file
2 // Author: Eric Flumerfelt
3 // Last Modified: 11/14/2014
4 //
5 // MetricManager loads a user-specified set of plugins, sends them their configuration,
6 // and sends them data as it is received. It also maintains the state of the plugins
7 // relative to the application state.
8 
9 #define TRACE_NAME "MetricManager"
10 #include "tracemf.h"
11 #include "artdaq-utilities/Plugins/MetricManager.hh"
12 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
13 #include "fhiclcpp/ParameterSet.h"
14 
15 #include <chrono>
16 #include <boost/exception/all.hpp>
17 
19 MetricManager() : metric_plugins_(0)
20 , metric_send_interval_ms_(15000)
21 , initialized_(false)
22 , running_(false)
23 , active_(false)
24 , missed_metric_calls_(0)
25 , metric_queue_max_size_(1000)
26 , metric_queue_notify_size_(10)
27 {}
28 
30 {
31  shutdown();
32 }
33 
34 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string const& prefix)
35 {
36  prefix_ = prefix;
37  if (initialized_)
38  {
39  shutdown();
40  }
41  TLOG(TLVL_INFO) << "Configuring metrics with parameter set: " << pset.to_string() ;
42 
43  std::vector<std::string> names = pset.get_pset_names();
44 
45  metric_plugins_.clear();
46 
47  for (auto name : names)
48  {
49  if (name == "metric_queue_size")
50  {
51  metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
52  }
53  else if (name == "metric_queue_notify_size")
54  {
55  metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
56  }
57  else if (name == "metric_send_maximum_delay_ms")
58  {
59  metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
60  }
61  else
62  {
63  try
64  {
65  TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
66  fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
67  metric_plugins_.push_back(makeMetricPlugin(
68  plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
69  }
70  catch (const cet::exception& e)
71  {
72  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
73  ", cet::exception object caught:" << e.explain_self() ;
74  }
75  catch (const boost::exception& e)
76  {
77  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
78  ", boost::exception object caught: " << boost::diagnostic_information(e) ;
79  }
80  catch (const std::exception& e)
81  {
82  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
83  ", std::exception caught: " << e.what() ;
84  }
85  catch (...)
86  {
87  TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
88  }
89  }
90  }
91 
92  initialized_ = true;
93 }
94 
96 {
97  auto lk = std::unique_lock<std::mutex>(metric_mutex_);
98  if (!running_)
99  {
100  TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
101  for (auto& metric : metric_plugins_)
102  {
103  if (!metric) continue;
104  try
105  {
106  metric->startMetrics();
107  TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
108  active_ = true;
109  }
110  catch (...)
111  {
112  TLOG(TLVL_ERROR) <<
113  "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
114  metric->getLibName() ;
115  }
116  }
117  running_ = true;
118  startMetricLoop_();
119  }
120 }
121 
123 {
124  auto lk = std::unique_lock<std::mutex>(metric_mutex_);
125  TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
126  running_ = false;
127  metric_cv_.notify_all();
128  TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
129  lk.unlock();
130  if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
131  TLOG(TLVL_DEBUG) << "do_stop Complete" ;
132 }
133 
134 void artdaq::MetricManager::do_pause() { /*do_stop();*/ }
135 void artdaq::MetricManager::do_resume() { /*do_start();*/ }
136 
137 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string const& prefix)
138 {
139  shutdown();
140  initialize(pset, prefix);
141 }
142 
144 {
145  TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
146  do_stop();
147 
148  auto lk = std::unique_lock<std::mutex>(metric_mutex_);
149  if (initialized_)
150  {
151  for (auto& i : metric_plugins_)
152  {
153  try
154  {
155  std::string name = i->getLibName();
156  i.reset(nullptr);
157  TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
158  }
159  catch (...)
160  {
161  TLOG(TLVL_ERROR) <<
162  "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
163  i->getLibName() ;
164  }
165  }
166  metric_plugins_.clear();
167  initialized_ = false;
168  }
169 }
170 
171 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
172 {
173  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
174  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
175  else if (active_)
176  {
177  {
178  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
179  if (!metric_queue_.count(name)) {
180  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
181  }
182  }
183  auto entry = &(metric_queue_[name]);
184 
185  auto size = entry->first.load();
186  if (size < metric_queue_max_size_)
187  {
188  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
189  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
190  {
191  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
192  entry->first++;
193  entry->second.emplace_back(std::move(metric));
194  }
195  }
196  else
197  {
198  TLOG(10) << "Rejecting metric because queue full" ;
199  missed_metric_calls_++;
200  }
201  metric_cv_.notify_all();
202  }
203 }
204 
205 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
206 {
207  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
208  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
209  else if (active_)
210  {
211  {
212  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
213  if (!metric_queue_.count(name)) {
214  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
215  }
216  }
217  auto entry = &(metric_queue_[name]);
218 
219  auto size = entry->first.load();
220  if (size < metric_queue_max_size_)
221  {
222  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
223  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
224  {
225  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
226  entry->first++;
227  entry->second.emplace_back(std::move(metric));
228  }
229  }
230  else
231  {
232  TLOG(10) << "Rejecting metric because queue full" ;
233  missed_metric_calls_++;
234  }
235  metric_cv_.notify_all();
236  }
237 }
238 
239 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
240 {
241  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
242  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
243  else if (active_)
244  {
245  {
246  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
247  if (!metric_queue_.count(name)) {
248  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
249  }
250  }
251  auto entry = &(metric_queue_[name]);
252 
253  auto size = entry->first.load();
254  if (size < metric_queue_max_size_)
255  {
256  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
257  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
258  {
259  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
260  entry->first++;
261  entry->second.emplace_back(std::move(metric));
262  }
263  }
264  else
265  {
266  TLOG(10) << "Rejecting metric because queue full" ;
267  missed_metric_calls_++;
268  }
269  metric_cv_.notify_all();
270  }
271 }
272 
273 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
274 {
275  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
276  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
277  else if (active_)
278  {
279  {
280  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
281  if (!metric_queue_.count(name)) {
282  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
283  }
284  }
285  auto entry = &(metric_queue_[name]);
286 
287  auto size = entry->first.load();
288  if (size < metric_queue_max_size_)
289  {
290  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
291  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
292  {
293  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
294  entry->first++;
295  entry->second.emplace_back(std::move(metric));
296  }
297  }
298  else
299  {
300  TLOG(10) << "Rejecting metric because queue full" ;
301  missed_metric_calls_++;
302  }
303  metric_cv_.notify_all();
304  }
305 }
306 
307 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
308 {
309  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
310  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
311  else if (active_)
312  {
313  {
314  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
315  if (!metric_queue_.count(name)) {
316  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
317  }
318  }
319  auto entry = &(metric_queue_[name]);
320 
321  auto size = entry->first.load();
322  if (size < metric_queue_max_size_)
323  {
324  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
325  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
326  {
327  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
328  entry->first++;
329  entry->second.emplace_back(std::move(metric));
330  }
331  }
332  else
333  {
334  TLOG(10) << "Rejecting metric because queue full" ;
335  missed_metric_calls_++;
336  }
337  metric_cv_.notify_all();
338  }
339 }
340 
341 void artdaq::MetricManager::startMetricLoop_()
342 {
343  if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
344  boost::thread::attributes attrs;
345  attrs.set_stack_size(4096 * 2000); // 8000 KB
346  TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
347  try {
348  metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
349  }
350  catch (const boost::exception& e)
351  {
352  TLOG(TLVL_ERROR) << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
353  std::cerr << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
354  exit(5);
355  }
356  TLOG(TLVL_INFO) << "Metric Sending thread started";
357 }
358 
360 {
361  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
362  for (auto& q : metric_queue_)
363  {
364  if (q.second.first != 0) return false;
365  }
366  return true;
367 }
368 
369 size_t artdaq::MetricManager::metricQueueSize(std::string const& name)
370 {
371  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
372  size_t size = 0;
373  if (name == "") {
374  for (auto& q : metric_queue_)
375  {
376  size += q.second.first;
377  }
378  }
379  else {
380  if (metric_queue_.count(name)) size = metric_queue_[name].first;
381  }
382 
383  return size;
384 }
385 
386 void artdaq::MetricManager::sendMetricLoop_()
387 {
388  TLOG(TLVL_INFO) << "sendMetricLoop_ START";
389  auto last_send_time = std::chrono::steady_clock::time_point();
390  while (running_)
391  {
392  while (metricQueueEmpty() && running_)
393  {
394  std::unique_lock<std::mutex> lk(metric_mutex_);
395  metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
396  auto now = std::chrono::steady_clock::now();
397  if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
398  {
399  for (auto& metric : metric_plugins_) { if(metric) metric->sendMetrics(); }
400  last_send_time = now;
401  }
402  }
403 
404  auto processing_start = std::chrono::steady_clock::now();
405  auto temp_list = std::list<std::unique_ptr<MetricData>>();
406  {
407  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
408 
409  for (auto& q : metric_queue_)
410  {
411  temp_list.splice(temp_list.end(), q.second.second);
412  q.second.first = 0;
413  }
414 
415  temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
416  auto missed = missed_metric_calls_.exchange(0);
417 
418  temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
419  TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
420  }
421 
422  while (temp_list.size() > 0)
423  {
424  auto data_ = std::move(temp_list.front());
425  temp_list.pop_front();
426  if (data_->Type == MetricType::InvalidMetric) continue;
427  if (!data_->UseNameOverride)
428  {
429  if (data_->MetricPrefix.size() > 0)
430  {
431  data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
432  }
433  else
434  {
435  data_->Name = prefix_ + "." + data_->Name;
436  }
437  }
438 
439  for (auto& metric : metric_plugins_)
440  {
441  if (!metric) continue;
442  if (metric->getRunLevel() >= data_->Level)
443  {
444  try
445  {
446  metric->addMetricData(*data_);
447  last_send_time = std::chrono::steady_clock::now();
448  }
449  catch (...)
450  {
451  TLOG(TLVL_ERROR) <<
452  "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
453  << metric->getLibName() ;
454  }
455  }
456  }
457  }
458 
459  for (auto& metric : metric_plugins_)
460  {
461  if (!metric) continue;
462  metric->sendMetrics(false, processing_start);
463  }
464  }
465 
466  auto temp_list = std::list<std::unique_ptr<MetricData>>();
467  {
468  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
469 
470  for (auto& q : metric_queue_)
471  {
472  temp_list.splice(temp_list.end(), q.second.second);
473  }
474  metric_queue_.clear();
475 
476  temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
477  auto missed = missed_metric_calls_.exchange(0);
478 
479  temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
480  TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
481  }
482 
483  while (temp_list.size() > 0)
484  {
485  auto data_ = std::move(temp_list.front());
486  temp_list.pop_front();
487  if (data_->Type == MetricType::InvalidMetric) continue;
488  if (!data_->UseNameOverride)
489  {
490  if (data_->MetricPrefix.size() > 0)
491  {
492  data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
493  }
494  else
495  {
496  data_->Name = prefix_ + "." + data_->Name;
497  }
498  }
499 
500  for (auto& metric : metric_plugins_)
501  {
502  if (!metric) continue;
503  if (metric->getRunLevel() >= data_->Level)
504  {
505  try
506  {
507  metric->addMetricData(*data_);
508  last_send_time = std::chrono::steady_clock::now();
509  }
510  catch (...)
511  {
512  TLOG(TLVL_ERROR) <<
513  "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
514  << metric->getLibName() ;
515  }
516  }
517  }
518  }
519 
520  for (auto& metric : metric_plugins_)
521  {
522  if (!metric) continue;
523  try
524  {
525  metric->stopMetrics();
526  TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
527  }
528  catch (...)
529  {
530  TLOG(TLVL_ERROR) <<
531  "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
532  metric->getLibName() ;
533  }
534  }
535  TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
536 }
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level >= to level...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting int...
Definition: MetricData.hh:30
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
Sends both the Accumulate mode and Rate mode metric. (Rate mode metric will append "/s" to metric uni...
Small structure used to hold a metric data point before sending to the metric plugins ...
Definition: MetricData.hh:42
Default, invalid value.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.