artdaq_utilities  v1_04_08
MetricManager.cc
1 // MetricManager.cc: MetricManager class implementation file
2 // Author: Eric Flumerfelt
3 // Last Modified: 11/14/2014
4 //
5 // MetricManager loads a user-specified set of plugins, sends them their configuration,
6 // and sends them data as it is received. It also maintains the state of the plugins
7 // relative to the application state.
8 
9 #define TRACE_NAME "MetricManager"
10 #include "tracemf.h"
11 #include "artdaq-utilities/Plugins/MetricManager.hh"
12 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
13 #include "fhiclcpp/ParameterSet.h"
14 
15 #include <chrono>
16 #include <boost/exception/all.hpp>
17 
19 MetricManager() : metric_plugins_(0)
20 , metric_send_interval_ms_(15000)
21 , initialized_(false)
22 , running_(false)
23 , active_(false)
24 , missed_metric_calls_(0)
25 , metric_queue_max_size_(1000)
26 , metric_queue_notify_size_(10)
27 {}
28 
30 {
31  shutdown();
32 }
33 
34 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string const& prefix)
35 {
36  prefix_ = prefix;
37  if (initialized_)
38  {
39  shutdown();
40  }
41  TLOG(TLVL_INFO) << "Configuring metrics with parameter set: " << pset.to_string() ;
42 
43  std::vector<std::string> names = pset.get_pset_names();
44 
45  for (auto name : names)
46  {
47  if (name == "metric_queue_size")
48  {
49  metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
50  }
51  else if (name == "metric_queue_notify_size")
52  {
53  metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
54  }
55  else if (name == "metric_send_maximum_delay_ms")
56  {
57  metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
58  }
59  else
60  {
61  try
62  {
63  TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
64  fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
65  metric_plugins_.push_back(makeMetricPlugin(
66  plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
67  }
68  catch (const cet::exception& e)
69  {
70  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
71  ", cet::exception object caught:" << e.explain_self() ;
72  }
73  catch (const boost::exception& e)
74  {
75  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
76  ", boost::exception object caught: " << boost::diagnostic_information(e) ;
77  }
78  catch (const std::exception& e)
79  {
80  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
81  ", std::exception caught: " << e.what() ;
82  }
83  catch (...)
84  {
85  TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
86  }
87  }
88  }
89 
90  initialized_ = true;
91 }
92 
94 {
95  auto lk = std::unique_lock<std::mutex>(metric_mutex_);
96  if (!running_)
97  {
98  TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
99  for (auto& metric : metric_plugins_)
100  {
101  try
102  {
103  metric->startMetrics();
104  TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
105  active_ = true;
106  }
107  catch (...)
108  {
109  TLOG(TLVL_ERROR) <<
110  "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
111  metric->getLibName() ;
112  }
113  }
114  running_ = true;
115  startMetricLoop_();
116  }
117 }
118 
120 {
121  auto lk = std::unique_lock<std::mutex>(metric_mutex_);
122  TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
123  running_ = false;
124  metric_cv_.notify_all();
125  TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
126  lk.unlock();
127  if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
128  TLOG(TLVL_DEBUG) << "do_stop Complete" ;
129 }
130 
131 void artdaq::MetricManager::do_pause() { /*do_stop();*/ }
132 void artdaq::MetricManager::do_resume() { /*do_start();*/ }
133 
134 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string const& prefix)
135 {
136  shutdown();
137  initialize(pset, prefix);
138 }
139 
141 {
142  TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
143  do_stop();
144 
145  auto lk = std::unique_lock<std::mutex>(metric_mutex_);
146  if (initialized_)
147  {
148  for (auto& i : metric_plugins_)
149  {
150  try
151  {
152  std::string name = i->getLibName();
153  i.reset(nullptr);
154  TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
155  }
156  catch (...)
157  {
158  TLOG(TLVL_ERROR) <<
159  "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
160  i->getLibName() ;
161  }
162  }
163  initialized_ = false;
164  }
165 }
166 
167 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
168 {
169  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
170  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
171  else if (active_)
172  {
173  {
174  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
175  if (!metric_queue_.count(name)) {
176  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
177  }
178  }
179  auto entry = &(metric_queue_[name]);
180 
181  auto size = entry->first.load();
182  if (size < metric_queue_max_size_)
183  {
184  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
185  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
186  {
187  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
188  entry->first++;
189  entry->second.emplace_back(std::move(metric));
190  }
191  }
192  else
193  {
194  TLOG(10) << "Rejecting metric because queue full" ;
195  missed_metric_calls_++;
196  }
197  metric_cv_.notify_all();
198  }
199 }
200 
201 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
202 {
203  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
204  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
205  else if (active_)
206  {
207  {
208  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
209  if (!metric_queue_.count(name)) {
210  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
211  }
212  }
213  auto entry = &(metric_queue_[name]);
214 
215  auto size = entry->first.load();
216  if (size < metric_queue_max_size_)
217  {
218  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
219  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
220  {
221  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
222  entry->first++;
223  entry->second.emplace_back(std::move(metric));
224  }
225  }
226  else
227  {
228  TLOG(10) << "Rejecting metric because queue full" ;
229  missed_metric_calls_++;
230  }
231  metric_cv_.notify_all();
232  }
233 }
234 
235 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
236 {
237  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
238  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
239  else if (active_)
240  {
241  {
242  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
243  if (!metric_queue_.count(name)) {
244  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
245  }
246  }
247  auto entry = &(metric_queue_[name]);
248 
249  auto size = entry->first.load();
250  if (size < metric_queue_max_size_)
251  {
252  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
253  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
254  {
255  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
256  entry->first++;
257  entry->second.emplace_back(std::move(metric));
258  }
259  }
260  else
261  {
262  TLOG(10) << "Rejecting metric because queue full" ;
263  missed_metric_calls_++;
264  }
265  metric_cv_.notify_all();
266  }
267 }
268 
269 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
270 {
271  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
272  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
273  else if (active_)
274  {
275  {
276  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
277  if (!metric_queue_.count(name)) {
278  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
279  }
280  }
281  auto entry = &(metric_queue_[name]);
282 
283  auto size = entry->first.load();
284  if (size < metric_queue_max_size_)
285  {
286  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
287  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
288  {
289  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
290  entry->first++;
291  entry->second.emplace_back(std::move(metric));
292  }
293  }
294  else
295  {
296  TLOG(10) << "Rejecting metric because queue full" ;
297  missed_metric_calls_++;
298  }
299  metric_cv_.notify_all();
300  }
301 }
302 
303 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
304 {
305  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
306  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
307  else if (active_)
308  {
309  {
310  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
311  if (!metric_queue_.count(name)) {
312  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
313  }
314  }
315  auto entry = &(metric_queue_[name]);
316 
317  auto size = entry->first.load();
318  if (size < metric_queue_max_size_)
319  {
320  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
321  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
322  {
323  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
324  entry->first++;
325  entry->second.emplace_back(std::move(metric));
326  }
327  }
328  else
329  {
330  TLOG(10) << "Rejecting metric because queue full" ;
331  missed_metric_calls_++;
332  }
333  metric_cv_.notify_all();
334  }
335 }
336 
337 void artdaq::MetricManager::startMetricLoop_()
338 {
339  if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
340  boost::thread::attributes attrs;
341  attrs.set_stack_size(4096 * 2000); // 8000 KB
342  TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
343  try {
344  metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
345  }
346  catch (const boost::exception& e)
347  {
348  TLOG(TLVL_ERROR) << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
349  std::cerr << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
350  exit(5);
351  }
352  TLOG(TLVL_INFO) << "Metric Sending thread started";
353 }
354 
356 {
357  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
358  for (auto& q : metric_queue_)
359  {
360  if (q.second.first != 0) return false;
361  }
362  return true;
363 }
364 
365 size_t artdaq::MetricManager::metricQueueSize(std::string const& name)
366 {
367  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
368  size_t size = 0;
369  if (name == "") {
370  for (auto& q : metric_queue_)
371  {
372  size += q.second.first;
373  }
374  }
375  else {
376  if (metric_queue_.count(name)) size = metric_queue_[name].first;
377  }
378 
379  return size;
380 }
381 
382 void artdaq::MetricManager::sendMetricLoop_()
383 {
384  TLOG(TLVL_INFO) << "sendMetricLoop_ START";
385  auto last_send_time = std::chrono::steady_clock::time_point();
386  while (running_)
387  {
388  while (metricQueueEmpty() && running_)
389  {
390  std::unique_lock<std::mutex> lk(metric_mutex_);
391  metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
392  auto now = std::chrono::steady_clock::now();
393  if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
394  {
395  for (auto& metric : metric_plugins_) { metric->sendMetrics(); }
396  last_send_time = now;
397  }
398  }
399 
400  auto processing_start = std::chrono::steady_clock::now();
401  auto temp_list = std::list<std::unique_ptr<MetricData>>();
402  {
403  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
404 
405  for (auto& q : metric_queue_)
406  {
407  temp_list.splice(temp_list.end(), q.second.second);
408  q.second.first = 0;
409  }
410 
411  temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
412  auto missed = missed_metric_calls_.exchange(0);
413 
414  temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
415  TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
416  }
417 
418  while (temp_list.size() > 0)
419  {
420  auto data_ = std::move(temp_list.front());
421  temp_list.pop_front();
422  if (data_->Type == MetricType::InvalidMetric) continue;
423  if (!data_->UseNameOverride)
424  {
425  if (data_->MetricPrefix.size() > 0)
426  {
427  data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
428  }
429  else
430  {
431  data_->Name = prefix_ + "." + data_->Name;
432  }
433  }
434 
435  for (auto& metric : metric_plugins_)
436  {
437  if (metric->getRunLevel() >= data_->Level)
438  {
439  try
440  {
441  metric->addMetricData(*data_);
442  last_send_time = std::chrono::steady_clock::now();
443  }
444  catch (...)
445  {
446  TLOG(TLVL_ERROR) <<
447  "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
448  << metric->getLibName() ;
449  }
450  }
451  }
452  }
453 
454  for (auto& metric : metric_plugins_)
455  {
456  metric->sendMetrics(false, processing_start);
457  }
458  }
459 
460  auto temp_list = std::list<std::unique_ptr<MetricData>>();
461  {
462  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
463 
464  for (auto& q : metric_queue_)
465  {
466  temp_list.splice(temp_list.end(), q.second.second);
467  }
468  metric_queue_.clear();
469 
470  temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
471  auto missed = missed_metric_calls_.exchange(0);
472 
473  temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
474  TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
475  }
476 
477  while (temp_list.size() > 0)
478  {
479  auto data_ = std::move(temp_list.front());
480  temp_list.pop_front();
481  if (data_->Type == MetricType::InvalidMetric) continue;
482  if (!data_->UseNameOverride)
483  {
484  if (data_->MetricPrefix.size() > 0)
485  {
486  data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
487  }
488  else
489  {
490  data_->Name = prefix_ + "." + data_->Name;
491  }
492  }
493 
494  for (auto& metric : metric_plugins_)
495  {
496  if (metric->getRunLevel() >= data_->Level)
497  {
498  try
499  {
500  metric->addMetricData(*data_);
501  last_send_time = std::chrono::steady_clock::now();
502  }
503  catch (...)
504  {
505  TLOG(TLVL_ERROR) <<
506  "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
507  << metric->getLibName() ;
508  }
509  }
510  }
511  }
512 
513  for (auto& metric : metric_plugins_)
514  {
515  try
516  {
517  metric->stopMetrics();
518  TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
519  }
520  catch (...)
521  {
522  TLOG(TLVL_ERROR) <<
523  "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
524  metric->getLibName() ;
525  }
526  }
527  TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
528 }
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level >= to level...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting int...
Definition: MetricData.hh:30
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
Sends both the Accumulate mode and Rate mode metric. (Rate mode metric will append "/s" to metric uni...
Small structure used to hold a metric data point before sending to the metric plugins ...
Definition: MetricData.hh:42
Default, invalid value.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.