artdaq_utilities  v1_04_06
MetricManager.cc
1 // MetricManager.cc: MetricManager class implementation file
2 // Author: Eric Flumerfelt
3 // Last Modified: 11/14/2014
4 //
5 // MetricManager loads a user-specified set of plugins, sends them their configuration,
6 // and sends them data as it is received. It also maintains the state of the plugins
7 // relative to the application state.
8 
9 #define TRACE_NAME "MetricManager"
10 #include "tracemf.h"
11 #include "artdaq-utilities/Plugins/MetricManager.hh"
12 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
13 #include "fhiclcpp/ParameterSet.h"
14 
15 #include <chrono>
16 #include <boost/exception/all.hpp>
17 
19 MetricManager() : metric_plugins_(0)
20 , metric_send_interval_ms_(15000)
21 , initialized_(false)
22 , running_(false)
23 , active_(false)
24 , missed_metric_calls_(0)
25 , metric_queue_max_size_(1000)
26 , metric_queue_notify_size_(10)
27 {}
28 
30 {
31  shutdown();
32 }
33 
34 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string prefix)
35 {
36  prefix_ = prefix;
37  if (initialized_)
38  {
39  shutdown();
40  }
41  TLOG(TLVL_INFO) << "Configuring metrics with parameter set:\n" << pset.to_string() ;
42 
43  std::vector<std::string> names = pset.get_pset_names();
44 
45  for (auto name : names)
46  {
47  if (name == "metric_queue_size")
48  {
49  metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
50  }
51  else if (name == "metric_queue_notify_size")
52  {
53  metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
54  }
55  else if (name == "metric_send_maximum_delay_ms")
56  {
57  metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
58  }
59  else
60  {
61  try
62  {
63  TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
64  fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
65  metric_plugins_.push_back(makeMetricPlugin(
66  plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
67  }
68  catch (const cet::exception& e)
69  {
70  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
71  ", cet::exception object caught:" << e.explain_self() ;
72  }
73  catch (const boost::exception& e)
74  {
75  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
76  ", boost::exception object caught: " << boost::diagnostic_information(e) ;
77  }
78  catch (const std::exception& e)
79  {
80  TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
81  ", std::exception caught: " << e.what() ;
82  }
83  catch (...)
84  {
85  TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
86  }
87  }
88  }
89 
90  initialized_ = true;
91 }
92 
94 {
95  if (!running_)
96  {
97  TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
98  for (auto& metric : metric_plugins_)
99  {
100  try
101  {
102  metric->startMetrics();
103  TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
104  active_ = true;
105  }
106  catch (...)
107  {
108  TLOG(TLVL_ERROR) <<
109  "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
110  metric->getLibName() ;
111  }
112  }
113  running_ = true;
114  startMetricLoop_();
115  }
116 }
117 
119 {
120  TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
121  running_ = false;
122  metric_cv_.notify_all();
123  TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
124  if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
125  TLOG(TLVL_DEBUG) << "do_stop Complete" ;
126 }
127 
128 void artdaq::MetricManager::do_pause() { /*do_stop();*/ }
129 void artdaq::MetricManager::do_resume() { /*do_start();*/ }
130 
131 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string prefix)
132 {
133  shutdown();
134  initialize(pset, prefix);
135 }
136 
138 {
139  TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
140  do_stop();
141 
142  if (initialized_)
143  {
144  for (auto& i : metric_plugins_)
145  {
146  try
147  {
148  std::string name = i->getLibName();
149  i.reset(nullptr);
150  TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
151  }
152  catch (...)
153  {
154  TLOG(TLVL_ERROR) <<
155  "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
156  i->getLibName() ;
157  }
158  }
159  initialized_ = false;
160  }
161 }
162 
163 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
164 {
165  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
166  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
167  else if (active_)
168  {
169  if (!metric_queue_.count(name)) {
170  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
171  }
172  auto entry = &(metric_queue_[name]);
173 
174  auto size = entry->first;
175  if (size < metric_queue_max_size_)
176  {
177  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
178  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
179  {
180  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
181  entry->first++;
182  entry->second.emplace_back(std::move(metric));
183  }
184  }
185  else
186  {
187  TLOG(10) << "Rejecting metric because queue full" ;
188  missed_metric_calls_++;
189  }
190  metric_cv_.notify_all();
191  }
192 }
193 
194 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
195 {
196  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
197  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
198  else if (active_)
199  {
200  if (!metric_queue_.count(name)) {
201  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
202  }
203  auto entry = &(metric_queue_[name]);
204 
205  auto size = entry->first;
206  if (size < metric_queue_max_size_)
207  {
208  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
209  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
210  {
211  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
212  entry->first++;
213  entry->second.emplace_back(std::move(metric));
214  }
215  }
216  else
217  {
218  TLOG(10) << "Rejecting metric because queue full" ;
219  missed_metric_calls_++;
220  }
221  metric_cv_.notify_all();
222  }
223 }
224 
225 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
226 {
227  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
228  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
229  else if (active_)
230  {
231  if (!metric_queue_.count(name)) {
232  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
233  }
234  auto entry = &(metric_queue_[name]);
235 
236  auto size = entry->first;
237  if (size < metric_queue_max_size_)
238  {
239  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
240  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
241  {
242  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
243  entry->first++;
244  entry->second.emplace_back(std::move(metric));
245  }
246  }
247  else
248  {
249  TLOG(10) << "Rejecting metric because queue full" ;
250  missed_metric_calls_++;
251  }
252  metric_cv_.notify_all();
253  }
254 }
255 
256 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
257 {
258  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
259  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
260  else if (active_)
261  {
262  if (!metric_queue_.count(name)) {
263  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
264  }
265  auto entry = &(metric_queue_[name]);
266 
267  auto size = entry->first;
268  if (size < metric_queue_max_size_)
269  {
270  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
271  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
272  {
273  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
274  entry->first++;
275  entry->second.emplace_back(std::move(metric));
276  }
277  }
278  else
279  {
280  TLOG(10) << "Rejecting metric because queue full" ;
281  missed_metric_calls_++;
282  }
283  metric_cv_.notify_all();
284  }
285 }
286 
287 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
288 {
289  if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
290  else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
291  else if (active_)
292  {
293  if (!metric_queue_.count(name)) {
294  metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
295  }
296  auto entry = &(metric_queue_[name]);
297 
298  auto size = entry->first;
299  if (size < metric_queue_max_size_)
300  {
301  if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
302  std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
303  {
304  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
305  entry->first++;
306  entry->second.emplace_back(std::move(metric));
307  }
308  }
309  else
310  {
311  TLOG(10) << "Rejecting metric because queue full" ;
312  missed_metric_calls_++;
313  }
314  metric_cv_.notify_all();
315  }
316 }
317 
318 void artdaq::MetricManager::startMetricLoop_()
319 {
320  if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
321  TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
322  boost::thread::attributes attrs;
323  attrs.set_stack_size(4096 * 200); // 800 KB
324  metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
325 }
326 
328 {
329  for (auto& q : metric_queue_)
330  {
331  if (q.second.first != 0) return false;
332  }
333  return true;
334 }
335 
337 {
338  size_t size = 0;
339  if (name == "") {
340 
341  }
342  else {
343  if (metric_queue_.count(name)) size = metric_queue_[name].first;
344  }
345 
346  return size;
347 }
348 
349 void artdaq::MetricManager::sendMetricLoop_()
350 {
351  auto last_send_time = std::chrono::steady_clock::time_point();
352  while (running_)
353  {
354  while (metricQueueEmpty() && running_)
355  {
356  std::unique_lock<std::mutex> lk(metric_mutex_);
357  metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
358  auto now = std::chrono::steady_clock::now();
359  if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
360  {
361  for (auto& metric : metric_plugins_) { metric->sendMetrics(); }
362  last_send_time = now;
363  }
364  }
365 
366  auto processing_start = std::chrono::steady_clock::now();
367  auto temp_list = std::list<std::unique_ptr<MetricData>>();
368  {
369  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
370 
371  for (auto& q : metric_queue_)
372  {
373  temp_list.splice(temp_list.end(), q.second.second);
374  q.second.first = 0;
375  }
376 
377  temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
378  auto missed = missed_metric_calls_.exchange(0);
379 
380  temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
381  TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
382  }
383 
384  while (temp_list.size() > 0)
385  {
386  auto data_ = std::move(temp_list.front());
387  temp_list.pop_front();
388  if (data_->Type == MetricType::InvalidMetric) continue;
389  if (!data_->UseNameOverride)
390  {
391  if (data_->MetricPrefix.size() > 0)
392  {
393  data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
394  }
395  else
396  {
397  data_->Name = prefix_ + "." + data_->Name;
398  }
399  }
400 
401  for (auto& metric : metric_plugins_)
402  {
403  if (metric->getRunLevel() >= data_->Level)
404  {
405  try
406  {
407  metric->addMetricData(*data_);
408  last_send_time = std::chrono::steady_clock::now();
409  }
410  catch (...)
411  {
412  TLOG(TLVL_ERROR) <<
413  "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
414  << metric->getLibName() ;
415  }
416  }
417  }
418  }
419 
420  for (auto& metric : metric_plugins_)
421  {
422  metric->sendMetrics(false, processing_start);
423  }
424  }
425 
426  auto temp_list = std::list<std::unique_ptr<MetricData>>();
427  {
428  std::unique_lock<std::mutex> lk(metric_queue_mutex_);
429 
430  for (auto& q : metric_queue_)
431  {
432  temp_list.splice(temp_list.end(), q.second.second);
433  }
434  metric_queue_.clear();
435 
436  temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
437  auto missed = missed_metric_calls_.exchange(0);
438 
439  temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
440  TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
441  }
442 
443  while (temp_list.size() > 0)
444  {
445  auto data_ = std::move(temp_list.front());
446  temp_list.pop_front();
447  if (data_->Type == MetricType::InvalidMetric) continue;
448  if (!data_->UseNameOverride)
449  {
450  if (data_->MetricPrefix.size() > 0)
451  {
452  data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
453  }
454  else
455  {
456  data_->Name = prefix_ + "." + data_->Name;
457  }
458  }
459 
460  for (auto& metric : metric_plugins_)
461  {
462  if (metric->getRunLevel() >= data_->Level)
463  {
464  try
465  {
466  metric->addMetricData(*data_);
467  last_send_time = std::chrono::steady_clock::now();
468  }
469  catch (...)
470  {
471  TLOG(TLVL_ERROR) <<
472  "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
473  << metric->getLibName() ;
474  }
475  }
476  }
477  }
478 
479  for (auto& metric : metric_plugins_)
480  {
481  try
482  {
483  metric->stopMetrics();
484  TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
485  }
486  catch (...)
487  {
488  TLOG(TLVL_ERROR) <<
489  "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
490  metric->getLibName() ;
491  }
492  }
493  TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
494 }
void reinitialize(fhicl::ParameterSet const &pset, std::string prefix="")
Reinitialize all MetricPlugin Instances.
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level >= to level...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting int...
Definition: MetricData.hh:30
size_t metricQueueSize(std::string name="")
Return the size of the named metric queue
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
Sends both the Accumulate mode and Rate mode metric. (Rate mode metric will append "/s" to metric uni...
Small structure used to hold a metric data point before sending to the metric plugins ...
Definition: MetricData.hh:42
Default, invalid value.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.