9 #define TRACE_NAME "MetricManager"
10 #include "artdaq-utilities/Plugins/MetricManager.hh"
11 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
12 #include "fhiclcpp/ParameterSet.h"
15 #include <boost/exception/all.hpp>
20 , metric_send_interval_ms_(15000)
21 , metric_holdoff_us_(1000)
22 , system_metric_collector_(nullptr)
27 , missed_metric_calls_(0)
29 , metric_cache_max_size_(1000)
30 , metric_cache_notify_size_(10) {}
41 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
43 std::vector<std::string> names = pset.get_names();
45 metric_plugins_.clear();
46 bool send_system_metrics =
false;
47 bool send_process_metrics =
false;
49 for (
auto name : names)
51 if (name ==
"metric_queue_size")
53 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
55 else if (name ==
"metric_queue_notify_size")
57 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
59 else if (name ==
"metric_cache_size")
61 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
63 else if (name ==
"metric_cache_notify_size")
65 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
67 else if (name ==
"metric_send_maximum_delay_ms")
69 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
70 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
72 else if (name ==
"metric_holdoff_us")
74 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
75 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
77 else if (name ==
"send_system_metrics")
79 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
81 else if (name ==
"send_process_metrics")
83 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
89 TLOG(TLVL_DEBUG) <<
"Constructing metric plugin with name " << name;
90 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
91 metric_plugins_.push_back(
92 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_));
94 catch (
const cet::exception& e)
96 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
97 <<
", cet::exception object caught:" << e.explain_self();
99 catch (
const boost::exception& e)
101 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
102 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
104 catch (
const std::exception& e)
106 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
107 <<
", std::exception caught: " << e.what();
111 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
117 if (send_system_metrics || send_process_metrics)
119 system_metric_collector_.reset(
new SystemMetricCollector(send_process_metrics, send_system_metrics));
127 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
130 TLOG(TLVL_DEBUG) <<
"Starting MetricManager";
131 for (
auto& metric : metric_plugins_)
133 if (!metric)
continue;
136 metric->startMetrics();
137 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
142 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
143 << metric->getLibName();
153 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
154 TLOG(TLVL_DEBUG) <<
"Stopping Metrics";
156 metric_cv_.notify_all();
157 TLOG(TLVL_DEBUG) <<
"Joining Metric-Sending thread";
159 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
160 TLOG(TLVL_DEBUG) <<
"do_stop Complete";
173 initialize(pset, prefix);
178 TLOG(TLVL_DEBUG) <<
"MetricManager is shutting down...";
181 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
184 for (
auto& i : metric_plugins_)
188 std::string name = i->getLibName();
190 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << name <<
" shutdown.";
194 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
198 metric_plugins_.clear();
199 initialized_ =
false;
204 int level,
MetricMode mode, std::string
const& metricPrefix,
205 bool useNameOverride)
209 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
213 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
218 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
220 last_metric_received_ = std::chrono::steady_clock::now();
221 if (!metric_cache_.count(name) || metric_cache_[name] ==
nullptr)
223 metric_cache_[name] =
224 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
228 auto size = metric_cache_[name]->DataPointCount;
229 if (size < metric_cache_max_size_)
231 if (size >= metric_cache_notify_size_)
233 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
238 metric_cache_[name]->StringValue = value;
239 metric_cache_[name]->DataPointCount = 1;
243 metric_cache_[name]->StringValue +=
" " + value;
244 metric_cache_[name]->DataPointCount++;
249 TLOG(10) <<
"Rejecting metric because queue full";
250 missed_metric_calls_++;
254 metric_cv_.notify_all();
259 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
263 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
267 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
272 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
274 last_metric_received_ = std::chrono::steady_clock::now();
275 if (!metric_cache_.count(name) || metric_cache_[name] ==
nullptr)
277 metric_cache_[name] =
278 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
282 auto size = metric_cache_[name]->DataPointCount;
283 if (size < metric_cache_max_size_)
285 if (size >= metric_cache_notify_size_)
287 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
290 metric_cache_[name]->AddPoint(value);
294 TLOG(10) <<
"Rejecting metric because queue full";
295 missed_metric_calls_++;
299 metric_cv_.notify_all();
304 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
308 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
312 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
317 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
319 last_metric_received_ = std::chrono::steady_clock::now();
320 if (!metric_cache_.count(name) || metric_cache_[name] ==
nullptr)
322 metric_cache_[name] =
323 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
327 auto size = metric_cache_[name]->DataPointCount;
328 if (size < metric_cache_max_size_)
330 if (size >= metric_cache_notify_size_)
332 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
335 metric_cache_[name]->AddPoint(value);
339 TLOG(10) <<
"Rejecting metric because queue full";
340 missed_metric_calls_++;
344 metric_cv_.notify_all();
349 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
353 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
357 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
362 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
364 last_metric_received_ = std::chrono::steady_clock::now();
365 if (!metric_cache_.count(name) || metric_cache_[name] ==
nullptr)
367 metric_cache_[name] =
368 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
372 auto size = metric_cache_[name]->DataPointCount;
373 if (size < metric_cache_max_size_)
375 if (size >= metric_cache_notify_size_)
377 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
380 metric_cache_[name]->AddPoint(value);
384 TLOG(10) <<
"Rejecting metric because queue full";
385 missed_metric_calls_++;
389 metric_cv_.notify_all();
394 int level,
MetricMode mode, std::string
const& metricPrefix,
395 bool useNameOverride)
399 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
403 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
408 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
410 last_metric_received_ = std::chrono::steady_clock::now();
411 if (!metric_cache_.count(name) || metric_cache_[name] ==
nullptr)
413 metric_cache_[name] =
414 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
418 auto size = metric_cache_[name]->DataPointCount;
419 if (size < metric_cache_max_size_)
421 if (size >= metric_cache_notify_size_)
423 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
426 metric_cache_[name]->AddPoint(value);
430 TLOG(10) <<
"Rejecting metric because queue full";
431 missed_metric_calls_++;
435 metric_cv_.notify_all();
// Starts (or restarts) the background thread that executes sendMetricLoop_().
// NOTE(review): in this listing every statement carries its original line
// number as a prefix, and the braces / `try` keyword that frame the statements
// below are not shown; only comments have been added here.
439 void artdaq::MetricManager::startMetricLoop_()
// If a sending thread object is still joinable, wait for it to finish before
// assigning a new thread to metric_sending_thread_.
441 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
// Request an explicit stack size of 4096 * 2000 (= 8,192,000) for the new thread.
442 boost::thread::attributes attrs;
443 attrs.set_stack_size(4096 * 2000);
444 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
// Launch the thread; it runs the member function sendMetricLoop_ on this object.
447 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
// A boost::exception raised while creating the thread is reported twice: once
// through TLOG and once on std::cerr, each time together with the current errno.
449 catch (
const boost::exception& e)
451 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
452 <<
", errno=" << errno;
453 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
454 <<
", errno=" << errno << std::endl;
457 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
462 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
463 return metric_cache_.size() == 0;
468 bool pluginsBusy =
false;
470 for (
auto& p : metric_plugins_)
472 if (p->metricsPending())
479 TLOG(TLVL_TRACE) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
480 return !metricQueueEmpty() || busy_ || pluginsBusy;
485 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
489 for (
auto& q : metric_cache_)
491 size += q.second->DataPointCount;
496 if (metric_cache_.count(name)) size = metric_cache_[name]->DataPointCount;
// Body of the metric-sending thread: waits for cached metrics, hands them to
// every configured plugin, and finally stops the plugins.
// NOTE(review): in this listing every statement carries its original line
// number as a prefix; braces, loop boundaries, `try` keywords and some argument
// lists are not shown, so the nesting of the statements below cannot be read
// off directly. Only comments have been added here.
502 void artdaq::MetricManager::sendMetricLoop_()
504 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
// Default-constructed time_point (the clock's epoch), so the first
// send-interval comparison below sees a very large elapsed time.
505 auto last_send_time = std::chrono::steady_clock::time_point();
508 TLOG(6) <<
"sendMetricLoop_: Entering Metric input wait loop";
// Idle while nothing is cached and running_ is still set.
509 while (metricQueueEmpty() && running_)
// Block on the condition variable for at most 100 ms per iteration.
511 std::unique_lock<std::mutex> lk(metric_mutex_);
512 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
513 auto now = std::chrono::steady_clock::now();
// When more than metric_send_interval_ms_ milliseconds have passed since the
// last send, ask the plugins to send even though no new metric arrived.
514 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
515 metric_send_interval_ms_)
517 TLOG(6) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
// Hold-off: if a metric was received less than metric_holdoff_us_ microseconds
// ago, sleep for metric_holdoff_us_ microseconds before sending.
518 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
520 usleep(metric_holdoff_us_);
// Null plugin entries are skipped.
522 for (
auto& metric : metric_plugins_)
524 if (metric) metric->sendMetrics();
526 last_send_time = now;
// Same hold-off check, this time against a fresh clock reading.
529 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
531 usleep(metric_holdoff_us_);
534 TLOG(6) <<
"sendMetricLoop_: After Metric input wait loop";
// processing_start is later passed to each plugin's sendMetrics call.
536 auto processing_start = std::chrono::steady_clock::now();
537 auto temp_list = std::list<std::unique_ptr<MetricData>>();
// Under metric_cache_mutex_, move every cached MetricData into temp_list and
// empty the cache.
539 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
541 for (
auto& q : metric_cache_)
543 temp_list.emplace_back(std::move(q.second));
545 metric_cache_.clear();
// Read and reset the call counters; each read is followed by an emplace_back
// onto temp_list whose arguments are not visible in this listing.
547 auto calls = metric_calls_.exchange(0);
548 temp_list.emplace_back(
551 auto missed = missed_metric_calls_.exchange(0);
552 temp_list.emplace_back(
555 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
// If a system metric collector exists, append the metrics it returns.
558 if (system_metric_collector_ !=
nullptr)
560 TLOG(TLVL_TRACE) <<
"Collecting System metrics (CPU, RAM, Network)";
561 auto systemMetrics = system_metric_collector_->SendMetrics();
562 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
566 TLOG(6) <<
"sendMetricLoop_: Before processing temp_list";
// Take the entries off the front of temp_list one at a time.
567 while (temp_list.size() > 0)
569 auto data_ = std::move(temp_list.front());
570 temp_list.pop_front();
// Unless UseNameOverride is set, the name becomes prefix_ + "." + Name, with
// MetricPrefix inserted in between when it is non-empty.
572 if (!data_->UseNameOverride)
574 if (data_->MetricPrefix.size() > 0)
576 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
580 data_->Name = prefix_ +
"." + data_->Name;
// Offer the entry to every non-null plugin that reports its Level as enabled.
584 for (
auto& metric : metric_plugins_)
586 if (!metric)
continue;
587 if (metric->IsLevelEnabled(data_->Level))
591 metric->addMetricData(data_);
592 last_send_time = std::chrono::steady_clock::now();
// NOTE(review): presumably the body of an exception handler whose `catch`
// line is not shown - confirm against the original file.
596 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
597 << metric->getLibName();
603 TLOG(6) <<
"sendMetricLoop_: Before sending metrics";
// Call sendMetrics(false, processing_start) on every non-null plugin.
604 for (
auto& metric : metric_plugins_)
606 if (!metric)
continue;
607 metric->sendMetrics(
false, processing_start);
611 TLOG(6) <<
"sendMetricLoop_: End of working loop";
// Second drain of the cache. The lines shown repeat the move / rename /
// addMetricData sequence above, but without the system-metric collection and
// without a sendMetrics call.
// NOTE(review): presumably this runs once after the main loop has exited - the
// loop's closing brace is not visible here, so confirm.
617 auto temp_list = std::list<std::unique_ptr<MetricData>>();
619 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
621 for (
auto& q : metric_cache_)
623 temp_list.emplace_back(std::move(q.second));
625 metric_cache_.clear();
627 auto calls = metric_calls_.exchange(0);
628 temp_list.emplace_back(
631 auto missed = missed_metric_calls_.exchange(0);
632 temp_list.emplace_back(
635 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
639 while (temp_list.size() > 0)
641 auto data_ = std::move(temp_list.front());
642 temp_list.pop_front();
644 if (!data_->UseNameOverride)
646 if (data_->MetricPrefix.size() > 0)
648 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
652 data_->Name = prefix_ +
"." + data_->Name;
656 for (
auto& metric : metric_plugins_)
658 if (!metric)
continue;
659 if (metric->IsLevelEnabled(data_->Level))
663 metric->addMetricData(data_);
664 last_send_time = std::chrono::steady_clock::now();
668 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
669 << metric->getLibName();
// Finally call stopMetrics() on every non-null plugin. Note that the error
// message below names do_stop() although it is emitted from this function.
675 for (
auto& metric : metric_plugins_)
677 if (!metric)
continue;
680 metric->stopMetrics();
681 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
685 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
686 << metric->getLibName();
690 TLOG(TLVL_DEBUG) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level greater than or equal to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting int...
Collects metrics from the system, using proc filesystem or kernel API calls
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.