9 #define TRACE_NAME "MetricManager"
10 #include "artdaq-utilities/Plugins/MetricManager.hh"
11 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
12 #include "fhiclcpp/ParameterSet.h"
15 #include <boost/exception/all.hpp>
20 , metric_send_interval_ms_(15000)
21 , metric_holdoff_us_(1000)
22 , system_metric_collector_(nullptr)
27 , missed_metric_calls_(0)
29 , metric_cache_max_size_(1000)
30 , metric_cache_notify_size_(10) {}
41 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
43 std::vector<std::string> names = pset.get_names();
45 metric_plugins_.clear();
46 bool send_system_metrics =
false;
47 bool send_process_metrics =
false;
49 for (
auto name : names)
51 if (name ==
"metric_queue_size")
53 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
55 else if (name ==
"metric_queue_notify_size")
57 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
59 else if (name ==
"metric_cache_size")
61 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
63 else if (name ==
"metric_cache_notify_size")
65 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
67 else if (name ==
"metric_send_maximum_delay_ms")
69 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
70 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
72 else if (name ==
"metric_holdoff_us")
74 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
75 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
77 else if (name ==
"send_system_metrics")
79 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
81 else if (name ==
"send_process_metrics")
83 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
89 TLOG(TLVL_DEBUG) <<
"Constructing metric plugin with name " << name;
90 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
91 metric_plugins_.push_back(
92 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_));
94 catch (
const cet::exception& e)
96 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
97 <<
", cet::exception object caught:" << e.explain_self();
99 catch (
const boost::exception& e)
101 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
102 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
104 catch (
const std::exception& e)
106 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
107 <<
", std::exception caught: " << e.what();
111 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
117 if (send_system_metrics || send_process_metrics)
119 system_metric_collector_.reset(
new SystemMetricCollector(send_process_metrics, send_system_metrics));
127 std::lock_guard<std::mutex> lk(metric_mutex_);
130 TLOG(TLVL_DEBUG) <<
"Starting MetricManager";
131 for (
auto& metric : metric_plugins_)
133 if (!metric)
continue;
136 metric->startMetrics();
137 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
142 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
143 << metric->getLibName();
153 std::unique_lock<std::mutex> lk(metric_mutex_);
154 TLOG(TLVL_DEBUG) <<
"Stopping Metrics";
156 metric_cv_.notify_all();
157 TLOG(TLVL_DEBUG) <<
"Joining Metric-Sending thread";
159 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
160 TLOG(TLVL_DEBUG) <<
"do_stop Complete";
173 initialize(pset, prefix);
178 TLOG(TLVL_DEBUG) <<
"MetricManager is shutting down...";
181 std::lock_guard<std::mutex> lk(metric_mutex_);
184 initialized_ =
false;
185 for (
auto& i : metric_plugins_)
189 std::string name = i->getLibName();
191 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << name <<
" shutdown.";
195 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
199 metric_plugins_.clear();
204 int level,
MetricMode mode, std::string
const& metricPrefix,
205 bool useNameOverride)
209 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
213 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
218 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
220 last_metric_received_ = std::chrono::steady_clock::now();
221 auto& cached = metric_cache_[name];
222 if (cached ==
nullptr)
224 metric_cache_[name].reset(
new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
228 auto size = cached->DataPointCount;
229 if (size < metric_cache_max_size_)
231 if (size >= metric_cache_notify_size_)
233 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
238 cached->StringValue = value;
239 cached->DataPointCount = 1;
243 cached->StringValue +=
" " + value;
244 cached->DataPointCount++;
249 TLOG(10) <<
"Rejecting metric because queue full";
250 missed_metric_calls_++;
254 metric_cv_.notify_all();
259 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
263 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
267 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
272 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
274 last_metric_received_ = std::chrono::steady_clock::now();
275 auto& cached = metric_cache_[name];
276 if (cached ==
nullptr)
278 metric_cache_[name].reset(
new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
282 auto size = cached->DataPointCount;
283 if (size < metric_cache_max_size_)
285 if (size >= metric_cache_notify_size_)
287 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
290 cached->AddPoint(value);
294 TLOG(10) <<
"Rejecting metric because queue full";
295 missed_metric_calls_++;
299 metric_cv_.notify_all();
304 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
308 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
312 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
317 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
319 last_metric_received_ = std::chrono::steady_clock::now();
320 auto& cached = metric_cache_[name];
321 if (cached ==
nullptr)
323 metric_cache_[name].reset(
new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
327 auto size = cached->DataPointCount;
328 if (size < metric_cache_max_size_)
330 if (size >= metric_cache_notify_size_)
332 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
335 cached->AddPoint(value);
339 TLOG(10) <<
"Rejecting metric because queue full";
340 missed_metric_calls_++;
344 metric_cv_.notify_all();
349 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
353 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
357 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
362 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
364 last_metric_received_ = std::chrono::steady_clock::now();
365 auto& cached = metric_cache_[name];
366 if (cached ==
nullptr)
368 metric_cache_[name].reset(
new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
372 auto size = cached->DataPointCount;
373 if (size < metric_cache_max_size_)
375 if (size >= metric_cache_notify_size_)
377 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
380 cached->AddPoint(value);
384 TLOG(10) <<
"Rejecting metric because queue full";
385 missed_metric_calls_++;
389 metric_cv_.notify_all();
394 int level,
MetricMode mode, std::string
const& metricPrefix,
395 bool useNameOverride)
399 TLOG(TLVL_WARNING) <<
"Attempted to send metric when MetricManager has not yet been initialized!";
403 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
408 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
410 last_metric_received_ = std::chrono::steady_clock::now();
411 auto& cached = metric_cache_[name];
412 if (cached ==
nullptr)
414 metric_cache_[name].reset(
new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
418 auto size = cached->DataPointCount;
419 if (size < metric_cache_max_size_)
421 if (size >= metric_cache_notify_size_)
423 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
426 cached->AddPoint(value);
430 TLOG(10) <<
"Rejecting metric because queue full";
431 missed_metric_calls_++;
435 metric_cv_.notify_all();
439 void artdaq::MetricManager::startMetricLoop_()
441 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
442 boost::thread::attributes attrs;
443 attrs.set_stack_size(4096 * 2000);
444 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
447 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
449 catch (
const boost::exception& e)
451 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
452 <<
", errno=" << errno;
453 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
454 <<
", errno=" << errno << std::endl;
457 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
462 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
463 for (
auto& cache_entry : metric_cache_)
465 if (cache_entry.second->DataPointCount > 0)
return false;
473 bool pluginsBusy =
false;
475 for (
auto& p : metric_plugins_)
477 if (p->metricsPending())
484 TLOG(TLVL_TRACE) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
485 return !metricQueueEmpty() || busy_ || pluginsBusy;
490 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
494 for (
auto& q : metric_cache_)
496 size += q.second->DataPointCount;
501 if (metric_cache_.count(name)) size = metric_cache_[name]->DataPointCount;
507 void artdaq::MetricManager::sendMetricLoop_()
509 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
510 auto last_send_time = std::chrono::steady_clock::time_point();
513 TLOG(6) <<
"sendMetricLoop_: Entering Metric input wait loop";
514 while (metricQueueEmpty() && running_)
516 std::unique_lock<std::mutex> lk(metric_mutex_);
517 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
518 auto now = std::chrono::steady_clock::now();
519 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
520 metric_send_interval_ms_)
522 TLOG(6) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
523 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
525 usleep(metric_holdoff_us_);
527 for (
auto& metric : metric_plugins_)
529 if (metric) metric->sendMetrics();
531 last_send_time = now;
534 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
536 usleep(metric_holdoff_us_);
539 TLOG(6) <<
"sendMetricLoop_: After Metric input wait loop";
541 auto processing_start = std::chrono::steady_clock::now();
542 auto temp_list = std::list<std::unique_ptr<MetricData>>();
544 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
546 for (
auto& q : metric_cache_)
548 temp_list.emplace_back(
new MetricData(*q.second));
553 auto calls = metric_calls_.exchange(0);
554 temp_list.emplace_back(
557 auto missed = missed_metric_calls_.exchange(0);
558 temp_list.emplace_back(
561 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
564 if (system_metric_collector_ !=
nullptr)
566 TLOG(TLVL_TRACE) <<
"Collecting System metrics (CPU, RAM, Network)";
567 auto systemMetrics = system_metric_collector_->SendMetrics();
568 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
571 TLOG(6) <<
"sendMetricLoop_: Before processing temp_list";
572 while (temp_list.size() > 0)
574 auto data_ = std::move(temp_list.front());
575 temp_list.pop_front();
577 if (!data_->UseNameOverride)
579 if (data_->MetricPrefix.size() > 0)
581 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
585 data_->Name = prefix_ +
"." + data_->Name;
589 for (
auto& metric : metric_plugins_)
591 if (!metric)
continue;
592 if (metric->IsLevelEnabled(data_->Level))
596 metric->addMetricData(data_);
597 last_send_time = std::chrono::steady_clock::now();
601 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
602 << metric->getLibName();
608 TLOG(6) <<
"sendMetricLoop_: Before sending metrics";
609 for (
auto& metric : metric_plugins_)
611 if (!metric)
continue;
612 metric->sendMetrics(
false, processing_start);
616 TLOG(6) <<
"sendMetricLoop_: End of working loop";
622 auto temp_list = std::list<std::unique_ptr<MetricData>>();
624 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
626 for (
auto& q : metric_cache_)
628 if (q.second !=
nullptr && q.second->DataPointCount > 0)
630 temp_list.emplace_back(
new MetricData(*q.second));
637 auto calls = metric_calls_.exchange(0);
638 temp_list.emplace_back(
641 auto missed = missed_metric_calls_.exchange(0);
642 temp_list.emplace_back(
645 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
648 while (temp_list.size() > 0)
650 auto data_ = std::move(temp_list.front());
651 temp_list.pop_front();
653 if (!data_->UseNameOverride)
655 if (data_->MetricPrefix.size() > 0)
657 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
661 data_->Name = prefix_ +
"." + data_->Name;
665 for (
auto& metric : metric_plugins_)
667 if (!metric)
continue;
668 if (metric->IsLevelEnabled(data_->Level))
672 metric->addMetricData(data_);
673 last_send_time = std::chrono::steady_clock::now();
677 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
678 << metric->getLibName();
684 for (
auto& metric : metric_plugins_)
686 if (!metric)
continue;
689 metric->stopMetrics();
690 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
694 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
695 << metric->getLibName();
699 TLOG(TLVL_DEBUG) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins whose threshold level is greater than or equal to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
Reports the sum of all values, divided by the length of the time interval they were accumulated over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting interval.
Collects metrics from the system, using proc filesystem or kernel API calls
Small structure used to hold a metric data point before sending to the metric plugins ...
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.