9 #define TRACE_NAME "MetricManager"
10 #include "artdaq-utilities/Plugins/MetricManager.hh"
11 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
12 #include "fhiclcpp/ParameterSet.h"
16 #include <boost/exception/all.hpp>
22 , system_metric_collector_(nullptr)
27 , missed_metric_calls_(0)
// Body of MetricManager::initialize (name taken from the error messages below;
// the signature, braces and else/try lines are not visible in this listing).
// Reads manager-level settings from `pset` and builds one metric plugin for
// each remaining key.
39 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
// Every key of the parameter set is examined in turn.
41 std::vector<std::string> names = pset.get_names();
// Previously configured plugins are dropped before new ones are built.
43 metric_plugins_.clear();
// Both flags stay false unless the parameter set contains the matching key.
44 bool send_system_metrics =
false;
45 bool send_process_metrics =
false;
47 for (
const auto& name : names)
// "metric_queue_size" and "metric_cache_size" both write metric_cache_max_size_;
// if both keys are present, whichever is iterated last wins.
49 if (name ==
"metric_queue_size")
51 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
// Likewise "metric_queue_notify_size" and "metric_cache_notify_size" both
// write metric_cache_notify_size_.
53 else if (name ==
"metric_queue_notify_size")
55 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
57 else if (name ==
"metric_cache_size")
59 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
61 else if (name ==
"metric_cache_notify_size")
63 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
// The key is "metric_send_maximum_delay_ms" but the value is stored in
// metric_send_interval_ms_ (the log message uses the member name).
65 else if (name ==
"metric_send_maximum_delay_ms")
67 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
68 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
70 else if (name ==
"metric_holdoff_us")
72 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
73 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
75 else if (name ==
"send_system_metrics")
77 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
79 else if (name ==
"send_process_metrics")
81 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
// NOTE(review): presumably the fall-through branch for any key not matched
// above (the else/try lines are not visible). The key's value is read as a
// nested ParameterSet describing one plugin.
87 TLOG(TLVL_DEBUG) <<
"Constructing metric plugin with name " << name;
88 auto plugin_pset = pset.get<fhicl::ParameterSet>(name);
// The plugin type comes from "metricPluginType" and defaults to "".
89 metric_plugins_.push_back(
90 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_));
// Each handler below logs the failure at error level; no rethrow is visible
// in this listing.
92 catch (
const cet::exception& e)
94 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
95 <<
", cet::exception object caught:" << e.explain_self();
97 catch (
const boost::exception& e)
99 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
100 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
102 catch (
const std::exception& e)
104 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
105 <<
", std::exception caught: " << e.what();
109 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
// The collector is created only when at least one flag was enabled. Note the
// argument order: the process flag is passed first, then the system flag.
115 if (send_system_metrics || send_process_metrics)
117 system_metric_collector_ = std::make_unique<SystemMetricCollector>(send_process_metrics, send_system_metrics);
// Body of MetricManager::do_start (name taken from the error message below;
// signature and braces are not visible). Calls startMetrics() on every
// configured plugin.
// metric_mutex_ is taken here; how far the lock scope extends cannot be
// determined from this listing.
125 std::lock_guard<std::mutex> lk(metric_mutex_);
128 TLOG(TLVL_DEBUG) <<
"Starting MetricManager";
129 for (
auto& metric : metric_plugins_)
137 metric->startMetrics();
138 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
// Failure path: the plugin's library name is logged at error level.
143 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
144 << metric->getLibName();
// Body of MetricManager::do_stop (name taken from the final log message;
// signature and braces are not visible). Wakes the metric-sending thread and
// joins it.
// NOTE(review): original lines 156 and 159-161 are missing from this listing,
// so whether `lk` is released before join() cannot be confirmed here.
154 std::unique_lock<std::mutex> lk(metric_mutex_);
155 TLOG(TLVL_DEBUG) <<
"Stopping Metrics";
// Wake any thread waiting on metric_cv_.
157 metric_cv_.notify_all();
158 TLOG(TLVL_DEBUG) <<
"Joining Metric-Sending thread";
// join() is only called on a joinable thread.
162 if (metric_sending_thread_.joinable())
164 metric_sending_thread_.join();
171 TLOG(TLVL_DEBUG) <<
"do_stop Complete";
// NOTE(review): presumably the tail of MetricManager::reinitialize (its
// signature is not visible); the only visible statement forwards to
// initialize() with the same arguments.
184 initialize(pset, prefix);
// Body of MetricManager::shutdown (name taken from the error message below;
// signature and braces are not visible). Marks the manager uninitialized and
// releases all plugins.
189 TLOG(TLVL_DEBUG) <<
"MetricManager is shutting down...";
192 std::lock_guard<std::mutex> lk(metric_mutex_);
195 initialized_ =
false;
196 for (
auto& i : metric_plugins_)
// The library name is copied into a local before the (not visible) per-plugin
// teardown step and then used in the log message.
200 std::string name = i->getLibName();
202 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << name <<
" shutdown.";
206 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
// Emptying the container drops every remaining plugin pointer.
210 metric_plugins_.clear();
// sendMetric overload that stores a string value (it writes
// cached->StringValue). Only the tail of the signature is visible; braces and
// the guarding if/else lines are missing from this listing.
215 int level,
MetricMode mode, std::string
const& metricPrefix,
216 bool useNameOverride)
// Logged when the manager has not been initialized (condition line not visible).
220 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when the manager is stopped (condition line not visible).
224 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is done with metric_cache_mutex_ held.
229 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
231 last_metric_received_ = std::chrono::steady_clock::now();
// A null entry means no data has been cached for this name yet.
232 auto& cached = metric_cache_[name];
233 if (cached ==
nullptr)
235 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: a new value is accepted only while the entry holds fewer
// than metric_cache_max_size_ points.
239 auto size = cached->DataPointCount;
240 if (size < metric_cache_max_size_)
// Trace message once the entry reaches the notify threshold.
242 if (size >= metric_cache_notify_size_)
244 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Two alternative updates follow; the condition choosing between them is not
// visible. Either the stored string is replaced and the count reset to 1 ...
249 cached->StringValue = value;
250 cached->DataPointCount = 1;
// ... or the value is appended after a single space and the count incremented.
254 cached->StringValue +=
" " + value;
255 cached->DataPointCount++;
// Entry already full: the value is dropped and counted as a missed call.
260 TLOG(10) <<
"Rejecting metric because queue full";
261 missed_metric_calls_++;
// Wake any thread waiting on metric_cv_.
265 metric_cv_.notify_all();
// One of the sendMetric overloads that records its value with AddPoint(); the
// value type is not visible in this fragment. Only the tail of the signature
// is shown; braces and the guarding if/else lines are missing.
270 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Logged when the manager has not been initialized (condition line not visible).
274 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when the manager is stopped (condition line not visible).
278 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is done with metric_cache_mutex_ held.
283 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
285 last_metric_received_ = std::chrono::steady_clock::now();
// A null entry means no data has been cached for this name yet.
286 auto& cached = metric_cache_[name];
287 if (cached ==
nullptr)
289 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: a new point is accepted only while the entry holds fewer
// than metric_cache_max_size_ points.
293 auto size = cached->DataPointCount;
294 if (size < metric_cache_max_size_)
// Trace message once the entry reaches the notify threshold.
296 if (size >= metric_cache_notify_size_)
298 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
301 cached->AddPoint(value);
// Entry already full: the value is dropped and counted as a missed call.
305 TLOG(10) <<
"Rejecting metric because queue full";
306 missed_metric_calls_++;
// Wake any thread waiting on metric_cv_.
310 metric_cv_.notify_all();
// One of the sendMetric overloads that records its value with AddPoint(); the
// value type is not visible in this fragment. Only the tail of the signature
// is shown; braces and the guarding if/else lines are missing.
315 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Logged when the manager has not been initialized (condition line not visible).
319 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when the manager is stopped (condition line not visible).
323 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is done with metric_cache_mutex_ held.
328 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
330 last_metric_received_ = std::chrono::steady_clock::now();
// A null entry means no data has been cached for this name yet.
331 auto& cached = metric_cache_[name];
332 if (cached ==
nullptr)
334 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: a new point is accepted only while the entry holds fewer
// than metric_cache_max_size_ points.
338 auto size = cached->DataPointCount;
339 if (size < metric_cache_max_size_)
// Trace message once the entry reaches the notify threshold.
341 if (size >= metric_cache_notify_size_)
343 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
346 cached->AddPoint(value);
// Entry already full: the value is dropped and counted as a missed call.
350 TLOG(10) <<
"Rejecting metric because queue full";
351 missed_metric_calls_++;
// Wake any thread waiting on metric_cv_.
355 metric_cv_.notify_all();
// One of the sendMetric overloads that records its value with AddPoint(); the
// value type is not visible in this fragment. Only the tail of the signature
// is shown; braces and the guarding if/else lines are missing.
360 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Logged when the manager has not been initialized (condition line not visible).
364 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when the manager is stopped (condition line not visible).
368 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is done with metric_cache_mutex_ held.
373 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
375 last_metric_received_ = std::chrono::steady_clock::now();
// A null entry means no data has been cached for this name yet.
376 auto& cached = metric_cache_[name];
377 if (cached ==
nullptr)
379 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: a new point is accepted only while the entry holds fewer
// than metric_cache_max_size_ points.
383 auto size = cached->DataPointCount;
384 if (size < metric_cache_max_size_)
// Trace message once the entry reaches the notify threshold.
386 if (size >= metric_cache_notify_size_)
388 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
391 cached->AddPoint(value);
// Entry already full: the value is dropped and counted as a missed call.
395 TLOG(10) <<
"Rejecting metric because queue full";
396 missed_metric_calls_++;
// Wake any thread waiting on metric_cv_.
400 metric_cv_.notify_all();
// One of the sendMetric overloads that records its value with AddPoint(); the
// value type is not visible in this fragment. Only the tail of the signature
// is shown; braces and the guarding if/else lines are missing.
405 int level,
MetricMode mode, std::string
const& metricPrefix,
406 bool useNameOverride)
// Logged when the manager has not been initialized (condition line not visible).
410 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when the manager is stopped (condition line not visible).
414 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is done with metric_cache_mutex_ held.
419 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
421 last_metric_received_ = std::chrono::steady_clock::now();
// A null entry means no data has been cached for this name yet.
422 auto& cached = metric_cache_[name];
423 if (cached ==
nullptr)
425 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: a new point is accepted only while the entry holds fewer
// than metric_cache_max_size_ points.
429 auto size = cached->DataPointCount;
430 if (size < metric_cache_max_size_)
// Trace message once the entry reaches the notify threshold.
432 if (size >= metric_cache_notify_size_)
434 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
437 cached->AddPoint(value);
// Entry already full: the value is dropped and counted as a missed call.
441 TLOG(10) <<
"Rejecting metric because queue full";
442 missed_metric_calls_++;
// Wake any thread waiting on metric_cv_.
446 metric_cv_.notify_all();
// Launches the thread that runs sendMetricLoop_ and gives it the name
// "MetricSend". Braces, the try line and the declaration of `tname` are not
// visible in this listing.
450 void artdaq::MetricManager::startMetricLoop_()
// A still-joinable previous thread is joined before a new one is created.
452 if (metric_sending_thread_.joinable())
454 metric_sending_thread_.join();
// Requested stack size: 4096 * 2000 = 8192000 (set_stack_size takes a byte count).
456 boost::thread::attributes attrs;
457 attrs.set_stack_size(4096 * 2000);
458 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
461 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
// The name is written with room left for a terminator, and the last byte of
// the buffer is then set to '\0' explicitly.
464 snprintf(tname,
sizeof(tname)-1,
"%s",
"MetricSend");
465 tname[
sizeof(tname)-1] =
'\0';
466 auto handle = metric_sending_thread_.native_handle();
467 pthread_setname_np(handle, tname);
// A boost::exception is reported both through TLOG and on std::cerr, together
// with the current errno; no rethrow is visible in this listing.
469 catch (
const boost::exception& e)
471 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
472 <<
", errno=" << errno;
473 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
474 <<
", errno=" << errno << std::endl;
477 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
// NOTE(review): presumably the body of metricQueueEmpty() (signature and
// return statements are not visible). With metric_cache_mutex_ held, it scans
// the cache for any entry that has at least one data point.
482 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
483 for (
auto& cache_entry : metric_cache_)
485 if (cache_entry.second->DataPointCount > 0)
// NOTE(review): presumably the body of metricManagerBusy() (signature not
// visible). Returns true when the metric queue is non-empty, busy_ is set, or
// pluginsBusy is true.
496 bool pluginsBusy =
false;
// Each plugin is asked whether it has metrics pending; the statement executed
// on a positive answer is not visible (presumably it sets pluginsBusy).
498 for (
auto& p : metric_plugins_)
500 if (p->metricsPending())
// metricQueueEmpty() is evaluated once for the trace message and again for
// the return value.
507 TLOG(TLVL_TRACE) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
508 return !metricQueueEmpty() || busy_ || pluginsBusy;
// NOTE(review): presumably the body of metricQueueSize() (signature, the
// declaration of `size` and the branch conditions are not visible). Two paths
// are shown: a sum of DataPointCount over every cache entry, and a lookup of
// a single named entry.
513 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
517 for (
auto& q : metric_cache_)
519 size += q.second->DataPointCount;
// The name is checked with count() before indexing, so an unknown name is
// never passed to operator[].
524 if (metric_cache_.count(name) != 0u)
526 size = metric_cache_[name]->DataPointCount;
// Body of the metric-sending thread (startMetricLoop_ binds this function to
// the thread it creates). It waits for cached metrics, copies them out of the
// cache, hands them to every plugin whose level is enabled, and asks the
// plugins to send. Braces, the outer loop header and the try/catch lines are
// not visible in this listing.
533 void artdaq::MetricManager::sendMetricLoop_()
535 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
// Default-constructed time_point, so the first interval check below compares
// against the clock's epoch.
536 auto last_send_time = std::chrono::steady_clock::time_point();
539 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Entering Metric input wait loop";
// Idle wait: repeats while the cache is empty and running_ is set.
540 while (metricQueueEmpty() && running_)
// Wakes on a metric_cv_ notification or after at most 100 ms.
542 std::unique_lock<std::mutex> lk(metric_mutex_);
543 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
544 auto now = std::chrono::steady_clock::now();
// If more than metric_send_interval_ms_ has passed since the last send, the
// plugins are told to send even though no new metric arrived.
545 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
546 metric_send_interval_ms_)
548 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
// Hold-off: if a metric was received less than metric_holdoff_us_ ago, sleep
// for metric_holdoff_us_ first.
// NOTE(review): this `lk` reuses the name of the metric_mutex_ lock above, and
// the braces are not visible, so it cannot be confirmed here whether
// metric_cache_mutex_ is still held during the usleep.
550 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
551 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
554 usleep(metric_holdoff_us_);
557 for (
auto& metric : metric_plugins_)
561 metric->sendMetrics();
564 last_send_time = now;
// Same hold-off check again, this time against the current time.
568 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
569 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
572 usleep(metric_holdoff_us_);
576 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: After Metric input wait loop";
// processing_start is later passed to each plugin's sendMetrics().
578 auto processing_start = std::chrono::steady_clock::now();
579 auto temp_list = std::list<std::unique_ptr<MetricData>>();
// With metric_cache_mutex_ held, every non-null cache entry that has data is
// copied into temp_list. Any reset of the cache entry is not visible here.
581 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
583 for (
auto& q : metric_cache_)
585 if (q.second !=
nullptr && q.second->DataPointCount > 0)
587 temp_list.emplace_back(
new MetricData(*q.second));
// Both counters are read and zeroed in one atomic exchange; the arguments of
// the two emplace_back calls are not visible in this listing.
593 auto calls = metric_calls_.exchange(0);
594 temp_list.emplace_back(
597 auto missed = missed_metric_calls_.exchange(0);
598 temp_list.emplace_back(
601 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
// If a system metric collector exists, its metrics are moved onto the list.
604 if (system_metric_collector_ !=
nullptr)
606 TLOG(TLVL_TRACE) <<
"Collecting System metrics (CPU, RAM, Network)";
607 auto systemMetrics = system_metric_collector_->SendMetrics();
608 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
611 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Before processing temp_list";
// Drain the list front to back.
612 while (!temp_list.empty())
614 auto data_ = std::move(temp_list.front());
615 temp_list.pop_front();
// Unless UseNameOverride is set, the name becomes prefix_.MetricPrefix.Name
// when a metric prefix is present, otherwise prefix_.Name.
620 if (!data_->UseNameOverride)
622 if (!data_->MetricPrefix.empty())
624 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
628 data_->Name = prefix_ +
"." + data_->Name;
// The metric goes only to plugins that have its level enabled.
632 for (
auto& metric : metric_plugins_)
638 if (metric->IsLevelEnabled(data_->Level))
642 metric->addMetricData(data_);
643 last_send_time = std::chrono::steady_clock::now();
647 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
648 << metric->getLibName();
654 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Before sending metrics";
655 for (
auto& metric : metric_plugins_)
661 metric->sendMetrics(
false, processing_start);
665 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: End of working loop";
// NOTE(review): presumably a final flush after the working loop has ended (the
// loop boundary is not visible). It repeats the copy-and-dispatch steps above
// but does not collect system metrics.
671 auto temp_list = std::list<std::unique_ptr<MetricData>>();
673 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
675 for (
auto& q : metric_cache_)
677 if (q.second !=
nullptr && q.second->DataPointCount > 0)
679 temp_list.emplace_back(
new MetricData(*q.second));
686 auto calls = metric_calls_.exchange(0);
687 temp_list.emplace_back(
690 auto missed = missed_metric_calls_.exchange(0);
691 temp_list.emplace_back(
694 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
697 while (!temp_list.empty())
699 auto data_ = std::move(temp_list.front());
700 temp_list.pop_front();
705 if (!data_->UseNameOverride)
707 if (!data_->MetricPrefix.empty())
709 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
713 data_->Name = prefix_ +
"." + data_->Name;
717 for (
auto& metric : metric_plugins_)
723 if (metric->IsLevelEnabled(data_->Level))
727 metric->addMetricData(data_);
728 last_send_time = std::chrono::steady_clock::now();
732 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
733 << metric->getLibName();
// Every plugin is stopped. The error text names do_stop() even though these
// lines follow the sendMetricLoop_ code in this listing.
739 for (
auto& metric : metric_plugins_)
747 metric->stopMetrics();
748 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
752 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
753 << metric->getLibName();
757 TLOG(TLVL_DEBUG) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level greater than or equal to level.
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
Report the sum of all values, divided by the length of the time interval they were accumulated over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting interval.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.