10 #define TRACE_NAME "MetricManager"
12 #include "artdaq-utilities/Plugins/MetricManager.hh"
13 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
14 #include "fhiclcpp/ParameterSet.h"
17 #include <boost/exception/all.hpp>
// Fragment of a member-initializer list (the enclosing constructor header and the
// other initializers are not visible in this excerpt).
// The system metric collector starts out null; further down in this file it is only
// created when system or process metrics are requested in the configuration.
23 , system_metric_collector_(nullptr)
// Counter that the sendMetric overloads increment when a metric is rejected because its cache is full.
28 , missed_metric_calls_(0)
// Body of the configuration routine. The error messages below identify it as
// MetricManager::initialize; its signature and braces are not visible in this excerpt.
// It walks every top-level key of `pset`: recognised keys tune the manager itself, and
// the remaining code constructs a metric plugin from the nested ParameterSet stored
// under the key (presumably the final `else` of the chain — that line is not visible).
40 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
42 std::vector<std::string> names = pset.get_names();
// Drop any previously configured plugins before rebuilding the list.
44 metric_plugins_.clear();
// Both collection flags default to off unless the configuration sets them.
45 bool send_system_metrics =
false;
46 bool send_process_metrics =
false;
48 for (
const auto& name : names)
// "metric_queue_size" and "metric_cache_size" both set metric_cache_max_size_.
50 if (name ==
"metric_queue_size")
52 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
// "metric_queue_notify_size" and "metric_cache_notify_size" both set metric_cache_notify_size_.
54 else if (name ==
"metric_queue_notify_size")
56 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
58 else if (name ==
"metric_cache_size")
60 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
62 else if (name ==
"metric_cache_notify_size")
64 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
// The configuration key is "metric_send_maximum_delay_ms" but the member it sets is metric_send_interval_ms_.
66 else if (name ==
"metric_send_maximum_delay_ms")
68 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
69 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
// Hold-off time in microseconds; sendMetricLoop_ compares it against the time since the last metric arrived.
71 else if (name ==
"metric_holdoff_us")
73 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
74 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
76 else if (name ==
"send_system_metrics")
78 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
80 else if (name ==
"send_process_metrics")
82 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
// Plugin construction: the key name doubles as the metric name, and the plugin type comes
// from "metricPluginType" inside the nested ParameterSet (empty string if absent).
88 TLOG(TLVL_DEBUG) <<
"Constructing metric plugin with name " << name;
89 auto plugin_pset = pset.get<fhicl::ParameterSet>(name);
90 metric_plugins_.push_back(
91 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_, name));
// Each handler below only logs the failure; none of the visible handlers rethrows.
93 catch (
const cet::exception& e)
95 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
96 <<
", cet::exception object caught:" << e.explain_self();
98 catch (
const boost::exception& e)
100 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
101 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
103 catch (
const std::exception& e)
105 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
106 <<
", std::exception caught: " << e.what();
// Presumably inside a catch-all handler; the catch line and the tail of this statement are not visible here.
110 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
// Create the system metric collector only when at least one of the two flags was enabled.
// Note the argument order: process flag first, then system flag.
116 if (send_system_metrics || send_process_metrics)
118 system_metric_collector_ = std::make_unique<SystemMetricCollector>(send_process_metrics, send_system_metrics);
// Body of the start-up routine. The error message below names it MetricManager::do_start();
// its signature, braces and any guarding condition are not visible in this excerpt.
// metric_mutex_ is held for the lifetime of `lk`.
126 std::lock_guard<std::mutex> lk(metric_mutex_);
129 TLOG(TLVL_DEBUG) <<
"Starting MetricManager";
// Start every configured plugin and log each one by its library name.
130 for (
auto& metric : metric_plugins_)
138 metric->startMetrics();
139 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
// Error report for a plugin that failed to start (the catch clause itself is not visible here).
144 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
145 << metric->getLibName();
// Body of the stop routine (the final log message names it do_stop; signature and braces are
// not visible in this excerpt, and original lines between the numbered ones are missing).
155 std::unique_lock<std::mutex> lk(metric_mutex_);
156 TLOG(TLVL_DEBUG) <<
"Stopping Metrics";
// Wake any thread waiting on metric_cv_; sendMetricLoop_ waits on it with a 100 ms timeout.
158 metric_cv_.notify_all();
159 TLOG(TLVL_DEBUG) <<
"Joining Metric-Sending thread";
// Join only if the sending thread is joinable.
163 if (metric_sending_thread_.joinable())
165 metric_sending_thread_.join();
172 TLOG(TLVL_DEBUG) <<
"do_stop Complete";
// Delegates to initialize() with the same parameter set and prefix.
// NOTE(review): presumably part of reinitialize(); the enclosing function header and any
// preceding statements are not visible in this excerpt — confirm.
185 initialize(pset, prefix);
// Body of the shutdown routine (the error message below names it MetricManager::shutdown();
// signature, braces and some intermediate lines are not visible in this excerpt).
190 TLOG(TLVL_DEBUG) <<
"MetricManager is shutting down...";
// metric_mutex_ is held for the lifetime of `lk`.
193 std::lock_guard<std::mutex> lk(metric_mutex_);
// Mark the manager uninitialized; the sendMetric overloads log a warning about that state.
196 initialized_ =
false;
197 for (
auto& i : metric_plugins_)
// Capture the plugin's library name for the log message that follows.
201 std::string name = i->getLibName();
203 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << name <<
" shutdown.";
// Error report for a plugin that failed to shut down (catch clause and statement tail not visible here).
207 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
// Finally drop every plugin pointer from the list.
211 metric_plugins_.clear();
// Tail of the parameter list and body of a sendMetric overload. It stores the value in
// StringValue, so this is the string-valued overload; the start of the signature, the braces
// and several conditions are not visible in this excerpt.
216 int level,
MetricMode mode, std::string
const& metricPrefix,
217 bool useNameOverride)
// Logged when a metric is submitted before the manager has been initialized.
221 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when a metric is submitted while the manager is stopped.
225 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Everything that touches metric_cache_ below runs under metric_cache_mutex_.
230 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Record the arrival time; sendMetricLoop_ uses it for the hold-off check.
232 last_metric_received_ = std::chrono::steady_clock::now();
// operator[] default-inserts a null pointer on first use of `name`, which the check below detects.
233 auto& cached = metric_cache_[name];
234 if (cached ==
nullptr)
// NOTE(review): `cached` already refers to this slot, so the second lookup by name is redundant.
236 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: accept the value only while the per-metric point count is below the maximum.
240 auto size = cached->DataPointCount;
241 if (size < metric_cache_max_size_)
// Trace-level notice once the entry has reached the notify threshold.
243 if (size >= metric_cache_notify_size_)
245 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Two update forms follow: overwrite the stored string and reset the count to 1, or append
// the value after a space and increment the count. The condition selecting between them is
// not visible in this excerpt.
250 cached->StringValue = value;
251 cached->DataPointCount = 1;
255 cached->StringValue +=
" " + value;
256 cached->DataPointCount++;
// Cache full for this metric: drop the value and count the miss.
261 TLOG(10) <<
"Rejecting metric because queue full";
262 missed_metric_calls_++;
// Wake any waiter on metric_cv_ (sendMetricLoop_ waits on it).
266 metric_cv_.notify_all();
// Tail of the parameter list and body of a sendMetric overload that records values through
// MetricData::AddPoint. The value type, the start of the signature and the braces are not
// visible in this excerpt.
271 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Logged when a metric is submitted before the manager has been initialized.
275 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when a metric is submitted while the manager is stopped.
279 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Everything that touches metric_cache_ below runs under metric_cache_mutex_.
284 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Record the arrival time; sendMetricLoop_ uses it for the hold-off check.
286 last_metric_received_ = std::chrono::steady_clock::now();
// operator[] default-inserts a null pointer on first use of `name`, which the check below detects.
287 auto& cached = metric_cache_[name];
288 if (cached ==
nullptr)
// NOTE(review): `cached` already refers to this slot, so the second lookup by name is redundant.
290 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: accept the value only while the per-metric point count is below the maximum.
294 auto size = cached->DataPointCount;
295 if (size < metric_cache_max_size_)
// Trace-level notice once the entry has reached the notify threshold.
297 if (size >= metric_cache_notify_size_)
299 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Fold the new value into the cached entry.
302 cached->AddPoint(value);
// Cache full for this metric: drop the value and count the miss.
306 TLOG(10) <<
"Rejecting metric because queue full";
307 missed_metric_calls_++;
// Wake any waiter on metric_cv_ (sendMetricLoop_ waits on it).
311 metric_cv_.notify_all();
// Tail of the parameter list and body of another sendMetric overload that records values
// through MetricData::AddPoint. The value type, the start of the signature and the braces
// are not visible in this excerpt.
316 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Logged when a metric is submitted before the manager has been initialized.
320 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when a metric is submitted while the manager is stopped.
324 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Everything that touches metric_cache_ below runs under metric_cache_mutex_.
329 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Record the arrival time; sendMetricLoop_ uses it for the hold-off check.
331 last_metric_received_ = std::chrono::steady_clock::now();
// operator[] default-inserts a null pointer on first use of `name`, which the check below detects.
332 auto& cached = metric_cache_[name];
333 if (cached ==
nullptr)
// NOTE(review): `cached` already refers to this slot, so the second lookup by name is redundant.
335 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: accept the value only while the per-metric point count is below the maximum.
339 auto size = cached->DataPointCount;
340 if (size < metric_cache_max_size_)
// Trace-level notice once the entry has reached the notify threshold.
342 if (size >= metric_cache_notify_size_)
344 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Fold the new value into the cached entry.
347 cached->AddPoint(value);
// Cache full for this metric: drop the value and count the miss.
351 TLOG(10) <<
"Rejecting metric because queue full";
352 missed_metric_calls_++;
// Wake any waiter on metric_cv_ (sendMetricLoop_ waits on it).
356 metric_cv_.notify_all();
// Tail of the parameter list and body of another sendMetric overload that records values
// through MetricData::AddPoint. The value type, the start of the signature and the braces
// are not visible in this excerpt.
361 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Logged when a metric is submitted before the manager has been initialized.
365 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when a metric is submitted while the manager is stopped.
369 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Everything that touches metric_cache_ below runs under metric_cache_mutex_.
374 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Record the arrival time; sendMetricLoop_ uses it for the hold-off check.
376 last_metric_received_ = std::chrono::steady_clock::now();
// operator[] default-inserts a null pointer on first use of `name`, which the check below detects.
377 auto& cached = metric_cache_[name];
378 if (cached ==
nullptr)
// NOTE(review): `cached` already refers to this slot, so the second lookup by name is redundant.
380 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: accept the value only while the per-metric point count is below the maximum.
384 auto size = cached->DataPointCount;
385 if (size < metric_cache_max_size_)
// Trace-level notice once the entry has reached the notify threshold.
387 if (size >= metric_cache_notify_size_)
389 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Fold the new value into the cached entry.
392 cached->AddPoint(value);
// Cache full for this metric: drop the value and count the miss.
396 TLOG(10) <<
"Rejecting metric because queue full";
397 missed_metric_calls_++;
// Wake any waiter on metric_cv_ (sendMetricLoop_ waits on it).
401 metric_cv_.notify_all();
// Tail of the parameter list and body of another sendMetric overload that records values
// through MetricData::AddPoint. The value type, the start of the signature and the braces
// are not visible in this excerpt.
406 int level,
MetricMode mode, std::string
const& metricPrefix,
407 bool useNameOverride)
// Logged when a metric is submitted before the manager has been initialized.
411 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
// Logged when a metric is submitted while the manager is stopped.
415 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Everything that touches metric_cache_ below runs under metric_cache_mutex_.
420 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Record the arrival time; sendMetricLoop_ uses it for the hold-off check.
422 last_metric_received_ = std::chrono::steady_clock::now();
// operator[] default-inserts a null pointer on first use of `name`, which the check below detects.
423 auto& cached = metric_cache_[name];
424 if (cached ==
nullptr)
// NOTE(review): `cached` already refers to this slot, so the second lookup by name is redundant.
426 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: accept the value only while the per-metric point count is below the maximum.
430 auto size = cached->DataPointCount;
431 if (size < metric_cache_max_size_)
// Trace-level notice once the entry has reached the notify threshold.
433 if (size >= metric_cache_notify_size_)
435 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Fold the new value into the cached entry.
438 cached->AddPoint(value);
// Cache full for this metric: drop the value and count the miss.
442 TLOG(10) <<
"Rejecting metric because queue full";
443 missed_metric_calls_++;
// Wake any waiter on metric_cv_ (sendMetricLoop_ waits on it).
447 metric_cv_.notify_all();
// Launches the metric-sending thread, which runs sendMetricLoop_ on this object.
// Braces, the `try` line and the declaration of `tname` are not visible in this excerpt.
451 void artdaq::MetricManager::startMetricLoop_()
// If a previous sending thread is still joinable, join it before starting a new one.
453 if (metric_sending_thread_.joinable())
455 metric_sending_thread_.join();
// Request a stack of 4096 * 2000 = 8,192,000 bytes for the new thread.
457 boost::thread::attributes attrs;
458 attrs.set_stack_size(4096 * 2000);
459 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
462 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
// Name the thread "MetricSend". The copy is bounded to sizeof(tname) - 1 and the last
// byte is then forced to NUL, so the buffer is always terminated.
465 snprintf(tname,
sizeof(tname) - 1,
"%s",
"MetricSend");
466 tname[
sizeof(tname) - 1] =
'\0';
467 auto handle = metric_sending_thread_.native_handle();
468 pthread_setname_np(handle, tname);
// A boost::exception during start-up is reported both to TRACE and to stderr, with errno.
470 catch (
const boost::exception& e)
472 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
473 <<
", errno=" << errno;
474 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
475 <<
", errno=" << errno << std::endl;
478 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
// Fragment: under metric_cache_mutex_, scans the metric cache for any entry that holds at
// least one data point.
// NOTE(review): presumably the body of metricQueueEmpty(); the function header, braces and
// return statements are not visible in this excerpt — confirm.
483 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
484 for (
auto& cache_entry : metric_cache_)
486 if (cache_entry.second->DataPointCount > 0)
// Body fragment that decides whether the manager is still busy (function header and braces
// are not visible in this excerpt).
497 bool pluginsBusy =
false;
// Ask each plugin whether it has metrics pending. The statement executed when one does is
// not visible here; presumably it sets pluginsBusy — confirm.
499 for (
auto& p : metric_plugins_)
501 if (p->metricsPending())
508 TLOG(TLVL_TRACE) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
// Busy if any metric is still cached, the busy_ flag is set, or a plugin reported pending metrics.
509 return !metricQueueEmpty() || busy_ || pluginsBusy;
// Body fragment that computes a queue size from DataPointCount values, under
// metric_cache_mutex_. Function header, braces, the declaration of `size` and the condition
// choosing between the two paths below are not visible in this excerpt.
514 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Path 1: total number of cached data points across every metric.
518 for (
auto& q : metric_cache_)
520 size += q.second->DataPointCount;
// Path 2: data points cached for one named metric, if it exists in the cache.
// NOTE(review): count() followed by operator[] looks the key up twice; a single find() would do.
525 if (metric_cache_.count(name) != 0u)
527 size = metric_cache_[name]->DataPointCount;
// Worker routine that startMetricLoop_ binds into metric_sending_thread_.
// Visible flow: wait for cached metrics, snapshot them, hand each one to every plugin whose
// level admits it, then ask the plugins to send. A second drain pass and the stopping of the
// plugins follow. Braces, the outer loop header, `try`/`catch` lines and the arguments of some
// statements are not visible in this excerpt.
534 void artdaq::MetricManager::sendMetricLoop_()
536 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
// Default-constructed time_point, i.e. the clock's epoch.
537 auto last_send_time = std::chrono::steady_clock::time_point();
540 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Entering Metric input wait loop";
// Idle loop: while nothing is cached and the manager is running, wait on metric_cv_ for up to 100 ms.
541 while (metricQueueEmpty() && running_)
543 std::unique_lock<std::mutex> lk(metric_mutex_);
544 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
545 auto now = std::chrono::steady_clock::now();
// If more than metric_send_interval_ms_ has passed since the last send, have the plugins send anyway.
546 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
547 metric_send_interval_ms_)
549 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
// NOTE(review): this declares a second variable named `lk` while the one on metric_mutex_ is
// still in scope; presumably it lives in a nested brace scope (braces not visible) and shadows it.
551 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
// Hold-off: a metric arrived less than metric_holdoff_us_ microseconds ago.
552 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
// Sleep for the hold-off period (presumably the body of the check above; intermediate lines not visible).
555 usleep(metric_holdoff_us_);
558 for (
auto& metric : metric_plugins_)
562 metric->sendMetrics();
565 last_send_time = now;
// Same hold-off check again, this time measured from the current time.
569 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
570 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
573 usleep(metric_holdoff_us_);
577 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: After Metric input wait loop";
// processing_start is passed to the plugins' sendMetrics call at the end of this pass.
579 auto processing_start = std::chrono::steady_clock::now();
580 auto temp_list = std::list<std::unique_ptr<MetricData>>();
// Snapshot under metric_cache_mutex_: copy every non-null cache entry that holds data points.
582 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
584 for (
auto& q : metric_cache_)
586 if (q.second !=
nullptr && q.second->DataPointCount > 0)
// NOTE(review): raw `new` inside emplace_back; std::make_unique<MetricData>(*q.second) would
// avoid a leak if the list's node allocation throws.
588 temp_list.emplace_back(
new MetricData(*q.second));
// Read and reset the call counters atomically. The arguments of the two emplace_back calls
// below are truncated in this excerpt.
594 auto calls = metric_calls_.exchange(0);
595 temp_list.emplace_back(
598 auto missed = missed_metric_calls_.exchange(0);
599 temp_list.emplace_back(
602 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
// If a system metric collector was configured, append the metrics it produces.
605 if (system_metric_collector_ !=
nullptr)
607 TLOG(TLVL_TRACE) <<
"Collecting System metrics (CPU, RAM, Network)";
608 auto systemMetrics = system_metric_collector_->SendMetrics();
609 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
612 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Before processing temp_list";
// Drain the snapshot from the front, one metric at a time.
613 while (!temp_list.empty())
615 auto data_ = std::move(temp_list.front());
616 temp_list.pop_front();
// Unless the metric overrides naming, qualify its name as prefix_[.MetricPrefix].Name.
621 if (!data_->UseNameOverride)
623 if (!data_->MetricPrefix.empty())
625 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
629 data_->Name = prefix_ +
"." + data_->Name;
// Offer the metric to every plugin; only plugins with this level enabled receive it.
633 for (
auto& metric : metric_plugins_)
639 if (metric->IsLevelEnabled(data_->Level))
643 metric->addMetricData(data_);
644 last_send_time = std::chrono::steady_clock::now();
// Error report for a plugin that failed to accept the metric (catch clause not visible here).
648 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
649 << metric->getLibName();
655 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Before sending metrics";
// Ask every plugin to send, passing `false` and the time this processing pass began.
656 for (
auto& metric : metric_plugins_)
662 metric->sendMetrics(
false, processing_start);
666 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: End of working loop";
// Second drain pass with the same shape as the one above, minus the system metrics.
// NOTE(review): presumably runs once after the working loop exits, to flush what is still
// cached before the plugins are stopped — confirm. The logic is duplicated from the pass
// above and could be shared.
672 auto temp_list = std::list<std::unique_ptr<MetricData>>();
674 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
676 for (
auto& q : metric_cache_)
678 if (q.second !=
nullptr && q.second->DataPointCount > 0)
680 temp_list.emplace_back(
new MetricData(*q.second));
// Counters are read and reset again; the emplace_back arguments are truncated in this excerpt.
687 auto calls = metric_calls_.exchange(0);
688 temp_list.emplace_back(
691 auto missed = missed_metric_calls_.exchange(0);
692 temp_list.emplace_back(
695 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
698 while (!temp_list.empty())
700 auto data_ = std::move(temp_list.front());
701 temp_list.pop_front();
// Same name qualification as in the first pass.
706 if (!data_->UseNameOverride)
708 if (!data_->MetricPrefix.empty())
710 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
714 data_->Name = prefix_ +
"." + data_->Name;
718 for (
auto& metric : metric_plugins_)
724 if (metric->IsLevelEnabled(data_->Level))
728 metric->addMetricData(data_);
729 last_send_time = std::chrono::steady_clock::now();
733 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
734 << metric->getLibName();
// Stop every plugin and log each one by its library name.
740 for (
auto& metric : metric_plugins_)
748 metric->stopMetrics();
749 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
// Error report for a plugin that failed to stop. The message text says do_stop() although
// this statement sits inside sendMetricLoop_.
753 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
754 << metric->getLibName();
758 TLOG(TLVL_DEBUG) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level >= to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
Reports the sum of all values, divided by the length of the time interval they were accumulated over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting int...
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name, std::string const &metric_name)
Load a given MetricPlugin and return a pointer to it.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.