9 #include "TRACE/tracemf.h"
10 #define TRACE_NAME "MetricManager"
12 #include "artdaq-utilities/Plugins/MetricManager.hh"
13 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
14 #include "fhiclcpp/ParameterSet.h"
17 #include <boost/exception/all.hpp>
23 , system_metric_collector_(nullptr)
28 , missed_metric_calls_(0)
31 TLOG(TLVL_INFO) <<
"MetricManager CONSTRUCTOR";
43 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
45 std::vector<std::string> names = pset.get_names();
47 metric_plugins_.clear();
48 bool send_system_metrics =
false;
49 bool send_process_metrics =
false;
51 for (
const auto& name : names)
53 if (name ==
"metric_queue_size")
55 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
57 else if (name ==
"metric_queue_notify_size")
59 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
61 else if (name ==
"metric_cache_size")
63 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
65 else if (name ==
"metric_cache_notify_size")
67 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
69 else if (name ==
"metric_send_maximum_delay_ms")
71 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
72 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
74 else if (name ==
"metric_holdoff_us")
76 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
77 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
79 else if (name ==
"send_system_metrics")
81 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
83 else if (name ==
"send_process_metrics")
85 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
91 TLOG(TLVL_DEBUG + 32) <<
"Constructing metric plugin with name " << name;
92 auto plugin_pset = pset.get<fhicl::ParameterSet>(name);
93 metric_plugins_.push_back(
94 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_, name));
96 catch (
const cet::exception& e)
98 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
99 <<
", cet::exception object caught:" << e.explain_self();
101 catch (
const boost::exception& e)
103 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
104 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
106 catch (
const std::exception& e)
108 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
109 <<
", std::exception caught: " << e.what();
113 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
119 if (send_system_metrics || send_process_metrics)
121 system_metric_collector_ = std::make_unique<SystemMetricCollector>(send_process_metrics, send_system_metrics);
129 std::lock_guard<std::mutex> lk(metric_mutex_);
132 TLOG(TLVL_DEBUG + 32) <<
"Starting MetricManager";
133 for (
auto& metric : metric_plugins_)
141 metric->startMetrics();
142 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
147 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
148 << metric->getLibName();
158 std::unique_lock<std::mutex> lk(metric_mutex_);
159 TLOG(TLVL_DEBUG + 32) <<
"Stopping Metrics";
161 metric_cv_.notify_all();
162 TLOG(TLVL_DEBUG + 32) <<
"Joining Metric-Sending thread";
166 if (metric_sending_thread_.joinable())
168 metric_sending_thread_.join();
175 TLOG(TLVL_DEBUG + 32) <<
"do_stop Complete";
188 initialize(pset, prefix);
193 TRACE_STREAMER(TLVL_DEBUG + 32, TLOG2(
"MetricManager", 0), 0) <<
"MetricManager is shutting down...";
196 std::lock_guard<std::mutex> lk(metric_mutex_);
199 TRACE_STREAMER(TLVL_DEBUG + 32, TLOG2(
"MetricManager", 0), 0) <<
"MetricManager is initialized shutting down...";
200 initialized_ =
false;
201 for (
auto& i : metric_plugins_)
205 std::string name = i->getLibName();
207 TRACE_STREAMER(TLVL_DEBUG + 32, TLOG2(
"MetricManager", 0), 0) <<
"Metric Plugin " << name <<
" shutdown.";
211 TRACE_STREAMER(TLVL_ERROR, TLOG2(
"MetricManager", 0), 0) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
215 metric_plugins_.clear();
220 int level,
MetricMode mode, std::string
const& metricPrefix,
221 bool useNameOverride)
225 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
227 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
228 last_failure_ = std::chrono::steady_clock::now();
233 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
235 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
236 last_failure_ = std::chrono::steady_clock::now();
242 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
244 last_metric_received_ = std::chrono::steady_clock::now();
245 auto& cached = metric_cache_[name];
246 if (cached ==
nullptr)
248 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
252 auto size = cached->DataPointCount;
253 if (size < metric_cache_max_size_)
255 if (size >= metric_cache_notify_size_)
257 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
262 cached->StringValue = value;
263 cached->DataPointCount = 1;
267 cached->StringValue +=
" " + value;
268 cached->DataPointCount++;
273 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
274 missed_metric_calls_++;
278 metric_cv_.notify_all();
283 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
287 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
289 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
290 last_failure_ = std::chrono::steady_clock::now();
295 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
297 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
298 last_failure_ = std::chrono::steady_clock::now();
304 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
306 last_metric_received_ = std::chrono::steady_clock::now();
307 auto& cached = metric_cache_[name];
308 if (cached ==
nullptr)
310 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
314 auto size = cached->DataPointCount;
315 if (size < metric_cache_max_size_)
317 if (size >= metric_cache_notify_size_)
319 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
322 cached->AddPoint(value);
326 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
327 missed_metric_calls_++;
331 metric_cv_.notify_all();
336 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
340 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
342 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
343 last_failure_ = std::chrono::steady_clock::now();
348 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
350 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
351 last_failure_ = std::chrono::steady_clock::now();
357 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
359 last_metric_received_ = std::chrono::steady_clock::now();
360 auto& cached = metric_cache_[name];
361 if (cached ==
nullptr)
363 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
367 auto size = cached->DataPointCount;
368 if (size < metric_cache_max_size_)
370 if (size >= metric_cache_notify_size_)
372 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
375 cached->AddPoint(value);
379 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
380 missed_metric_calls_++;
384 metric_cv_.notify_all();
389 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
393 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
395 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
396 last_failure_ = std::chrono::steady_clock::now();
401 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
403 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
404 last_failure_ = std::chrono::steady_clock::now();
410 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
412 last_metric_received_ = std::chrono::steady_clock::now();
413 auto& cached = metric_cache_[name];
414 if (cached ==
nullptr)
416 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
420 auto size = cached->DataPointCount;
421 if (size < metric_cache_max_size_)
423 if (size >= metric_cache_notify_size_)
425 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
428 cached->AddPoint(value);
432 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
433 missed_metric_calls_++;
437 metric_cv_.notify_all();
442 int level,
MetricMode mode, std::string
const& metricPrefix,
443 bool useNameOverride)
447 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
449 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
450 last_failure_ = std::chrono::steady_clock::now();
455 if (std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - last_failure_).count() > 1000)
457 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
458 last_failure_ = std::chrono::steady_clock::now();
464 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
466 last_metric_received_ = std::chrono::steady_clock::now();
467 auto& cached = metric_cache_[name];
468 if (cached ==
nullptr)
470 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
474 auto size = cached->DataPointCount;
475 if (size < metric_cache_max_size_)
477 if (size >= metric_cache_notify_size_)
479 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
482 cached->AddPoint(value);
486 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
487 missed_metric_calls_++;
491 metric_cv_.notify_all();
// Launches the thread that runs sendMetricLoop_, joining any previous
// sending thread first.
// NOTE(review): this listing omits brace-only lines and a few statements
// (see the gaps in the embedded line numbers), so the exact block structure
// is not visible here.
495 void artdaq::MetricManager::startMetricLoop_()
// A still-joinable thread from an earlier start is joined before it is replaced.
497 if (metric_sending_thread_.joinable())
499 metric_sending_thread_.join();
// Request a stack of 4096 * 2000 bytes for the new thread.
501 boost::thread::attributes attrs;
502 attrs.set_stack_size(4096 * 2000);
503 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
// Start sendMetricLoop_ bound to this instance, using the attributes above.
// Presumably inside a try block whose opening is not shown -- TODO confirm.
506 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
// Write the name "MetricSend" into tname and force the last byte to NUL.
// NOTE(review): the declaration of tname is not visible in this listing.
509 snprintf(tname,
sizeof(tname) - 1,
"%s",
"MetricSend");
510 tname[
sizeof(tname) - 1] =
'\0';
// Apply the name to the native handle of the new thread.
511 auto handle = metric_sending_thread_.native_handle();
512 pthread_setname_np(handle, tname);
// A boost::exception is reported, with diagnostic information and the
// current errno, to both the TRACE log and std::cerr.
// NOTE(review): anything done after the logging (original lines 520-521)
// is not visible here.
514 catch (
const boost::exception& e)
516 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
517 <<
", errno=" << errno;
518 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
519 <<
", errno=" << errno << std::endl;
522 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
527 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
528 for (
auto& cache_entry : metric_cache_)
530 if (cache_entry.second->DataPointCount > 0)
541 bool pluginsBusy =
false;
543 for (
auto& p : metric_plugins_)
545 if (p->metricsPending())
552 TLOG(TLVL_DEBUG + 33) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
553 return !metricQueueEmpty() || busy_ || pluginsBusy;
558 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
562 for (
auto& q : metric_cache_)
564 size += q.second->DataPointCount;
569 if (metric_cache_.count(name) != 0u)
571 size = metric_cache_[name]->DataPointCount;
// Body of the metric-sending thread. It waits for cached metric data, copies
// the pending entries into a temporary list, hands each entry to every plugin
// whose level is enabled, and then calls sendMetrics on each plugin. A second
// collection pass and a stopMetrics call on every plugin appear at the end.
// NOTE(review): this listing omits brace-only lines and some statements
// (see the gaps in the embedded line numbers), so loop boundaries and lock
// scopes are not fully visible; comments below describe only the visible lines.
578 void artdaq::MetricManager::sendMetricLoop_()
580 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
// Default-constructed time_point: no send has happened yet.
581 auto last_send_time = std::chrono::steady_clock::time_point();
584 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Entering Metric input wait loop";
// Idle wait: while the cache holds no data points and running_ is set,
// wait on metric_cv_ for at most 100 ms per pass.
585 while (metricQueueEmpty() && running_)
587 std::unique_lock<std::mutex> lk(metric_mutex_);
588 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
589 auto now = std::chrono::steady_clock::now();
// If more than metric_send_interval_ms_ milliseconds have passed since
// last_send_time, call sendMetrics() on the plugins even though no new data arrived.
590 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
591 metric_send_interval_ms_)
593 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
// Holdoff check: compares the time since last_metric_received_ (in
// microseconds) with metric_holdoff_us_.
// NOTE(review): the lines between this check and usleep are not shown, so
// it is unclear whether metric_cache_mutex_ is still held during the sleep.
595 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
596 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
599 usleep(metric_holdoff_us_);
602 for (
auto& metric : metric_plugins_)
606 metric->sendMetrics();
609 last_send_time = now;
// Same holdoff check again, this time against the current time rather than `now`.
613 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
614 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
617 usleep(metric_holdoff_us_);
621 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: After Metric input wait loop";
// processing_start is later passed to each plugin's sendMetrics call.
623 auto processing_start = std::chrono::steady_clock::now();
624 auto temp_list = std::list<std::unique_ptr<MetricData>>();
// Under metric_cache_mutex_, copy every non-null cache entry that has at
// least one data point into temp_list.
// NOTE(review): what happens to the cache entry after the copy is on
// lines not shown in this listing.
626 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
628 for (
auto& q : metric_cache_)
630 if (q.second !=
nullptr && q.second->DataPointCount > 0)
632 temp_list.emplace_back(
new MetricData(*q.second));
// Read and zero the two call counters in one step via exchange(0).
// NOTE(review): the arguments of the emplace_back calls that follow each
// counter read are missing from this listing.
638 auto calls = metric_calls_.exchange(0);
639 temp_list.emplace_back(
642 auto missed = missed_metric_calls_.exchange(0);
643 temp_list.emplace_back(
646 TLOG(TLVL_DEBUG + 33) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
// When a system metric collector exists, move its metrics onto temp_list too.
649 if (system_metric_collector_ !=
nullptr)
651 TLOG(TLVL_DEBUG + 33) <<
"Collecting System metrics (CPU, RAM, Network)";
652 auto systemMetrics = system_metric_collector_->SendMetrics();
653 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
656 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Before processing temp_list";
// Drain temp_list from the front, one entry at a time.
657 while (!temp_list.empty())
659 auto data_ = std::move(temp_list.front());
660 temp_list.pop_front();
// Unless UseNameOverride is set, qualify the name with prefix_, inserting
// MetricPrefix between them when it is non-empty.
665 if (!data_->UseNameOverride)
667 if (!data_->MetricPrefix.empty())
669 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
673 data_->Name = prefix_ +
"." + data_->Name;
// Offer the entry to every plugin; only plugins for which
// IsLevelEnabled(Level) is true receive it.
677 for (
auto& metric : metric_plugins_)
683 if (metric->IsLevelEnabled(data_->Level))
687 metric->addMetricData(data_);
688 last_send_time = std::chrono::steady_clock::now();
// Error report naming the plugin; the handler that encloses it is not shown.
692 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
693 << metric->getLibName();
699 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Before sending metrics";
// Ask each plugin to send, passing false and the processing_start timestamp.
700 for (
auto& metric : metric_plugins_)
706 metric->sendMetrics(
false, processing_start);
710 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: End of working loop";
// Second collection pass: the same copy, counter reset, naming and plugin
// dispatch steps as above, but with no system-metric collection.
// Presumably runs once after the working loop ends -- the loop boundary
// is not visible in this listing; TODO confirm.
716 auto temp_list = std::list<std::unique_ptr<MetricData>>();
718 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
720 for (
auto& q : metric_cache_)
722 if (q.second !=
nullptr && q.second->DataPointCount > 0)
724 temp_list.emplace_back(
new MetricData(*q.second));
731 auto calls = metric_calls_.exchange(0);
732 temp_list.emplace_back(
735 auto missed = missed_metric_calls_.exchange(0);
736 temp_list.emplace_back(
739 TLOG(TLVL_DEBUG + 33) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
742 while (!temp_list.empty())
744 auto data_ = std::move(temp_list.front());
745 temp_list.pop_front();
750 if (!data_->UseNameOverride)
752 if (!data_->MetricPrefix.empty())
754 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
758 data_->Name = prefix_ +
"." + data_->Name;
762 for (
auto& metric : metric_plugins_)
768 if (metric->IsLevelEnabled(data_->Level))
772 metric->addMetricData(data_);
773 last_send_time = std::chrono::steady_clock::now();
777 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
778 << metric->getLibName();
// Finally call stopMetrics() on every plugin and log the outcome for each.
784 for (
auto& metric : metric_plugins_)
792 metric->stopMetrics();
793 TLOG(TLVL_DEBUG + 32) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
// Error report naming the plugin; the handler that encloses it is not shown.
797 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
798 << metric->getLibName();
802 TLOG(TLVL_DEBUG + 32) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins whose threshold level is greater than or equal to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
Report the sum of all values divided by the length of the time interval they were accumulated over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting interval.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name, std::string const &metric_name)
Load a given MetricPlugin and return a pointer to it.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.