9 #define TRACE_NAME "MetricManager"
10 #include "artdaq-utilities/Plugins/MetricManager.hh"
11 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
12 #include "fhiclcpp/ParameterSet.h"
15 #include <boost/exception/all.hpp>
21 , system_metric_collector_(nullptr)
26 , missed_metric_calls_(0)
38 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
40 std::vector<std::string> names = pset.get_names();
42 metric_plugins_.clear();
43 bool send_system_metrics =
false;
44 bool send_process_metrics =
false;
46 for (
const auto& name : names)
48 if (name ==
"metric_queue_size")
50 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
52 else if (name ==
"metric_queue_notify_size")
54 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
56 else if (name ==
"metric_cache_size")
58 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
60 else if (name ==
"metric_cache_notify_size")
62 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
64 else if (name ==
"metric_send_maximum_delay_ms")
66 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
67 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
69 else if (name ==
"metric_holdoff_us")
71 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
72 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
74 else if (name ==
"send_system_metrics")
76 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
78 else if (name ==
"send_process_metrics")
80 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
86 TLOG(TLVL_DEBUG) <<
"Constructing metric plugin with name " << name;
87 auto plugin_pset = pset.get<fhicl::ParameterSet>(name);
88 metric_plugins_.push_back(
89 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_));
91 catch (
const cet::exception& e)
93 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
94 <<
", cet::exception object caught:" << e.explain_self();
96 catch (
const boost::exception& e)
98 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
99 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
101 catch (
const std::exception& e)
103 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
104 <<
", std::exception caught: " << e.what();
108 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
114 if (send_system_metrics || send_process_metrics)
116 system_metric_collector_ = std::make_unique<SystemMetricCollector>(send_process_metrics, send_system_metrics);
124 std::lock_guard<std::mutex> lk(metric_mutex_);
127 TLOG(TLVL_DEBUG) <<
"Starting MetricManager";
128 for (
auto& metric : metric_plugins_)
136 metric->startMetrics();
137 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
142 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
143 << metric->getLibName();
153 std::unique_lock<std::mutex> lk(metric_mutex_);
154 TLOG(TLVL_DEBUG) <<
"Stopping Metrics";
156 metric_cv_.notify_all();
157 TLOG(TLVL_DEBUG) <<
"Joining Metric-Sending thread";
161 if (metric_sending_thread_.joinable())
163 metric_sending_thread_.join();
170 TLOG(TLVL_DEBUG) <<
"do_stop Complete";
183 initialize(pset, prefix);
188 TLOG(TLVL_DEBUG) <<
"MetricManager is shutting down...";
191 std::lock_guard<std::mutex> lk(metric_mutex_);
194 initialized_ =
false;
195 for (
auto& i : metric_plugins_)
199 std::string name = i->getLibName();
201 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << name <<
" shutdown.";
205 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
209 metric_plugins_.clear();
214 int level,
MetricMode mode, std::string
const& metricPrefix,
215 bool useNameOverride)
219 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
223 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
228 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
230 last_metric_received_ = std::chrono::steady_clock::now();
231 auto& cached = metric_cache_[name];
232 if (cached ==
nullptr)
234 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
238 auto size = cached->DataPointCount;
239 if (size < metric_cache_max_size_)
241 if (size >= metric_cache_notify_size_)
243 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
248 cached->StringValue = value;
249 cached->DataPointCount = 1;
253 cached->StringValue +=
" " + value;
254 cached->DataPointCount++;
259 TLOG(10) <<
"Rejecting metric because queue full";
260 missed_metric_calls_++;
264 metric_cv_.notify_all();
269 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
273 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
277 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
282 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
284 last_metric_received_ = std::chrono::steady_clock::now();
285 auto& cached = metric_cache_[name];
286 if (cached ==
nullptr)
288 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
292 auto size = cached->DataPointCount;
293 if (size < metric_cache_max_size_)
295 if (size >= metric_cache_notify_size_)
297 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
300 cached->AddPoint(value);
304 TLOG(10) <<
"Rejecting metric because queue full";
305 missed_metric_calls_++;
309 metric_cv_.notify_all();
314 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
318 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
322 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
327 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
329 last_metric_received_ = std::chrono::steady_clock::now();
330 auto& cached = metric_cache_[name];
331 if (cached ==
nullptr)
333 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
337 auto size = cached->DataPointCount;
338 if (size < metric_cache_max_size_)
340 if (size >= metric_cache_notify_size_)
342 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
345 cached->AddPoint(value);
349 TLOG(10) <<
"Rejecting metric because queue full";
350 missed_metric_calls_++;
354 metric_cv_.notify_all();
359 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
363 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
367 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
372 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
374 last_metric_received_ = std::chrono::steady_clock::now();
375 auto& cached = metric_cache_[name];
376 if (cached ==
nullptr)
378 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
382 auto size = cached->DataPointCount;
383 if (size < metric_cache_max_size_)
385 if (size >= metric_cache_notify_size_)
387 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
390 cached->AddPoint(value);
394 TLOG(10) <<
"Rejecting metric because queue full";
395 missed_metric_calls_++;
399 metric_cv_.notify_all();
404 int level,
MetricMode mode, std::string
const& metricPrefix,
405 bool useNameOverride)
409 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
413 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
418 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
420 last_metric_received_ = std::chrono::steady_clock::now();
421 auto& cached = metric_cache_[name];
422 if (cached ==
nullptr)
424 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
428 auto size = cached->DataPointCount;
429 if (size < metric_cache_max_size_)
431 if (size >= metric_cache_notify_size_)
433 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
436 cached->AddPoint(value);
440 TLOG(10) <<
"Rejecting metric because queue full";
441 missed_metric_calls_++;
445 metric_cv_.notify_all();
449 void artdaq::MetricManager::startMetricLoop_()
451 if (metric_sending_thread_.joinable())
453 metric_sending_thread_.join();
455 boost::thread::attributes attrs;
456 attrs.set_stack_size(4096 * 2000);
457 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
460 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
462 catch (
const boost::exception& e)
464 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
465 <<
", errno=" << errno;
466 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
467 <<
", errno=" << errno << std::endl;
470 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
475 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
476 for (
auto& cache_entry : metric_cache_)
478 if (cache_entry.second->DataPointCount > 0)
489 bool pluginsBusy =
false;
491 for (
auto& p : metric_plugins_)
493 if (p->metricsPending())
500 TLOG(TLVL_TRACE) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
501 return !metricQueueEmpty() || busy_ || pluginsBusy;
506 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
510 for (
auto& q : metric_cache_)
512 size += q.second->DataPointCount;
517 if (metric_cache_.count(name) != 0u)
519 size = metric_cache_[name]->DataPointCount;
526 void artdaq::MetricManager::sendMetricLoop_()
528 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
529 auto last_send_time = std::chrono::steady_clock::time_point();
532 TLOG(6) <<
"sendMetricLoop_: Entering Metric input wait loop";
533 while (metricQueueEmpty() && running_)
535 std::unique_lock<std::mutex> lk(metric_mutex_);
536 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
537 auto now = std::chrono::steady_clock::now();
538 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
539 metric_send_interval_ms_)
541 TLOG(6) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
542 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
544 usleep(metric_holdoff_us_);
546 for (
auto& metric : metric_plugins_)
550 metric->sendMetrics();
553 last_send_time = now;
556 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
558 usleep(metric_holdoff_us_);
561 TLOG(6) <<
"sendMetricLoop_: After Metric input wait loop";
563 auto processing_start = std::chrono::steady_clock::now();
564 auto temp_list = std::list<std::unique_ptr<MetricData>>();
566 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
568 for (
auto& q : metric_cache_)
570 if (q.second !=
nullptr && q.second->DataPointCount > 0)
572 temp_list.emplace_back(
new MetricData(*q.second));
578 auto calls = metric_calls_.exchange(0);
579 temp_list.emplace_back(
582 auto missed = missed_metric_calls_.exchange(0);
583 temp_list.emplace_back(
586 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
589 if (system_metric_collector_ !=
nullptr)
591 TLOG(TLVL_TRACE) <<
"Collecting System metrics (CPU, RAM, Network)";
592 auto systemMetrics = system_metric_collector_->SendMetrics();
593 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
596 TLOG(6) <<
"sendMetricLoop_: Before processing temp_list";
597 while (!temp_list.empty())
599 auto data_ = std::move(temp_list.front());
600 temp_list.pop_front();
605 if (!data_->UseNameOverride)
607 if (!data_->MetricPrefix.empty())
609 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
613 data_->Name = prefix_ +
"." + data_->Name;
617 for (
auto& metric : metric_plugins_)
623 if (metric->IsLevelEnabled(data_->Level))
627 metric->addMetricData(data_);
628 last_send_time = std::chrono::steady_clock::now();
632 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
633 << metric->getLibName();
639 TLOG(6) <<
"sendMetricLoop_: Before sending metrics";
640 for (
auto& metric : metric_plugins_)
646 metric->sendMetrics(
false, processing_start);
650 TLOG(6) <<
"sendMetricLoop_: End of working loop";
656 auto temp_list = std::list<std::unique_ptr<MetricData>>();
658 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
660 for (
auto& q : metric_cache_)
662 if (q.second !=
nullptr && q.second->DataPointCount > 0)
664 temp_list.emplace_back(
new MetricData(*q.second));
671 auto calls = metric_calls_.exchange(0);
672 temp_list.emplace_back(
675 auto missed = missed_metric_calls_.exchange(0);
676 temp_list.emplace_back(
679 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
682 while (!temp_list.empty())
684 auto data_ = std::move(temp_list.front());
685 temp_list.pop_front();
690 if (!data_->UseNameOverride)
692 if (!data_->MetricPrefix.empty())
694 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
698 data_->Name = prefix_ +
"." + data_->Name;
702 for (
auto& metric : metric_plugins_)
708 if (metric->IsLevelEnabled(data_->Level))
712 metric->addMetricData(data_);
713 last_send_time = std::chrono::steady_clock::now();
717 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
718 << metric->getLibName();
724 for (
auto& metric : metric_plugins_)
732 metric->stopMetrics();
733 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
737 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
738 << metric->getLibName();
742 TLOG(TLVL_DEBUG) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins whose threshold level is greater than or equal to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue.
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting interval.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty.
void do_pause()
Pause metric sending. Currently a No-Op.