9 #define TRACE_NAME "MetricManager"
10 #include "artdaq-utilities/Plugins/MetricManager.hh"
11 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
12 #include "fhiclcpp/ParameterSet.h"
15 #include <boost/exception/all.hpp>
21 , system_metric_collector_(nullptr)
26 , missed_metric_calls_(0)
38 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
40 std::vector<std::string> names = pset.get_names();
42 metric_plugins_.clear();
43 bool send_system_metrics =
false;
44 bool send_process_metrics =
false;
46 for (
const auto& name : names)
48 if (name ==
"metric_queue_size")
50 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
52 else if (name ==
"metric_queue_notify_size")
54 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
56 else if (name ==
"metric_cache_size")
58 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
60 else if (name ==
"metric_cache_notify_size")
62 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
64 else if (name ==
"metric_send_maximum_delay_ms")
66 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
67 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
69 else if (name ==
"metric_holdoff_us")
71 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
72 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
74 else if (name ==
"send_system_metrics")
76 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
78 else if (name ==
"send_process_metrics")
80 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
86 TLOG(TLVL_DEBUG) <<
"Constructing metric plugin with name " << name;
87 auto plugin_pset = pset.get<fhicl::ParameterSet>(name);
88 metric_plugins_.push_back(
89 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_));
91 catch (
const cet::exception& e)
93 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
94 <<
", cet::exception object caught:" << e.explain_self();
96 catch (
const boost::exception& e)
98 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
99 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
101 catch (
const std::exception& e)
103 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
104 <<
", std::exception caught: " << e.what();
108 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
114 if (send_system_metrics || send_process_metrics)
116 system_metric_collector_ = std::make_unique<SystemMetricCollector>(send_process_metrics, send_system_metrics);
124 std::lock_guard<std::mutex> lk(metric_mutex_);
127 TLOG(TLVL_DEBUG) <<
"Starting MetricManager";
128 for (
auto& metric : metric_plugins_)
136 metric->startMetrics();
137 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
142 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
143 << metric->getLibName();
153 std::unique_lock<std::mutex> lk(metric_mutex_);
154 TLOG(TLVL_DEBUG) <<
"Stopping Metrics";
156 metric_cv_.notify_all();
157 TLOG(TLVL_DEBUG) <<
"Joining Metric-Sending thread";
161 if (metric_sending_thread_.joinable())
163 metric_sending_thread_.join();
170 TLOG(TLVL_DEBUG) <<
"do_stop Complete";
183 initialize(pset, prefix);
188 TLOG(TLVL_DEBUG) <<
"MetricManager is shutting down...";
191 std::lock_guard<std::mutex> lk(metric_mutex_);
194 initialized_ =
false;
195 for (
auto& i : metric_plugins_)
199 std::string name = i->getLibName();
201 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << name <<
" shutdown.";
205 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
209 metric_plugins_.clear();
214 int level,
MetricMode mode, std::string
const& metricPrefix,
215 bool useNameOverride)
219 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
223 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
228 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
230 last_metric_received_ = std::chrono::steady_clock::now();
231 auto& cached = metric_cache_[name];
232 if (cached ==
nullptr)
234 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
238 auto size = cached->DataPointCount;
239 if (size < metric_cache_max_size_)
241 if (size >= metric_cache_notify_size_)
243 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
248 cached->StringValue = value;
249 cached->DataPointCount = 1;
253 cached->StringValue +=
" " + value;
254 cached->DataPointCount++;
259 TLOG(10) <<
"Rejecting metric because queue full";
260 missed_metric_calls_++;
264 metric_cv_.notify_all();
269 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
273 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
277 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
282 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
284 last_metric_received_ = std::chrono::steady_clock::now();
285 auto& cached = metric_cache_[name];
286 if (cached ==
nullptr)
288 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
292 auto size = cached->DataPointCount;
293 if (size < metric_cache_max_size_)
295 if (size >= metric_cache_notify_size_)
297 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
300 cached->AddPoint(value);
304 TLOG(10) <<
"Rejecting metric because queue full";
305 missed_metric_calls_++;
309 metric_cv_.notify_all();
314 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
318 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
322 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
327 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
329 last_metric_received_ = std::chrono::steady_clock::now();
330 auto& cached = metric_cache_[name];
331 if (cached ==
nullptr)
333 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
337 auto size = cached->DataPointCount;
338 if (size < metric_cache_max_size_)
340 if (size >= metric_cache_notify_size_)
342 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
345 cached->AddPoint(value);
349 TLOG(10) <<
"Rejecting metric because queue full";
350 missed_metric_calls_++;
354 metric_cv_.notify_all();
359 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
363 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
367 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
372 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
374 last_metric_received_ = std::chrono::steady_clock::now();
375 auto& cached = metric_cache_[name];
376 if (cached ==
nullptr)
378 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
382 auto size = cached->DataPointCount;
383 if (size < metric_cache_max_size_)
385 if (size >= metric_cache_notify_size_)
387 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
390 cached->AddPoint(value);
394 TLOG(10) <<
"Rejecting metric because queue full";
395 missed_metric_calls_++;
399 metric_cv_.notify_all();
404 int level,
MetricMode mode, std::string
const& metricPrefix,
405 bool useNameOverride)
409 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
413 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
418 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
420 last_metric_received_ = std::chrono::steady_clock::now();
421 auto& cached = metric_cache_[name];
422 if (cached ==
nullptr)
424 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
428 auto size = cached->DataPointCount;
429 if (size < metric_cache_max_size_)
431 if (size >= metric_cache_notify_size_)
433 TLOG(9) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
436 cached->AddPoint(value);
440 TLOG(10) <<
"Rejecting metric because queue full";
441 missed_metric_calls_++;
445 metric_cv_.notify_all();
// Launches the background thread that runs MetricManager::sendMetricLoop_.
// NOTE(review): this listing omits several original lines (braces, the `try`
// that pairs with the `catch` below, and whatever follows the logging in the
// handler), so the comments here describe only the statements that are visible.
449 void artdaq::MetricManager::startMetricLoop_()
// If a previous sending thread is still joinable, wait for it to finish
// before the thread handle is reassigned below.
451 if (metric_sending_thread_.joinable())
453 metric_sending_thread_.join();
// Request an explicit stack size for the new thread: 4096 * 2000 = 8,192,000.
455 boost::thread::attributes attrs;
456 attrs.set_stack_size(4096 * 2000);
457 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
// Create the thread with the attributes above; it executes sendMetricLoop_ on
// this MetricManager instance.
460 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
// Thread-creation failure reported as a boost::exception: the diagnostic text
// and the current errno are written both to the TRACE log and to std::cerr.
// NOTE(review): whether the handler then rethrows or exits is not visible here.
462 catch (
const boost::exception& e)
464 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
465 <<
", errno=" << errno;
466 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
467 <<
", errno=" << errno << std::endl;
470 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
475 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
476 for (
auto& cache_entry : metric_cache_)
478 if (cache_entry.second->DataPointCount > 0)
489 bool pluginsBusy =
false;
491 for (
auto& p : metric_plugins_)
493 if (p->metricsPending())
500 TLOG(TLVL_TRACE) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
501 return !metricQueueEmpty() || busy_ || pluginsBusy;
506 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
510 for (
auto& q : metric_cache_)
512 size += q.second->DataPointCount;
517 if (metric_cache_.count(name) != 0u)
519 size = metric_cache_[name]->DataPointCount;
// Body of the metric-sending thread (it is the function bound to the thread in
// startMetricLoop_). The visible statements wait for cached metric data,
// snapshot the cache into a temporary list, hand each entry to every plugin
// whose level is enabled, and ask the plugins to send. A second, similar pass
// and a stopMetrics() call on every plugin appear at the end.
// NOTE(review): this listing omits many original lines (braces, outer loop
// header, `try`/`catch` keywords, emplace_back arguments), so nesting is
// inferred only where a comment says "presumably".
526 void artdaq::MetricManager::sendMetricLoop_()
528 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
// Default-constructed time_point: the first interval check below compares
// against the clock's epoch rather than a real previous send.
529 auto last_send_time = std::chrono::steady_clock::time_point();
532 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Entering Metric input wait loop";
// Wait loop: spins while the cache holds no data points and running_ is set.
533 while (metricQueueEmpty() && running_)
// Sleep on the condition variable for at most 100 ms per iteration.
535 std::unique_lock<std::mutex> lk(metric_mutex_);
536 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
537 auto now = std::chrono::steady_clock::now();
// Even with nothing queued, trigger a plugin send once more than
// metric_send_interval_ms_ milliseconds have passed since last_send_time.
538 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
539 metric_send_interval_ms_)
541 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
// A second lock named `lk`, this time on the cache mutex (presumably in a
// nested scope, since it reuses the name).
// Hold-off: if the most recent metric arrived less than metric_holdoff_us_
// microseconds ago, sleep for metric_holdoff_us_.
// NOTE(review): presumably the usleep is inside this `if` and therefore runs
// while metric_cache_mutex_ is held — confirm against the full file.
543 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
544 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
547 usleep(metric_holdoff_us_);
// Ask every plugin to send (no-argument form), then record this send time.
550 for (
auto& metric : metric_plugins_)
554 metric->sendMetrics();
557 last_send_time = now;
// Same hold-off check again, measured from the current time instead of `now`.
561 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
562 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
565 usleep(metric_holdoff_us_);
569 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: After Metric input wait loop";
// processing_start is passed to the plugins' sendMetrics call further down.
571 auto processing_start = std::chrono::steady_clock::now();
// Working list of metric entries to hand to the plugins in this pass.
572 auto temp_list = std::list<std::unique_ptr<MetricData>>();
// Under the cache mutex, copy every non-null cache entry that has at least one
// data point into temp_list.
574 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
576 for (
auto& q : metric_cache_)
578 if (q.second !=
nullptr && q.second->DataPointCount > 0)
580 temp_list.emplace_back(
new MetricData(*q.second));
// Read-and-zero the call counter, then append one more entry to temp_list
// (the emplace_back arguments are not visible in this listing).
586 auto calls = metric_calls_.exchange(0);
587 temp_list.emplace_back(
// Same for the missed-call counter.
590 auto missed = missed_metric_calls_.exchange(0);
591 temp_list.emplace_back(
594 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
// When a system metric collector is configured, move its metrics onto
// temp_list as well.
597 if (system_metric_collector_ !=
nullptr)
599 TLOG(TLVL_TRACE) <<
"Collecting System metrics (CPU, RAM, Network)";
600 auto systemMetrics = system_metric_collector_->SendMetrics();
601 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
604 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Before processing temp_list";
// Drain temp_list front to back.
605 while (!temp_list.empty())
607 auto data_ = std::move(temp_list.front());
608 temp_list.pop_front();
// Unless the entry asks for a name override, rewrite its name as
// prefix_.MetricPrefix.Name, or prefix_.Name (presumably when MetricPrefix is
// empty; the `else` line is not visible).
613 if (!data_->UseNameOverride)
615 if (!data_->MetricPrefix.empty())
617 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
621 data_->Name = prefix_ +
"." + data_->Name;
// Offer the entry to each plugin; only plugins with this level enabled
// receive it. last_send_time is refreshed after each addMetricData call.
625 for (
auto& metric : metric_plugins_)
631 if (metric->IsLevelEnabled(data_->Level))
635 metric->addMetricData(data_);
636 last_send_time = std::chrono::steady_clock::now();
// Error log naming the plugin (presumably inside a catch handler; the
// `try`/`catch` lines are not visible).
640 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
641 << metric->getLibName();
647 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: Before sending metrics";
// Tell every plugin to send, passing `false` and the processing_start time.
648 for (
auto& metric : metric_plugins_)
654 metric->sendMetrics(
false, processing_start);
658 TLOG(TLVL_DEBUG + 3) <<
"sendMetricLoop_: End of working loop";
// Second pass: temp_list is declared again here, so this is a different scope
// from the one above (presumably after the main loop has ended — the loop
// header and closing brace are not visible). It repeats the cache snapshot,
// counter exchange, renaming and addMetricData steps; no system-metrics
// collection is visible in this pass.
664 auto temp_list = std::list<std::unique_ptr<MetricData>>();
666 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
668 for (
auto& q : metric_cache_)
670 if (q.second !=
nullptr && q.second->DataPointCount > 0)
672 temp_list.emplace_back(
new MetricData(*q.second));
679 auto calls = metric_calls_.exchange(0);
680 temp_list.emplace_back(
683 auto missed = missed_metric_calls_.exchange(0);
684 temp_list.emplace_back(
687 TLOG(TLVL_TRACE) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
690 while (!temp_list.empty())
692 auto data_ = std::move(temp_list.front());
693 temp_list.pop_front();
698 if (!data_->UseNameOverride)
700 if (!data_->MetricPrefix.empty())
702 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
706 data_->Name = prefix_ +
"." + data_->Name;
710 for (
auto& metric : metric_plugins_)
716 if (metric->IsLevelEnabled(data_->Level))
720 metric->addMetricData(data_);
721 last_send_time = std::chrono::steady_clock::now();
725 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
726 << metric->getLibName();
// Finally call stopMetrics() on every plugin, logging each one. The error
// message below says "do_stop()" although this code is in sendMetricLoop_.
732 for (
auto& metric : metric_plugins_)
740 metric->stopMetrics();
741 TLOG(TLVL_DEBUG) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
745 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
746 << metric->getLibName();
750 TLOG(TLVL_DEBUG) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level greater than or equal to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name)
Load a given MetricPlugin and return a pointer to it.
over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting int...
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.