9 #include "TRACE/tracemf.h"
10 #define TRACE_NAME "MetricManager"
12 #include "artdaq-utilities/Plugins/MetricManager.hh"
13 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
14 #include "fhiclcpp/ParameterSet.h"
17 #include <boost/exception/all.hpp>
// Fragment of the MetricManager constructor (identified by the log message below).
// The signature and the rest of the member-initializer list are not present in this listing.
// Visible initializers: the system metric collector starts out null and the
// missed-call counter starts at zero.
23 , system_metric_collector_(nullptr)
28 , missed_metric_calls_(0)
31 TLOG(TLVL_INFO) <<
"MetricManager CONSTRUCTOR";
// Fragment of MetricManager::initialize (named in the error messages below); the
// signature, braces and some statements are not present in this listing.
// Walks every top-level key of `pset`: recognised option names update manager
// settings, and the remaining lines build a metric plugin from the sub-table
// stored under the key.
43 TLOG(TLVL_INFO) <<
"Configuring metrics with parameter set: " << pset.to_string();
45 std::vector<std::string> names = pset.get_names();
// Drop any previously configured plugins before loading the new set.
47 metric_plugins_.clear();
// Both collector flags stay off unless the parameter set turns them on.
48 bool send_system_metrics =
false;
49 bool send_process_metrics =
false;
51 for (
const auto& name : names)
// "metric_queue_size" and "metric_cache_size" (further down) both write
// metric_cache_max_size_; likewise the two "*_notify_size" keys both write
// metric_cache_notify_size_.
53 if (name ==
"metric_queue_size")
55 metric_cache_max_size_ = pset.get<
size_t>(
"metric_queue_size");
57 else if (name ==
"metric_queue_notify_size")
59 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_queue_notify_size");
61 else if (name ==
"metric_cache_size")
63 metric_cache_max_size_ = pset.get<
size_t>(
"metric_cache_size");
65 else if (name ==
"metric_cache_notify_size")
67 metric_cache_notify_size_ = pset.get<
size_t>(
"metric_cache_notify_size");
// The "metric_send_maximum_delay_ms" key is stored in metric_send_interval_ms_.
69 else if (name ==
"metric_send_maximum_delay_ms")
71 TLOG(TLVL_INFO) <<
"Setting metric_send_interval_ms_ to " << pset.get<
int>(
"metric_send_maximum_delay_ms");
72 metric_send_interval_ms_ = pset.get<
int>(
"metric_send_maximum_delay_ms");
74 else if (name ==
"metric_holdoff_us")
76 TLOG(TLVL_INFO) <<
"Setting metric_holdoff_us_ to " << pset.get<
int>(
"metric_holdoff_us");
77 metric_holdoff_us_ = pset.get<
int>(
"metric_holdoff_us");
79 else if (name ==
"send_system_metrics")
81 send_system_metrics = pset.get<
bool>(
"send_system_metrics");
83 else if (name ==
"send_process_metrics")
85 send_process_metrics = pset.get<
bool>(
"send_process_metrics");
// NOTE(review): presumably the fall-through branch for keys that are not one of
// the options above -- the branch keyword itself is not visible here.
// The key is read as a nested ParameterSet and handed to makeMetricPlugin along
// with its "metricPluginType" entry (empty string when absent), prefix_ and the key name.
91 TLOG(TLVL_DEBUG + 32) <<
"Constructing metric plugin with name " << name;
92 auto plugin_pset = pset.get<fhicl::ParameterSet>(name);
93 metric_plugins_.push_back(
94 makeMetricPlugin(plugin_pset.get<std::string>(
"metricPluginType",
""), plugin_pset, prefix_, name));
// Each handler below logs the failure at TLVL_ERROR with the plugin key name.
// Whether the exception is rethrown afterwards is not visible in this listing.
96 catch (
const cet::exception& e)
98 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
99 <<
", cet::exception object caught:" << e.explain_self();
101 catch (
const boost::exception& e)
103 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
104 <<
", boost::exception object caught: " << boost::diagnostic_information(e);
106 catch (
const std::exception& e)
108 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::initialize, error loading plugin with name " << name
109 <<
", std::exception caught: " << e.what();
113 TLOG(TLVL_ERROR) <<
"Unknown Exception caught in MetricManager::initialize, error loading plugin with name "
// A SystemMetricCollector is created only when at least one of the two flags is set.
// Note the argument order: the process flag is passed first, then the system flag.
119 if (send_system_metrics || send_process_metrics)
121 system_metric_collector_ = std::make_unique<SystemMetricCollector>(send_process_metrics, send_system_metrics);
// Fragment of MetricManager::do_start (named in the error message below); the
// signature, braces and the try/catch keywords are not present in this listing.
// Holds metric_mutex_ while starting the plugins.
129 std::lock_guard<std::mutex> lk(metric_mutex_);
132 TLOG(TLVL_DEBUG + 32) <<
"Starting MetricManager";
133 for (
auto& metric : metric_plugins_)
// Start the plugin and report it by library name.
141 metric->startMetrics();
142 TLOG(TLVL_INFO) <<
"Metric Plugin " << metric->getLibName() <<
" started.";
// Failure path: logged at TLVL_ERROR with the plugin's library name.
147 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_start(), error starting plugin with name "
148 << metric->getLibName();
// Fragment of MetricManager::do_stop (named in the final log message); the
// signature and several statements are not present in this listing.
// Takes metric_mutex_, wakes every waiter on metric_cv_, then joins the
// metric-sending thread if it is joinable.
// NOTE(review): the statements between these lines (for example whether the
// lock is released before join()) are not visible here -- confirm against the full file.
158 std::unique_lock<std::mutex> lk(metric_mutex_);
159 TLOG(TLVL_DEBUG + 32) <<
"Stopping Metrics";
161 metric_cv_.notify_all();
162 TLOG(TLVL_DEBUG + 32) <<
"Joining Metric-Sending thread";
166 if (metric_sending_thread_.joinable())
168 metric_sending_thread_.join();
175 TLOG(TLVL_DEBUG + 32) <<
"do_stop Complete";
// NOTE(review): isolated statement; presumably the body of reinitialize(pset, prefix),
// whose signature is not present in this listing. Forwards both arguments to initialize().
188 initialize(pset, prefix);
// Fragment of MetricManager::shutdown (named in the error message below); the
// signature, braces and some statements are not present in this listing.
// Logging here goes through TRACE_STREAMER with an explicit "MetricManager" name
// rather than the TLOG macro.
193 TRACE_STREAMER(TLVL_DEBUG + 32, TLOG2(
"MetricManager", 0), 0) <<
"MetricManager is shutting down...";
// Plugin teardown happens while holding metric_mutex_.
196 std::lock_guard<std::mutex> lk(metric_mutex_);
199 TRACE_STREAMER(TLVL_DEBUG + 32, TLOG2(
"MetricManager", 0), 0) <<
"MetricManager is initialized shutting down...";
// Mark the manager uninitialized before touching the plugins.
200 initialized_ =
false;
201 for (
auto& i : metric_plugins_)
// The library name is copied into a local before the per-plugin log line is written.
205 std::string name = i->getLibName();
207 TRACE_STREAMER(TLVL_DEBUG + 32, TLOG2(
"MetricManager", 0), 0) <<
"Metric Plugin " << name <<
" shutdown.";
// Failure path: logged at TLVL_ERROR.
211 TRACE_STREAMER(TLVL_ERROR, TLOG2(
"MetricManager", 0), 0) <<
"Exception caught in MetricManager::shutdown(), error shutting down metric with name "
// Empty the plugin container once every plugin has been processed.
215 metric_plugins_.clear();
// Fragment of the string-valued sendMetric overload: the first signature line is
// not present in this listing, only the tail of the parameter list.
// Caches the value under `name` for the sending thread instead of sending it directly.
220 int level,
MetricMode mode, std::string
const& metricPrefix,
221 bool useNameOverride)
// Guard paths (their conditions are not visible here): a warning is logged when
// the manager is not initialized, an info message when it is stopped.
225 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
229 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// All cache access below happens under metric_cache_mutex_.
234 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
// Record the arrival time; the sending loop compares it against metric_holdoff_us_.
236 last_metric_received_ = std::chrono::steady_clock::now();
// operator[] creates an empty (null) slot the first time a name is seen.
237 auto& cached = metric_cache_[name];
238 if (cached ==
nullptr)
240 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: accept the value only while the entry holds fewer than
// metric_cache_max_size_ data points.
244 auto size = cached->DataPointCount;
245 if (size < metric_cache_max_size_)
// At or above the notify threshold a debug message reports the fill level.
247 if (size >= metric_cache_notify_size_)
249 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
// Two update forms follow: replace the stored string and reset the count to 1,
// or append the value after a space and bump the count.
// NOTE(review): the condition selecting between them is not visible in this listing.
254 cached->StringValue = value;
255 cached->DataPointCount = 1;
259 cached->StringValue +=
" " + value;
260 cached->DataPointCount++;
// Entry is full: the value is dropped and counted as a missed call.
265 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
266 missed_metric_calls_++;
// Wake any thread waiting on metric_cv_.
270 metric_cv_.notify_all();
// Fragment of a sendMetric overload; the first signature line (and therefore the
// type of `value`) is not present in this listing.
// Caches the value under `name` for the sending thread.
275 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Guard paths (conditions not visible): warn when uninitialized, info when stopped.
279 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
283 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is serialized by metric_cache_mutex_.
288 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
290 last_metric_received_ = std::chrono::steady_clock::now();
// First use of a name leaves a null slot, which is filled with a new MetricData.
291 auto& cached = metric_cache_[name];
292 if (cached ==
nullptr)
294 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Otherwise the value is added only while the entry is below metric_cache_max_size_.
298 auto size = cached->DataPointCount;
299 if (size < metric_cache_max_size_)
301 if (size >= metric_cache_notify_size_)
303 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
306 cached->AddPoint(value);
// Full entry: drop the value and count it in missed_metric_calls_.
310 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
311 missed_metric_calls_++;
315 metric_cv_.notify_all();
// Fragment of a sendMetric overload; the first signature line (and therefore the
// type of `value`) is not present in this listing.
// Same caching scheme as the neighbouring overloads: store or accumulate under `name`.
320 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Guard paths (conditions not visible): warn when uninitialized, info when stopped.
324 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
328 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is serialized by metric_cache_mutex_.
333 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
335 last_metric_received_ = std::chrono::steady_clock::now();
// A null slot means this name has no entry yet; create one from the arguments.
336 auto& cached = metric_cache_[name];
337 if (cached ==
nullptr)
339 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: add the point while below metric_cache_max_size_, logging once
// the count reaches metric_cache_notify_size_.
343 auto size = cached->DataPointCount;
344 if (size < metric_cache_max_size_)
346 if (size >= metric_cache_notify_size_)
348 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
351 cached->AddPoint(value);
// Full entry: drop the value and count it in missed_metric_calls_.
355 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
356 missed_metric_calls_++;
360 metric_cv_.notify_all();
// Fragment of a sendMetric overload; the first signature line (and therefore the
// type of `value`) is not present in this listing.
// Same caching scheme as the neighbouring overloads: store or accumulate under `name`.
365 MetricMode mode, std::string
const& metricPrefix,
bool useNameOverride)
// Guard paths (conditions not visible): warn when uninitialized, info when stopped.
369 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
373 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is serialized by metric_cache_mutex_.
378 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
380 last_metric_received_ = std::chrono::steady_clock::now();
// A null slot means this name has no entry yet; create one from the arguments.
381 auto& cached = metric_cache_[name];
382 if (cached ==
nullptr)
384 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: add the point while below metric_cache_max_size_, logging once
// the count reaches metric_cache_notify_size_.
388 auto size = cached->DataPointCount;
389 if (size < metric_cache_max_size_)
391 if (size >= metric_cache_notify_size_)
393 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
396 cached->AddPoint(value);
// Full entry: drop the value and count it in missed_metric_calls_.
400 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
401 missed_metric_calls_++;
405 metric_cv_.notify_all();
// Fragment of a sendMetric overload; the first signature line (and therefore the
// type of `value`) is not present in this listing.
// Same caching scheme as the neighbouring overloads: store or accumulate under `name`.
410 int level,
MetricMode mode, std::string
const& metricPrefix,
411 bool useNameOverride)
// Guard paths (conditions not visible): warn when uninitialized, info when stopped.
415 TLOG(TLVL_WARNING) <<
"Attempted to send metric " << name <<
" when MetricManager has not yet been initialized!";
419 TLOG(TLVL_INFO) <<
"Attempted to send metric when MetricManager stopped!";
// Cache access is serialized by metric_cache_mutex_.
424 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
426 last_metric_received_ = std::chrono::steady_clock::now();
// A null slot means this name has no entry yet; create one from the arguments.
427 auto& cached = metric_cache_[name];
428 if (cached ==
nullptr)
430 metric_cache_[name] = std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride);
// Existing entry: add the point while below metric_cache_max_size_, logging once
// the count reaches metric_cache_notify_size_.
434 auto size = cached->DataPointCount;
435 if (size < metric_cache_max_size_)
437 if (size >= metric_cache_notify_size_)
439 TLOG(TLVL_DEBUG + 35) <<
"Metric cache is at size " << size <<
" of " << metric_cache_max_size_ <<
" for metric " << name
442 cached->AddPoint(value);
// Full entry: drop the value and count it in missed_metric_calls_.
446 TLOG(TLVL_DEBUG + 36) <<
"Rejecting metric because queue full";
447 missed_metric_calls_++;
451 metric_cv_.notify_all();
// Starts (or restarts) the background thread that runs sendMetricLoop_.
// Braces, the try keyword and the declaration of `tname` are not present in this listing.
455 void artdaq::MetricManager::startMetricLoop_()
// If a previous sending thread is still joinable, wait for it before creating a new one.
457 if (metric_sending_thread_.joinable())
459 metric_sending_thread_.join();
// Request a stack of 4096 * 2000 bytes for the new thread.
461 boost::thread::attributes attrs;
462 attrs.set_stack_size(4096 * 2000);
463 TLOG(TLVL_INFO) <<
"Starting Metric Sending Thread";
466 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_,
this));
// Give the thread the name "MetricSend": the name is formatted into `tname`
// (declaration not visible here), explicitly NUL-terminated in its last byte,
// and applied through the thread's native pthread handle.
469 snprintf(tname,
sizeof(tname) - 1,
"%s",
"MetricSend");
470 tname[
sizeof(tname) - 1] =
'\0';
471 auto handle = metric_sending_thread_.native_handle();
472 pthread_setname_np(handle, tname);
// A boost::exception is reported both through TLOG and on std::cerr, including errno.
// NOTE(review): what follows the two reports (rethrow, exit, ...) is not visible here.
474 catch (
const boost::exception& e)
476 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
477 <<
", errno=" << errno;
478 std::cerr <<
"Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e)
479 <<
", errno=" << errno << std::endl;
482 TLOG(TLVL_INFO) <<
"Metric Sending thread started";
// NOTE(review): presumably the body of metricQueueEmpty(); the signature and the
// return statements are not present in this listing.
// Under metric_cache_mutex_, scans the cache for any entry holding at least one data point.
487 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
488 for (
auto& cache_entry : metric_cache_)
490 if (cache_entry.second->DataPointCount > 0)
// Fragment of metricManagerBusy() (signature not present in this listing).
// Reports busy when the metric queue is non-empty, busy_ is set, or any plugin
// reports pending metrics.
501 bool pluginsBusy =
false;
// Ask each plugin whether it has metrics pending (the statement that records a
// positive answer is not visible here).
503 for (
auto& p : metric_plugins_)
505 if (p->metricsPending())
// metricQueueEmpty() is evaluated once for the log line and again for the result.
512 TLOG(TLVL_DEBUG + 33) <<
"Metric queue empty: " << metricQueueEmpty() <<
", busy_: " << busy_ <<
", Plugins busy: " << pluginsBusy;
513 return !metricQueueEmpty() || busy_ || pluginsBusy;
// NOTE(review): presumably the body of metricQueueSize(name); the signature, the
// declaration of `size` and the branch conditions are not present in this listing.
// Under metric_cache_mutex_, two paths are visible: summing DataPointCount over
// every cache entry, and reading the count of the single entry stored under `name`
// when that name exists in the cache.
518 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
522 for (
auto& q : metric_cache_)
524 size += q.second->DataPointCount;
// count() is checked first so the lookup below does not create a new entry.
529 if (metric_cache_.count(name) != 0u)
531 size = metric_cache_[name]->DataPointCount;
// Body of the metric-sending thread (bound to the thread in startMetricLoop_).
// Braces, loop headers, try/catch keywords and several statements are not present
// in this listing, so the comments below describe only the visible lines.
538 void artdaq::MetricManager::sendMetricLoop_()
540 TLOG(TLVL_INFO) <<
"sendMetricLoop_ START";
// Default-constructed time point, i.e. the clock's epoch.
541 auto last_send_time = std::chrono::steady_clock::time_point();
544 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Entering Metric input wait loop";
// Idle phase: while nothing is cached and the manager is running, wait on
// metric_cv_ for at most 100 ms per iteration.
545 while (metricQueueEmpty() && running_)
547 std::unique_lock<std::mutex> lk(metric_mutex_);
548 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
549 auto now = std::chrono::steady_clock::now();
// If more than metric_send_interval_ms_ has passed since the last send, call
// sendMetrics() on every plugin even though no new data arrived.
550 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() >
551 metric_send_interval_ms_)
553 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Metric send interval exceeded: Sending metrics";
// NOTE(review): a second variable named `lk` (on metric_cache_mutex_) is declared
// while the outer `lk` is still alive -- presumably inside a nested block whose
// braces are not visible here.
// If the most recent metric arrived less than metric_holdoff_us_ ago, sleep for
// the hold-off period before sending.
555 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
556 if (std::chrono::duration_cast<std::chrono::microseconds>(now - last_metric_received_).count() < metric_holdoff_us_)
559 usleep(metric_holdoff_us_);
562 for (
auto& metric : metric_plugins_)
566 metric->sendMetrics();
569 last_send_time = now;
// Same hold-off check after the wait loop, this time against the current time.
573 std::unique_lock<std::mutex> lk(metric_cache_mutex_);
574 if (std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - last_metric_received_).count() < metric_holdoff_us_)
577 usleep(metric_holdoff_us_);
581 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: After Metric input wait loop";
// Snapshot phase: under metric_cache_mutex_, copy every non-empty cache entry into
// a local list.
// NOTE(review): how the cached entries are reset after copying is not visible here.
583 auto processing_start = std::chrono::steady_clock::now();
584 auto temp_list = std::list<std::unique_ptr<MetricData>>();
586 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
588 for (
auto& q : metric_cache_)
590 if (q.second !=
nullptr && q.second->DataPointCount > 0)
592 temp_list.emplace_back(
new MetricData(*q.second));
// The call counter and the missed-call counter are read and zeroed in one step
// via exchange(0); each is followed by an emplace_back onto the list whose
// arguments are not visible in this listing.
598 auto calls = metric_calls_.exchange(0);
599 temp_list.emplace_back(
602 auto missed = missed_metric_calls_.exchange(0);
603 temp_list.emplace_back(
606 TLOG(TLVL_DEBUG + 33) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
// When a system metric collector exists, its metrics are moved onto the same list.
609 if (system_metric_collector_ !=
nullptr)
611 TLOG(TLVL_DEBUG + 33) <<
"Collecting System metrics (CPU, RAM, Network)";
612 auto systemMetrics = system_metric_collector_->SendMetrics();
613 for (
auto& m : systemMetrics) { temp_list.emplace_back(std::move(m)); }
// Dispatch phase: take entries off the front of the list one at a time.
616 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Before processing temp_list";
617 while (!temp_list.empty())
619 auto data_ = std::move(temp_list.front());
620 temp_list.pop_front();
// Unless the metric asked for a name override, its name becomes
// "<prefix_>.<MetricPrefix>.<Name>" or, with an empty MetricPrefix, "<prefix_>.<Name>".
625 if (!data_->UseNameOverride)
627 if (!data_->MetricPrefix.empty())
629 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
633 data_->Name = prefix_ +
"." + data_->Name;
// Hand the metric to every plugin that has its level enabled.
637 for (
auto& metric : metric_plugins_)
643 if (metric->IsLevelEnabled(data_->Level))
647 metric->addMetricData(data_);
648 last_send_time = std::chrono::steady_clock::now();
// Failure path: logged at TLVL_ERROR with the plugin's library name.
652 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
653 << metric->getLibName();
// After the list is drained, each plugin's sendMetrics is called with `false`
// and the time this processing pass started.
659 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: Before sending metrics";
660 for (
auto& metric : metric_plugins_)
666 metric->sendMetrics(
false, processing_start);
670 TLOG(TLVL_DEBUG + 34) <<
"sendMetricLoop_: End of working loop";
// NOTE(review): the lines below repeat the snapshot and dispatch steps and are
// followed by stopMetrics() on every plugin -- presumably a final flush once the
// working loop has ended; the loop boundary itself is not visible in this listing.
676 auto temp_list = std::list<std::unique_ptr<MetricData>>();
678 std::lock_guard<std::mutex> lk(metric_cache_mutex_);
680 for (
auto& q : metric_cache_)
682 if (q.second !=
nullptr && q.second->DataPointCount > 0)
684 temp_list.emplace_back(
new MetricData(*q.second));
691 auto calls = metric_calls_.exchange(0);
692 temp_list.emplace_back(
695 auto missed = missed_metric_calls_.exchange(0);
696 temp_list.emplace_back(
699 TLOG(TLVL_DEBUG + 33) <<
"There are " << temp_list.size() <<
" Metrics to process (" << calls <<
" calls, " << missed
702 while (!temp_list.empty())
704 auto data_ = std::move(temp_list.front());
705 temp_list.pop_front();
// Same naming rule as in the working loop above.
710 if (!data_->UseNameOverride)
712 if (!data_->MetricPrefix.empty())
714 data_->Name = prefix_ +
"." + data_->MetricPrefix +
"." + data_->Name;
718 data_->Name = prefix_ +
"." + data_->Name;
722 for (
auto& metric : metric_plugins_)
728 if (metric->IsLevelEnabled(data_->Level))
732 metric->addMetricData(data_);
733 last_send_time = std::chrono::steady_clock::now();
737 TLOG(TLVL_ERROR) <<
"Error in MetricManager::sendMetric: error sending value to metric plugin with name "
738 << metric->getLibName();
// Stop every plugin and report it by library name.
// The error text below names do_stop() even though these lines follow the
// sendMetricLoop_ body.
744 for (
auto& metric : metric_plugins_)
752 metric->stopMetrics();
753 TLOG(TLVL_DEBUG + 32) <<
"Metric Plugin " << metric->getLibName() <<
" stopped.";
757 TLOG(TLVL_ERROR) <<
"Exception caught in MetricManager::do_stop(), error stopping plugin with name "
758 << metric->getLibName();
762 TLOG(TLVL_DEBUG + 32) <<
"MetricManager has been stopped.";
void shutdown()
Call the destructors for all configured MetricPlugin instances.
void initialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Initialize the MetricPlugin instances.
void sendMetric(std::string const &name, std::string const &value, std::string const &unit, int level, MetricMode mode, std::string const &metricPrefix="", bool useNameOverride=false)
Send a metric with the given parameters to any MetricPlugins with a threshold level greater than or equal to level...
bool metricManagerBusy()
Determine whether the MetricManager or any of its plugins are currently processing metrics...
size_t metricQueueSize(std::string const &name="")
Return the size of the named metric queue
void reinitialize(fhicl::ParameterSet const &pset, std::string const &prefix="")
Reinitialize all MetricPlugin Instances.
Report the sum of all values. Use for counters to report accurate results.
MetricManager()
Construct an instance of the MetricManager class.
void do_start()
Perform startup actions for each configured MetricPlugin.
void do_stop()
Stop sending metrics to the MetricPlugin instances.
virtual ~MetricManager() noexcept
MetricManager destructor.
over. Use to create rates from counters.
Report only the last value recorded. Useful for event counters, run numbers, etc. ...
MetricMode
The Mode of the metric indicates how multiple metric values should be combined within a reporting interval.
std::unique_ptr< MetricPlugin > makeMetricPlugin(std::string const &generator_plugin_spec, fhicl::ParameterSet const &ps, std::string const &app_name, std::string const &metric_name)
Load a given MetricPlugin and return a pointer to it.
void do_resume()
Resume metric sending. Currently a No-Op.
bool metricQueueEmpty()
Returns whether the metric queue is completely empty
void do_pause()
Pause metric sending. Currently a No-Op.