00001
00002
00003
00004
00005
00006
00007
00008
00009 #define TRACE_NAME "MetricManager"
00010 #include "tracemf.h"
00011 #include "artdaq-utilities/Plugins/MetricManager.hh"
00012 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
00013 #include "fhiclcpp/ParameterSet.h"
00014
00015 #include <chrono>
00016 #include <boost/exception/all.hpp>
00017
00018 artdaq::MetricManager::
00019 MetricManager() : metric_plugins_(0)
00020 , metric_send_interval_ms_(15000)
00021 , initialized_(false)
00022 , running_(false)
00023 , active_(false)
00024 , missed_metric_calls_(0)
00025 , metric_queue_max_size_(1000)
00026 , metric_queue_notify_size_(10)
00027 {}
00028
00029 artdaq::MetricManager::~MetricManager() noexcept
00030 {
00031 shutdown();
00032 }
00033
00034 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string const& prefix)
00035 {
00036 prefix_ = prefix;
00037 if (initialized_)
00038 {
00039 shutdown();
00040 }
00041 TLOG(TLVL_INFO) << "Configuring metrics with parameter set: " << pset.to_string() ;
00042
00043 std::vector<std::string> names = pset.get_pset_names();
00044
00045 for (auto name : names)
00046 {
00047 if (name == "metric_queue_size")
00048 {
00049 metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
00050 }
00051 else if (name == "metric_queue_notify_size")
00052 {
00053 metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
00054 }
00055 else if (name == "metric_send_maximum_delay_ms")
00056 {
00057 metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
00058 }
00059 else
00060 {
00061 try
00062 {
00063 TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
00064 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
00065 metric_plugins_.push_back(makeMetricPlugin(
00066 plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
00067 }
00068 catch (const cet::exception& e)
00069 {
00070 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00071 ", cet::exception object caught:" << e.explain_self() ;
00072 }
00073 catch (const boost::exception& e)
00074 {
00075 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00076 ", boost::exception object caught: " << boost::diagnostic_information(e) ;
00077 }
00078 catch (const std::exception& e)
00079 {
00080 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00081 ", std::exception caught: " << e.what() ;
00082 }
00083 catch (...)
00084 {
00085 TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
00086 }
00087 }
00088 }
00089
00090 initialized_ = true;
00091 }
00092
00093 void artdaq::MetricManager::do_start()
00094 {
00095 if (!running_)
00096 {
00097 TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
00098 for (auto& metric : metric_plugins_)
00099 {
00100 try
00101 {
00102 metric->startMetrics();
00103 TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
00104 active_ = true;
00105 }
00106 catch (...)
00107 {
00108 TLOG(TLVL_ERROR) <<
00109 "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
00110 metric->getLibName() ;
00111 }
00112 }
00113 running_ = true;
00114 startMetricLoop_();
00115 }
00116 }
00117
00118 void artdaq::MetricManager::do_stop()
00119 {
00120 TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
00121 running_ = false;
00122 metric_cv_.notify_all();
00123 TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
00124 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00125 TLOG(TLVL_DEBUG) << "do_stop Complete" ;
00126 }
00127
00128 void artdaq::MetricManager::do_pause() { }
00129 void artdaq::MetricManager::do_resume() { }
00130
00131 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string const& prefix)
00132 {
00133 shutdown();
00134 initialize(pset, prefix);
00135 }
00136
00137 void artdaq::MetricManager::shutdown()
00138 {
00139 TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
00140 do_stop();
00141
00142 if (initialized_)
00143 {
00144 for (auto& i : metric_plugins_)
00145 {
00146 try
00147 {
00148 std::string name = i->getLibName();
00149 i.reset(nullptr);
00150 TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
00151 }
00152 catch (...)
00153 {
00154 TLOG(TLVL_ERROR) <<
00155 "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
00156 i->getLibName() ;
00157 }
00158 }
00159 initialized_ = false;
00160 }
00161 }
00162
00163 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00164 {
00165 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00166 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00167 else if (active_)
00168 {
00169 if (!metric_queue_.count(name)) {
00170 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00171 }
00172 auto entry = &(metric_queue_[name]);
00173
00174 auto size = entry->first;
00175 if (size < metric_queue_max_size_)
00176 {
00177 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00178 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00179 {
00180 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00181 entry->first++;
00182 entry->second.emplace_back(std::move(metric));
00183 }
00184 }
00185 else
00186 {
00187 TLOG(10) << "Rejecting metric because queue full" ;
00188 missed_metric_calls_++;
00189 }
00190 metric_cv_.notify_all();
00191 }
00192 }
00193
00194 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00195 {
00196 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00197 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00198 else if (active_)
00199 {
00200 if (!metric_queue_.count(name)) {
00201 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00202 }
00203 auto entry = &(metric_queue_[name]);
00204
00205 auto size = entry->first;
00206 if (size < metric_queue_max_size_)
00207 {
00208 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00209 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00210 {
00211 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00212 entry->first++;
00213 entry->second.emplace_back(std::move(metric));
00214 }
00215 }
00216 else
00217 {
00218 TLOG(10) << "Rejecting metric because queue full" ;
00219 missed_metric_calls_++;
00220 }
00221 metric_cv_.notify_all();
00222 }
00223 }
00224
00225 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00226 {
00227 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00228 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00229 else if (active_)
00230 {
00231 if (!metric_queue_.count(name)) {
00232 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00233 }
00234 auto entry = &(metric_queue_[name]);
00235
00236 auto size = entry->first;
00237 if (size < metric_queue_max_size_)
00238 {
00239 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00240 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00241 {
00242 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00243 entry->first++;
00244 entry->second.emplace_back(std::move(metric));
00245 }
00246 }
00247 else
00248 {
00249 TLOG(10) << "Rejecting metric because queue full" ;
00250 missed_metric_calls_++;
00251 }
00252 metric_cv_.notify_all();
00253 }
00254 }
00255
00256 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00257 {
00258 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00259 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00260 else if (active_)
00261 {
00262 if (!metric_queue_.count(name)) {
00263 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00264 }
00265 auto entry = &(metric_queue_[name]);
00266
00267 auto size = entry->first;
00268 if (size < metric_queue_max_size_)
00269 {
00270 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00271 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00272 {
00273 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00274 entry->first++;
00275 entry->second.emplace_back(std::move(metric));
00276 }
00277 }
00278 else
00279 {
00280 TLOG(10) << "Rejecting metric because queue full" ;
00281 missed_metric_calls_++;
00282 }
00283 metric_cv_.notify_all();
00284 }
00285 }
00286
00287 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00288 {
00289 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00290 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00291 else if (active_)
00292 {
00293 if (!metric_queue_.count(name)) {
00294 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00295 }
00296 auto entry = &(metric_queue_[name]);
00297
00298 auto size = entry->first;
00299 if (size < metric_queue_max_size_)
00300 {
00301 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00302 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00303 {
00304 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00305 entry->first++;
00306 entry->second.emplace_back(std::move(metric));
00307 }
00308 }
00309 else
00310 {
00311 TLOG(10) << "Rejecting metric because queue full" ;
00312 missed_metric_calls_++;
00313 }
00314 metric_cv_.notify_all();
00315 }
00316 }
00317
00318 void artdaq::MetricManager::startMetricLoop_()
00319 {
00320 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00321 TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
00322 boost::thread::attributes attrs;
00323 attrs.set_stack_size(4096 * 200);
00324 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
00325 }
00326
00327 bool artdaq::MetricManager::metricQueueEmpty()
00328 {
00329 for (auto& q : metric_queue_)
00330 {
00331 if (q.second.first != 0) return false;
00332 }
00333 return true;
00334 }
00335
00336 size_t artdaq::MetricManager::metricQueueSize(std::string const& name)
00337 {
00338 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00339 size_t size = 0;
00340 if (name == "") {
00341 for (auto& q : metric_queue_)
00342 {
00343 size += q.second.first;
00344 }
00345 }
00346 else {
00347 if (metric_queue_.count(name)) size = metric_queue_[name].first;
00348 }
00349
00350 return size;
00351 }
00352
00353 void artdaq::MetricManager::sendMetricLoop_()
00354 {
00355 auto last_send_time = std::chrono::steady_clock::time_point();
00356 while (running_)
00357 {
00358 while (metricQueueEmpty() && running_)
00359 {
00360 std::unique_lock<std::mutex> lk(metric_mutex_);
00361 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
00362 auto now = std::chrono::steady_clock::now();
00363 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
00364 {
00365 for (auto& metric : metric_plugins_) { metric->sendMetrics(); }
00366 last_send_time = now;
00367 }
00368 }
00369
00370 auto processing_start = std::chrono::steady_clock::now();
00371 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00372 {
00373 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00374
00375 for (auto& q : metric_queue_)
00376 {
00377 temp_list.splice(temp_list.end(), q.second.second);
00378 q.second.first = 0;
00379 }
00380
00381 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00382 auto missed = missed_metric_calls_.exchange(0);
00383
00384 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00385 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00386 }
00387
00388 while (temp_list.size() > 0)
00389 {
00390 auto data_ = std::move(temp_list.front());
00391 temp_list.pop_front();
00392 if (data_->Type == MetricType::InvalidMetric) continue;
00393 if (!data_->UseNameOverride)
00394 {
00395 if (data_->MetricPrefix.size() > 0)
00396 {
00397 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00398 }
00399 else
00400 {
00401 data_->Name = prefix_ + "." + data_->Name;
00402 }
00403 }
00404
00405 for (auto& metric : metric_plugins_)
00406 {
00407 if (metric->getRunLevel() >= data_->Level)
00408 {
00409 try
00410 {
00411 metric->addMetricData(*data_);
00412 last_send_time = std::chrono::steady_clock::now();
00413 }
00414 catch (...)
00415 {
00416 TLOG(TLVL_ERROR) <<
00417 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00418 << metric->getLibName() ;
00419 }
00420 }
00421 }
00422 }
00423
00424 for (auto& metric : metric_plugins_)
00425 {
00426 metric->sendMetrics(false, processing_start);
00427 }
00428 }
00429
00430 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00431 {
00432 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00433
00434 for (auto& q : metric_queue_)
00435 {
00436 temp_list.splice(temp_list.end(), q.second.second);
00437 }
00438 metric_queue_.clear();
00439
00440 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00441 auto missed = missed_metric_calls_.exchange(0);
00442
00443 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00444 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00445 }
00446
00447 while (temp_list.size() > 0)
00448 {
00449 auto data_ = std::move(temp_list.front());
00450 temp_list.pop_front();
00451 if (data_->Type == MetricType::InvalidMetric) continue;
00452 if (!data_->UseNameOverride)
00453 {
00454 if (data_->MetricPrefix.size() > 0)
00455 {
00456 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00457 }
00458 else
00459 {
00460 data_->Name = prefix_ + "." + data_->Name;
00461 }
00462 }
00463
00464 for (auto& metric : metric_plugins_)
00465 {
00466 if (metric->getRunLevel() >= data_->Level)
00467 {
00468 try
00469 {
00470 metric->addMetricData(*data_);
00471 last_send_time = std::chrono::steady_clock::now();
00472 }
00473 catch (...)
00474 {
00475 TLOG(TLVL_ERROR) <<
00476 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00477 << metric->getLibName() ;
00478 }
00479 }
00480 }
00481 }
00482
00483 for (auto& metric : metric_plugins_)
00484 {
00485 try
00486 {
00487 metric->stopMetrics();
00488 TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
00489 }
00490 catch (...)
00491 {
00492 TLOG(TLVL_ERROR) <<
00493 "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
00494 metric->getLibName() ;
00495 }
00496 }
00497 TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
00498 }