00001
00002
00003
00004
00005
00006
00007
00008
00009 #define TRACE_NAME "MetricManager"
00010 #include "tracemf.h"
00011 #include "artdaq-utilities/Plugins/MetricManager.hh"
00012 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
00013 #include "fhiclcpp/ParameterSet.h"
00014
00015 #include <chrono>
00016 #include <boost/exception/all.hpp>
00017
00018 artdaq::MetricManager::
00019 MetricManager() : metric_plugins_(0)
00020 , metric_send_interval_ms_(15000)
00021 , initialized_(false)
00022 , running_(false)
00023 , active_(false)
00024 , missed_metric_calls_(0)
00025 , metric_queue_max_size_(1000)
00026 , metric_queue_notify_size_(10)
00027 {}
00028
00029 artdaq::MetricManager::~MetricManager() noexcept
00030 {
00031 shutdown();
00032 }
00033
00034 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string const& prefix)
00035 {
00036 prefix_ = prefix;
00037 if (initialized_)
00038 {
00039 shutdown();
00040 }
00041 TLOG(TLVL_INFO) << "Configuring metrics with parameter set: " << pset.to_string() ;
00042
00043 std::vector<std::string> names = pset.get_pset_names();
00044
00045 metric_plugins_.clear();
00046
00047 for (auto name : names)
00048 {
00049 if (name == "metric_queue_size")
00050 {
00051 metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
00052 }
00053 else if (name == "metric_queue_notify_size")
00054 {
00055 metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
00056 }
00057 else if (name == "metric_send_maximum_delay_ms")
00058 {
00059 metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
00060 }
00061 else
00062 {
00063 try
00064 {
00065 TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
00066 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
00067 metric_plugins_.push_back(makeMetricPlugin(
00068 plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
00069 }
00070 catch (const cet::exception& e)
00071 {
00072 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00073 ", cet::exception object caught:" << e.explain_self() ;
00074 }
00075 catch (const boost::exception& e)
00076 {
00077 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00078 ", boost::exception object caught: " << boost::diagnostic_information(e) ;
00079 }
00080 catch (const std::exception& e)
00081 {
00082 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00083 ", std::exception caught: " << e.what() ;
00084 }
00085 catch (...)
00086 {
00087 TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
00088 }
00089 }
00090 }
00091
00092 initialized_ = true;
00093 }
00094
00095 void artdaq::MetricManager::do_start()
00096 {
00097 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
00098 if (!running_)
00099 {
00100 TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
00101 for (auto& metric : metric_plugins_)
00102 {
00103 if (!metric) continue;
00104 try
00105 {
00106 metric->startMetrics();
00107 TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
00108 active_ = true;
00109 }
00110 catch (...)
00111 {
00112 TLOG(TLVL_ERROR) <<
00113 "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
00114 metric->getLibName() ;
00115 }
00116 }
00117 running_ = true;
00118 startMetricLoop_();
00119 }
00120 }
00121
00122 void artdaq::MetricManager::do_stop()
00123 {
00124 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
00125 TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
00126 running_ = false;
00127 metric_cv_.notify_all();
00128 TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
00129 lk.unlock();
00130 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00131 TLOG(TLVL_DEBUG) << "do_stop Complete" ;
00132 }
00133
00134 void artdaq::MetricManager::do_pause() { }
00135 void artdaq::MetricManager::do_resume() { }
00136
00137 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string const& prefix)
00138 {
00139 shutdown();
00140 initialize(pset, prefix);
00141 }
00142
00143 void artdaq::MetricManager::shutdown()
00144 {
00145 TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
00146 do_stop();
00147
00148 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
00149 if (initialized_)
00150 {
00151 for (auto& i : metric_plugins_)
00152 {
00153 try
00154 {
00155 std::string name = i->getLibName();
00156 i.reset(nullptr);
00157 TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
00158 }
00159 catch (...)
00160 {
00161 TLOG(TLVL_ERROR) <<
00162 "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
00163 i->getLibName() ;
00164 }
00165 }
00166 metric_plugins_.clear();
00167 initialized_ = false;
00168 }
00169 }
00170
00171 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00172 {
00173 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00174 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00175 else if (active_)
00176 {
00177 {
00178 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00179 if (!metric_queue_.count(name)) {
00180 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00181 }
00182 }
00183 auto entry = &(metric_queue_[name]);
00184
00185 auto size = entry->first.load();
00186 if (size < metric_queue_max_size_)
00187 {
00188 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00189 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00190 {
00191 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00192 entry->first++;
00193 entry->second.emplace_back(std::move(metric));
00194 }
00195 }
00196 else
00197 {
00198 TLOG(10) << "Rejecting metric because queue full" ;
00199 missed_metric_calls_++;
00200 }
00201 metric_cv_.notify_all();
00202 }
00203 }
00204
00205 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00206 {
00207 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00208 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00209 else if (active_)
00210 {
00211 {
00212 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00213 if (!metric_queue_.count(name)) {
00214 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00215 }
00216 }
00217 auto entry = &(metric_queue_[name]);
00218
00219 auto size = entry->first.load();
00220 if (size < metric_queue_max_size_)
00221 {
00222 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00223 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00224 {
00225 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00226 entry->first++;
00227 entry->second.emplace_back(std::move(metric));
00228 }
00229 }
00230 else
00231 {
00232 TLOG(10) << "Rejecting metric because queue full" ;
00233 missed_metric_calls_++;
00234 }
00235 metric_cv_.notify_all();
00236 }
00237 }
00238
00239 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00240 {
00241 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00242 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00243 else if (active_)
00244 {
00245 {
00246 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00247 if (!metric_queue_.count(name)) {
00248 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00249 }
00250 }
00251 auto entry = &(metric_queue_[name]);
00252
00253 auto size = entry->first.load();
00254 if (size < metric_queue_max_size_)
00255 {
00256 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00257 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00258 {
00259 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00260 entry->first++;
00261 entry->second.emplace_back(std::move(metric));
00262 }
00263 }
00264 else
00265 {
00266 TLOG(10) << "Rejecting metric because queue full" ;
00267 missed_metric_calls_++;
00268 }
00269 metric_cv_.notify_all();
00270 }
00271 }
00272
00273 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00274 {
00275 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00276 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00277 else if (active_)
00278 {
00279 {
00280 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00281 if (!metric_queue_.count(name)) {
00282 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00283 }
00284 }
00285 auto entry = &(metric_queue_[name]);
00286
00287 auto size = entry->first.load();
00288 if (size < metric_queue_max_size_)
00289 {
00290 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00291 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00292 {
00293 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00294 entry->first++;
00295 entry->second.emplace_back(std::move(metric));
00296 }
00297 }
00298 else
00299 {
00300 TLOG(10) << "Rejecting metric because queue full" ;
00301 missed_metric_calls_++;
00302 }
00303 metric_cv_.notify_all();
00304 }
00305 }
00306
00307 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00308 {
00309 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00310 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00311 else if (active_)
00312 {
00313 {
00314 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00315 if (!metric_queue_.count(name)) {
00316 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00317 }
00318 }
00319 auto entry = &(metric_queue_[name]);
00320
00321 auto size = entry->first.load();
00322 if (size < metric_queue_max_size_)
00323 {
00324 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00325 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00326 {
00327 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00328 entry->first++;
00329 entry->second.emplace_back(std::move(metric));
00330 }
00331 }
00332 else
00333 {
00334 TLOG(10) << "Rejecting metric because queue full" ;
00335 missed_metric_calls_++;
00336 }
00337 metric_cv_.notify_all();
00338 }
00339 }
00340
00341 void artdaq::MetricManager::startMetricLoop_()
00342 {
00343 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00344 boost::thread::attributes attrs;
00345 attrs.set_stack_size(4096 * 2000);
00346 TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
00347 try {
00348 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
00349 }
00350 catch (const boost::exception& e)
00351 {
00352 TLOG(TLVL_ERROR) << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00353 std::cerr << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00354 exit(5);
00355 }
00356 TLOG(TLVL_INFO) << "Metric Sending thread started";
00357 }
00358
00359 bool artdaq::MetricManager::metricQueueEmpty()
00360 {
00361 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00362 for (auto& q : metric_queue_)
00363 {
00364 if (q.second.first != 0) return false;
00365 }
00366 return true;
00367 }
00368
00369 size_t artdaq::MetricManager::metricQueueSize(std::string const& name)
00370 {
00371 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00372 size_t size = 0;
00373 if (name == "") {
00374 for (auto& q : metric_queue_)
00375 {
00376 size += q.second.first;
00377 }
00378 }
00379 else {
00380 if (metric_queue_.count(name)) size = metric_queue_[name].first;
00381 }
00382
00383 return size;
00384 }
00385
00386 void artdaq::MetricManager::sendMetricLoop_()
00387 {
00388 TLOG(TLVL_INFO) << "sendMetricLoop_ START";
00389 auto last_send_time = std::chrono::steady_clock::time_point();
00390 while (running_)
00391 {
00392 while (metricQueueEmpty() && running_)
00393 {
00394 std::unique_lock<std::mutex> lk(metric_mutex_);
00395 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
00396 auto now = std::chrono::steady_clock::now();
00397 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
00398 {
00399 for (auto& metric : metric_plugins_) { if(metric) metric->sendMetrics(); }
00400 last_send_time = now;
00401 }
00402 }
00403
00404 auto processing_start = std::chrono::steady_clock::now();
00405 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00406 {
00407 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00408
00409 for (auto& q : metric_queue_)
00410 {
00411 temp_list.splice(temp_list.end(), q.second.second);
00412 q.second.first = 0;
00413 }
00414
00415 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00416 auto missed = missed_metric_calls_.exchange(0);
00417
00418 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00419 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00420 }
00421
00422 while (temp_list.size() > 0)
00423 {
00424 auto data_ = std::move(temp_list.front());
00425 temp_list.pop_front();
00426 if (data_->Type == MetricType::InvalidMetric) continue;
00427 if (!data_->UseNameOverride)
00428 {
00429 if (data_->MetricPrefix.size() > 0)
00430 {
00431 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00432 }
00433 else
00434 {
00435 data_->Name = prefix_ + "." + data_->Name;
00436 }
00437 }
00438
00439 for (auto& metric : metric_plugins_)
00440 {
00441 if (!metric) continue;
00442 if (metric->getRunLevel() >= data_->Level)
00443 {
00444 try
00445 {
00446 metric->addMetricData(*data_);
00447 last_send_time = std::chrono::steady_clock::now();
00448 }
00449 catch (...)
00450 {
00451 TLOG(TLVL_ERROR) <<
00452 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00453 << metric->getLibName() ;
00454 }
00455 }
00456 }
00457 }
00458
00459 for (auto& metric : metric_plugins_)
00460 {
00461 if (!metric) continue;
00462 metric->sendMetrics(false, processing_start);
00463 }
00464 }
00465
00466 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00467 {
00468 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00469
00470 for (auto& q : metric_queue_)
00471 {
00472 temp_list.splice(temp_list.end(), q.second.second);
00473 }
00474 metric_queue_.clear();
00475
00476 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00477 auto missed = missed_metric_calls_.exchange(0);
00478
00479 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00480 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00481 }
00482
00483 while (temp_list.size() > 0)
00484 {
00485 auto data_ = std::move(temp_list.front());
00486 temp_list.pop_front();
00487 if (data_->Type == MetricType::InvalidMetric) continue;
00488 if (!data_->UseNameOverride)
00489 {
00490 if (data_->MetricPrefix.size() > 0)
00491 {
00492 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00493 }
00494 else
00495 {
00496 data_->Name = prefix_ + "." + data_->Name;
00497 }
00498 }
00499
00500 for (auto& metric : metric_plugins_)
00501 {
00502 if (!metric) continue;
00503 if (metric->getRunLevel() >= data_->Level)
00504 {
00505 try
00506 {
00507 metric->addMetricData(*data_);
00508 last_send_time = std::chrono::steady_clock::now();
00509 }
00510 catch (...)
00511 {
00512 TLOG(TLVL_ERROR) <<
00513 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00514 << metric->getLibName() ;
00515 }
00516 }
00517 }
00518 }
00519
00520 for (auto& metric : metric_plugins_)
00521 {
00522 if (!metric) continue;
00523 try
00524 {
00525 metric->stopMetrics();
00526 TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
00527 }
00528 catch (...)
00529 {
00530 TLOG(TLVL_ERROR) <<
00531 "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
00532 metric->getLibName() ;
00533 }
00534 }
00535 TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
00536 }