00001
00002
00003
00004
00005
00006
00007
00008
00009 #define TRACE_NAME "MetricManager"
00010 #include "tracemf.h"
00011 #include "artdaq-utilities/Plugins/MetricManager.hh"
00012 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
00013 #include "fhiclcpp/ParameterSet.h"
00014
00015 #include <chrono>
00016 #include <boost/exception/all.hpp>
00017
00018 artdaq::MetricManager::
00019 MetricManager() : metric_plugins_(0)
00020 , metric_send_interval_ms_(15000)
00021 , initialized_(false)
00022 , running_(false)
00023 , active_(false)
00024 , missed_metric_calls_(0)
00025 , metric_queue_max_size_(1000)
00026 , metric_queue_notify_size_(10)
00027 {}
00028
00029 artdaq::MetricManager::~MetricManager() noexcept
00030 {
00031 shutdown();
00032 }
00033
00034 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string const& prefix)
00035 {
00036 prefix_ = prefix;
00037 if (initialized_)
00038 {
00039 shutdown();
00040 }
00041 TLOG(TLVL_INFO) << "Configuring metrics with parameter set: " << pset.to_string() ;
00042
00043 std::vector<std::string> names = pset.get_pset_names();
00044
00045 for (auto name : names)
00046 {
00047 if (name == "metric_queue_size")
00048 {
00049 metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
00050 }
00051 else if (name == "metric_queue_notify_size")
00052 {
00053 metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
00054 }
00055 else if (name == "metric_send_maximum_delay_ms")
00056 {
00057 metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
00058 }
00059 else
00060 {
00061 try
00062 {
00063 TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
00064 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
00065 metric_plugins_.push_back(makeMetricPlugin(
00066 plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
00067 }
00068 catch (const cet::exception& e)
00069 {
00070 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00071 ", cet::exception object caught:" << e.explain_self() ;
00072 }
00073 catch (const boost::exception& e)
00074 {
00075 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00076 ", boost::exception object caught: " << boost::diagnostic_information(e) ;
00077 }
00078 catch (const std::exception& e)
00079 {
00080 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00081 ", std::exception caught: " << e.what() ;
00082 }
00083 catch (...)
00084 {
00085 TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
00086 }
00087 }
00088 }
00089
00090 initialized_ = true;
00091 }
00092
00093 void artdaq::MetricManager::do_start()
00094 {
00095 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
00096 if (!running_)
00097 {
00098 TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
00099 for (auto& metric : metric_plugins_)
00100 {
00101 try
00102 {
00103 metric->startMetrics();
00104 TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
00105 active_ = true;
00106 }
00107 catch (...)
00108 {
00109 TLOG(TLVL_ERROR) <<
00110 "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
00111 metric->getLibName() ;
00112 }
00113 }
00114 running_ = true;
00115 startMetricLoop_();
00116 }
00117 }
00118
00119 void artdaq::MetricManager::do_stop()
00120 {
00121 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
00122 TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
00123 running_ = false;
00124 metric_cv_.notify_all();
00125 TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
00126 lk.unlock();
00127 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00128 TLOG(TLVL_DEBUG) << "do_stop Complete" ;
00129 }
00130
00131 void artdaq::MetricManager::do_pause() { }
00132 void artdaq::MetricManager::do_resume() { }
00133
00134 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string const& prefix)
00135 {
00136 shutdown();
00137 initialize(pset, prefix);
00138 }
00139
00140 void artdaq::MetricManager::shutdown()
00141 {
00142 TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
00143 do_stop();
00144
00145 auto lk = std::unique_lock<std::mutex>(metric_mutex_);
00146 if (initialized_)
00147 {
00148 for (auto& i : metric_plugins_)
00149 {
00150 try
00151 {
00152 std::string name = i->getLibName();
00153 i.reset(nullptr);
00154 TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
00155 }
00156 catch (...)
00157 {
00158 TLOG(TLVL_ERROR) <<
00159 "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
00160 i->getLibName() ;
00161 }
00162 }
00163 initialized_ = false;
00164 }
00165 }
00166
00167 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00168 {
00169 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00170 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00171 else if (active_)
00172 {
00173 {
00174 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00175 if (!metric_queue_.count(name)) {
00176 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00177 }
00178 }
00179 auto entry = &(metric_queue_[name]);
00180
00181 auto size = entry->first.load();
00182 if (size < metric_queue_max_size_)
00183 {
00184 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00185 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00186 {
00187 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00188 entry->first++;
00189 entry->second.emplace_back(std::move(metric));
00190 }
00191 }
00192 else
00193 {
00194 TLOG(10) << "Rejecting metric because queue full" ;
00195 missed_metric_calls_++;
00196 }
00197 metric_cv_.notify_all();
00198 }
00199 }
00200
00201 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00202 {
00203 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00204 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00205 else if (active_)
00206 {
00207 {
00208 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00209 if (!metric_queue_.count(name)) {
00210 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00211 }
00212 }
00213 auto entry = &(metric_queue_[name]);
00214
00215 auto size = entry->first.load();
00216 if (size < metric_queue_max_size_)
00217 {
00218 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00219 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00220 {
00221 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00222 entry->first++;
00223 entry->second.emplace_back(std::move(metric));
00224 }
00225 }
00226 else
00227 {
00228 TLOG(10) << "Rejecting metric because queue full" ;
00229 missed_metric_calls_++;
00230 }
00231 metric_cv_.notify_all();
00232 }
00233 }
00234
00235 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00236 {
00237 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00238 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00239 else if (active_)
00240 {
00241 {
00242 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00243 if (!metric_queue_.count(name)) {
00244 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00245 }
00246 }
00247 auto entry = &(metric_queue_[name]);
00248
00249 auto size = entry->first.load();
00250 if (size < metric_queue_max_size_)
00251 {
00252 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00253 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00254 {
00255 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00256 entry->first++;
00257 entry->second.emplace_back(std::move(metric));
00258 }
00259 }
00260 else
00261 {
00262 TLOG(10) << "Rejecting metric because queue full" ;
00263 missed_metric_calls_++;
00264 }
00265 metric_cv_.notify_all();
00266 }
00267 }
00268
00269 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00270 {
00271 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00272 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00273 else if (active_)
00274 {
00275 {
00276 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00277 if (!metric_queue_.count(name)) {
00278 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00279 }
00280 }
00281 auto entry = &(metric_queue_[name]);
00282
00283 auto size = entry->first.load();
00284 if (size < metric_queue_max_size_)
00285 {
00286 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00287 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00288 {
00289 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00290 entry->first++;
00291 entry->second.emplace_back(std::move(metric));
00292 }
00293 }
00294 else
00295 {
00296 TLOG(10) << "Rejecting metric because queue full" ;
00297 missed_metric_calls_++;
00298 }
00299 metric_cv_.notify_all();
00300 }
00301 }
00302
00303 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00304 {
00305 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00306 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00307 else if (active_)
00308 {
00309 {
00310 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00311 if (!metric_queue_.count(name)) {
00312 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00313 }
00314 }
00315 auto entry = &(metric_queue_[name]);
00316
00317 auto size = entry->first.load();
00318 if (size < metric_queue_max_size_)
00319 {
00320 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00321 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00322 {
00323 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00324 entry->first++;
00325 entry->second.emplace_back(std::move(metric));
00326 }
00327 }
00328 else
00329 {
00330 TLOG(10) << "Rejecting metric because queue full" ;
00331 missed_metric_calls_++;
00332 }
00333 metric_cv_.notify_all();
00334 }
00335 }
00336
00337 void artdaq::MetricManager::startMetricLoop_()
00338 {
00339 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00340 boost::thread::attributes attrs;
00341 attrs.set_stack_size(4096 * 2000);
00342 TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
00343 try {
00344 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
00345 }
00346 catch (const boost::exception& e)
00347 {
00348 TLOG(TLVL_ERROR) << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00349 std::cerr << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00350 exit(5);
00351 }
00352 TLOG(TLVL_INFO) << "Metric Sending thread started";
00353 }
00354
00355 bool artdaq::MetricManager::metricQueueEmpty()
00356 {
00357 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00358 for (auto& q : metric_queue_)
00359 {
00360 if (q.second.first != 0) return false;
00361 }
00362 return true;
00363 }
00364
00365 size_t artdaq::MetricManager::metricQueueSize(std::string const& name)
00366 {
00367 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00368 size_t size = 0;
00369 if (name == "") {
00370 for (auto& q : metric_queue_)
00371 {
00372 size += q.second.first;
00373 }
00374 }
00375 else {
00376 if (metric_queue_.count(name)) size = metric_queue_[name].first;
00377 }
00378
00379 return size;
00380 }
00381
00382 void artdaq::MetricManager::sendMetricLoop_()
00383 {
00384 TLOG(TLVL_INFO) << "sendMetricLoop_ START";
00385 auto last_send_time = std::chrono::steady_clock::time_point();
00386 while (running_)
00387 {
00388 while (metricQueueEmpty() && running_)
00389 {
00390 std::unique_lock<std::mutex> lk(metric_mutex_);
00391 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
00392 auto now = std::chrono::steady_clock::now();
00393 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
00394 {
00395 for (auto& metric : metric_plugins_) { metric->sendMetrics(); }
00396 last_send_time = now;
00397 }
00398 }
00399
00400 auto processing_start = std::chrono::steady_clock::now();
00401 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00402 {
00403 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00404
00405 for (auto& q : metric_queue_)
00406 {
00407 temp_list.splice(temp_list.end(), q.second.second);
00408 q.second.first = 0;
00409 }
00410
00411 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00412 auto missed = missed_metric_calls_.exchange(0);
00413
00414 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00415 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00416 }
00417
00418 while (temp_list.size() > 0)
00419 {
00420 auto data_ = std::move(temp_list.front());
00421 temp_list.pop_front();
00422 if (data_->Type == MetricType::InvalidMetric) continue;
00423 if (!data_->UseNameOverride)
00424 {
00425 if (data_->MetricPrefix.size() > 0)
00426 {
00427 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00428 }
00429 else
00430 {
00431 data_->Name = prefix_ + "." + data_->Name;
00432 }
00433 }
00434
00435 for (auto& metric : metric_plugins_)
00436 {
00437 if (metric->getRunLevel() >= data_->Level)
00438 {
00439 try
00440 {
00441 metric->addMetricData(*data_);
00442 last_send_time = std::chrono::steady_clock::now();
00443 }
00444 catch (...)
00445 {
00446 TLOG(TLVL_ERROR) <<
00447 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00448 << metric->getLibName() ;
00449 }
00450 }
00451 }
00452 }
00453
00454 for (auto& metric : metric_plugins_)
00455 {
00456 metric->sendMetrics(false, processing_start);
00457 }
00458 }
00459
00460 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00461 {
00462 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00463
00464 for (auto& q : metric_queue_)
00465 {
00466 temp_list.splice(temp_list.end(), q.second.second);
00467 }
00468 metric_queue_.clear();
00469
00470 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00471 auto missed = missed_metric_calls_.exchange(0);
00472
00473 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00474 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00475 }
00476
00477 while (temp_list.size() > 0)
00478 {
00479 auto data_ = std::move(temp_list.front());
00480 temp_list.pop_front();
00481 if (data_->Type == MetricType::InvalidMetric) continue;
00482 if (!data_->UseNameOverride)
00483 {
00484 if (data_->MetricPrefix.size() > 0)
00485 {
00486 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00487 }
00488 else
00489 {
00490 data_->Name = prefix_ + "." + data_->Name;
00491 }
00492 }
00493
00494 for (auto& metric : metric_plugins_)
00495 {
00496 if (metric->getRunLevel() >= data_->Level)
00497 {
00498 try
00499 {
00500 metric->addMetricData(*data_);
00501 last_send_time = std::chrono::steady_clock::now();
00502 }
00503 catch (...)
00504 {
00505 TLOG(TLVL_ERROR) <<
00506 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00507 << metric->getLibName() ;
00508 }
00509 }
00510 }
00511 }
00512
00513 for (auto& metric : metric_plugins_)
00514 {
00515 try
00516 {
00517 metric->stopMetrics();
00518 TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
00519 }
00520 catch (...)
00521 {
00522 TLOG(TLVL_ERROR) <<
00523 "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
00524 metric->getLibName() ;
00525 }
00526 }
00527 TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
00528 }