00001
00002
00003
00004
00005
00006
00007
00008
00009 #define TRACE_NAME "MetricManager"
00010 #include "tracemf.h"
00011 #include "artdaq-utilities/Plugins/MetricManager.hh"
00012 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
00013 #include "fhiclcpp/ParameterSet.h"
00014
00015 #include <chrono>
00016 #include <boost/exception/all.hpp>
00017
00018 artdaq::MetricManager::
00019 MetricManager() : metric_plugins_(0)
00020 , metric_send_interval_ms_(15000)
00021 , initialized_(false)
00022 , running_(false)
00023 , active_(false)
00024 , missed_metric_calls_(0)
00025 , metric_queue_max_size_(1000)
00026 , metric_queue_notify_size_(10)
00027 {}
00028
00029 artdaq::MetricManager::~MetricManager() noexcept
00030 {
00031 shutdown();
00032 }
00033
00034 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string prefix)
00035 {
00036 prefix_ = prefix;
00037 if (initialized_)
00038 {
00039 shutdown();
00040 }
00041 TLOG(TLVL_INFO) << "Configuring metrics with parameter set:\n" << pset.to_string() ;
00042
00043 std::vector<std::string> names = pset.get_pset_names();
00044
00045 for (auto name : names)
00046 {
00047 if (name == "metric_queue_size")
00048 {
00049 metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
00050 }
00051 else if (name == "metric_queue_notify_size")
00052 {
00053 metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
00054 }
00055 else if (name == "metric_send_maximum_delay_ms")
00056 {
00057 metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms");
00058 }
00059 else
00060 {
00061 try
00062 {
00063 TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ;
00064 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
00065 metric_plugins_.push_back(makeMetricPlugin(
00066 plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_));
00067 }
00068 catch (const cet::exception& e)
00069 {
00070 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00071 ", cet::exception object caught:" << e.explain_self() ;
00072 }
00073 catch (const boost::exception& e)
00074 {
00075 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00076 ", boost::exception object caught: " << boost::diagnostic_information(e) ;
00077 }
00078 catch (const std::exception& e)
00079 {
00080 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00081 ", std::exception caught: " << e.what() ;
00082 }
00083 catch (...)
00084 {
00085 TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name ;
00086 }
00087 }
00088 }
00089
00090 initialized_ = true;
00091 }
00092
00093 void artdaq::MetricManager::do_start()
00094 {
00095 if (!running_)
00096 {
00097 TLOG(TLVL_DEBUG) << "Starting MetricManager" ;
00098 for (auto& metric : metric_plugins_)
00099 {
00100 try
00101 {
00102 metric->startMetrics();
00103 TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ;
00104 active_ = true;
00105 }
00106 catch (...)
00107 {
00108 TLOG(TLVL_ERROR) <<
00109 "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
00110 metric->getLibName() ;
00111 }
00112 }
00113 running_ = true;
00114 startMetricLoop_();
00115 }
00116 }
00117
00118 void artdaq::MetricManager::do_stop()
00119 {
00120 TLOG(TLVL_DEBUG) << "Stopping Metrics" ;
00121 running_ = false;
00122 metric_cv_.notify_all();
00123 TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ;
00124 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00125 TLOG(TLVL_DEBUG) << "do_stop Complete" ;
00126 }
00127
00128 void artdaq::MetricManager::do_pause() { }
00129 void artdaq::MetricManager::do_resume() { }
00130
00131 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string prefix)
00132 {
00133 shutdown();
00134 initialize(pset, prefix);
00135 }
00136
00137 void artdaq::MetricManager::shutdown()
00138 {
00139 TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ;
00140 do_stop();
00141
00142 if (initialized_)
00143 {
00144 for (auto& i : metric_plugins_)
00145 {
00146 try
00147 {
00148 std::string name = i->getLibName();
00149 i.reset(nullptr);
00150 TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ;
00151 }
00152 catch (...)
00153 {
00154 TLOG(TLVL_ERROR) <<
00155 "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
00156 i->getLibName() ;
00157 }
00158 }
00159 initialized_ = false;
00160 }
00161 }
00162
00163 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00164 {
00165 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00166 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00167 else if (active_)
00168 {
00169 if (!metric_queue_.count(name)) {
00170 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00171 }
00172 auto entry = &(metric_queue_[name]);
00173
00174 auto size = entry->first;
00175 if (size < metric_queue_max_size_)
00176 {
00177 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00178 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00179 {
00180 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00181 entry->first++;
00182 entry->second.emplace_back(std::move(metric));
00183 }
00184 }
00185 else
00186 {
00187 TLOG(10) << "Rejecting metric because queue full" ;
00188 missed_metric_calls_++;
00189 }
00190 metric_cv_.notify_all();
00191 }
00192 }
00193
00194 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00195 {
00196 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00197 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00198 else if (active_)
00199 {
00200 if (!metric_queue_.count(name)) {
00201 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00202 }
00203 auto entry = &(metric_queue_[name]);
00204
00205 auto size = entry->first;
00206 if (size < metric_queue_max_size_)
00207 {
00208 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00209 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00210 {
00211 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00212 entry->first++;
00213 entry->second.emplace_back(std::move(metric));
00214 }
00215 }
00216 else
00217 {
00218 TLOG(10) << "Rejecting metric because queue full" ;
00219 missed_metric_calls_++;
00220 }
00221 metric_cv_.notify_all();
00222 }
00223 }
00224
00225 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00226 {
00227 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00228 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00229 else if (active_)
00230 {
00231 if (!metric_queue_.count(name)) {
00232 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00233 }
00234 auto entry = &(metric_queue_[name]);
00235
00236 auto size = entry->first;
00237 if (size < metric_queue_max_size_)
00238 {
00239 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00240 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00241 {
00242 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00243 entry->first++;
00244 entry->second.emplace_back(std::move(metric));
00245 }
00246 }
00247 else
00248 {
00249 TLOG(10) << "Rejecting metric because queue full" ;
00250 missed_metric_calls_++;
00251 }
00252 metric_cv_.notify_all();
00253 }
00254 }
00255
00256 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00257 {
00258 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00259 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00260 else if (active_)
00261 {
00262 if (!metric_queue_.count(name)) {
00263 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00264 }
00265 auto entry = &(metric_queue_[name]);
00266
00267 auto size = entry->first;
00268 if (size < metric_queue_max_size_)
00269 {
00270 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00271 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00272 {
00273 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00274 entry->first++;
00275 entry->second.emplace_back(std::move(metric));
00276 }
00277 }
00278 else
00279 {
00280 TLOG(10) << "Rejecting metric because queue full" ;
00281 missed_metric_calls_++;
00282 }
00283 metric_cv_.notify_all();
00284 }
00285 }
00286
00287 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00288 {
00289 if (!initialized_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!" ; }
00290 else if (!running_) { TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager stopped!" ; }
00291 else if (active_)
00292 {
00293 if (!metric_queue_.count(name)) {
00294 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00295 }
00296 auto entry = &(metric_queue_[name]);
00297
00298 auto size = entry->first;
00299 if (size < metric_queue_max_size_)
00300 {
00301 if (size >= metric_queue_notify_size_) TLOG(9) << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." ;
00302 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00303 {
00304 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00305 entry->first++;
00306 entry->second.emplace_back(std::move(metric));
00307 }
00308 }
00309 else
00310 {
00311 TLOG(10) << "Rejecting metric because queue full" ;
00312 missed_metric_calls_++;
00313 }
00314 metric_cv_.notify_all();
00315 }
00316 }
00317
00318 void artdaq::MetricManager::startMetricLoop_()
00319 {
00320 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00321 TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ;
00322 boost::thread::attributes attrs;
00323 attrs.set_stack_size(4096 * 200);
00324 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
00325 }
00326
00327 bool artdaq::MetricManager::metricQueueEmpty()
00328 {
00329 for (auto& q : metric_queue_)
00330 {
00331 if (q.second.first != 0) return false;
00332 }
00333 return true;
00334 }
00335
00336 size_t artdaq::MetricManager::metricQueueSize(std::string name)
00337 {
00338 size_t size = 0;
00339 if (name == "") {
00340
00341 }
00342 else {
00343 if (metric_queue_.count(name)) size = metric_queue_[name].first;
00344 }
00345
00346 return size;
00347 }
00348
00349 void artdaq::MetricManager::sendMetricLoop_()
00350 {
00351 auto last_send_time = std::chrono::steady_clock::time_point();
00352 while (running_)
00353 {
00354 while (metricQueueEmpty() && running_)
00355 {
00356 std::unique_lock<std::mutex> lk(metric_mutex_);
00357 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
00358 auto now = std::chrono::steady_clock::now();
00359 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
00360 {
00361 for (auto& metric : metric_plugins_) { metric->sendMetrics(); }
00362 last_send_time = now;
00363 }
00364 }
00365
00366 auto processing_start = std::chrono::steady_clock::now();
00367 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00368 {
00369 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00370
00371 for (auto& q : metric_queue_)
00372 {
00373 temp_list.splice(temp_list.end(), q.second.second);
00374 q.second.first = 0;
00375 }
00376
00377 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00378 auto missed = missed_metric_calls_.exchange(0);
00379
00380 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00381 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00382 }
00383
00384 while (temp_list.size() > 0)
00385 {
00386 auto data_ = std::move(temp_list.front());
00387 temp_list.pop_front();
00388 if (data_->Type == MetricType::InvalidMetric) continue;
00389 if (!data_->UseNameOverride)
00390 {
00391 if (data_->MetricPrefix.size() > 0)
00392 {
00393 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00394 }
00395 else
00396 {
00397 data_->Name = prefix_ + "." + data_->Name;
00398 }
00399 }
00400
00401 for (auto& metric : metric_plugins_)
00402 {
00403 if (metric->getRunLevel() >= data_->Level)
00404 {
00405 try
00406 {
00407 metric->addMetricData(*data_);
00408 last_send_time = std::chrono::steady_clock::now();
00409 }
00410 catch (...)
00411 {
00412 TLOG(TLVL_ERROR) <<
00413 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00414 << metric->getLibName() ;
00415 }
00416 }
00417 }
00418 }
00419
00420 for (auto& metric : metric_plugins_)
00421 {
00422 metric->sendMetrics(false, processing_start);
00423 }
00424 }
00425
00426 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00427 {
00428 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00429
00430 for (auto& q : metric_queue_)
00431 {
00432 temp_list.splice(temp_list.end(), q.second.second);
00433 }
00434 metric_queue_.clear();
00435
00436 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00437 auto missed = missed_metric_calls_.exchange(0);
00438
00439 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false));
00440 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" ;
00441 }
00442
00443 while (temp_list.size() > 0)
00444 {
00445 auto data_ = std::move(temp_list.front());
00446 temp_list.pop_front();
00447 if (data_->Type == MetricType::InvalidMetric) continue;
00448 if (!data_->UseNameOverride)
00449 {
00450 if (data_->MetricPrefix.size() > 0)
00451 {
00452 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00453 }
00454 else
00455 {
00456 data_->Name = prefix_ + "." + data_->Name;
00457 }
00458 }
00459
00460 for (auto& metric : metric_plugins_)
00461 {
00462 if (metric->getRunLevel() >= data_->Level)
00463 {
00464 try
00465 {
00466 metric->addMetricData(*data_);
00467 last_send_time = std::chrono::steady_clock::now();
00468 }
00469 catch (...)
00470 {
00471 TLOG(TLVL_ERROR) <<
00472 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00473 << metric->getLibName() ;
00474 }
00475 }
00476 }
00477 }
00478
00479 for (auto& metric : metric_plugins_)
00480 {
00481 try
00482 {
00483 metric->stopMetrics();
00484 TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ;
00485 }
00486 catch (...)
00487 {
00488 TLOG(TLVL_ERROR) <<
00489 "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
00490 metric->getLibName() ;
00491 }
00492 }
00493 TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ;
00494 }