00001
00002
00003
00004
00005
00006
00007
00008
00009 #define TRACE_NAME "MetricManager"
00010 #include "tracemf.h"
00011 #include "artdaq-utilities/Plugins/MetricManager.hh"
00012 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh"
00013 #include "fhiclcpp/ParameterSet.h"
00014
00015 #include <chrono>
00016 #include <boost/exception/all.hpp>
00017
00018 artdaq::MetricManager::
00019 MetricManager() : metric_plugins_(0)
00020 , initialized_(false)
00021 , running_(false)
00022 , active_(false)
00023 , missed_metric_calls_(0)
00024 , metric_queue_max_size_(1000)
00025 , metric_queue_notify_size_(10)
00026 {}
00027
00028 artdaq::MetricManager::~MetricManager()
00029 {
00030 shutdown();
00031 }
00032
00033 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string prefix)
00034 {
00035 prefix_ = prefix;
00036 if (initialized_)
00037 {
00038 shutdown();
00039 }
00040 TLOG_INFO("MetricManager") << "Configuring metrics with parameter set:\n" << pset.to_string() << TLOG_ENDL;
00041
00042 std::vector<std::string> names = pset.get_pset_names();
00043
00044 for (auto name : names)
00045 {
00046 if (name == "metric_queue_size")
00047 {
00048 metric_queue_max_size_ = pset.get<size_t>("metric_queue_size");
00049 }
00050 else if (name == "metric_queue_notify_size")
00051 {
00052 metric_queue_notify_size_ = pset.get<size_t>("metric_queue_notify_size");
00053 }
00054 else
00055 {
00056 try
00057 {
00058 TLOG_DEBUG("MetricManager") << "Constructing metric plugin with name " << name << TLOG_ENDL;
00059 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name);
00060 metric_plugins_.push_back(makeMetricPlugin(
00061 plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset));
00062 }
00063 catch (const cet::exception& e)
00064 {
00065 TLOG_ERROR("MetricManager") << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00066 ", cet::exception object caught:" << e.explain_self() << TLOG_ENDL;
00067 }
00068 catch (const boost::exception& e)
00069 {
00070 TLOG_ERROR("MetricManager") << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00071 ", boost::exception object caught: " << boost::diagnostic_information(e) << TLOG_ENDL;
00072 }
00073 catch (const std::exception& e)
00074 {
00075 TLOG_ERROR("MetricManager") << "Exception caught in MetricManager::initialize, error loading plugin with name " << name <<
00076 ", std::exception caught: " << e.what() << TLOG_ENDL;
00077 }
00078 catch (...)
00079 {
00080 TLOG_ERROR("MetricManager") << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " << name << TLOG_ENDL;
00081 }
00082 }
00083 }
00084
00085 initialized_ = true;
00086 }
00087
00088 void artdaq::MetricManager::do_start()
00089 {
00090 if (!running_)
00091 {
00092 TLOG_DEBUG("MetricManager") << "Starting MetricManager" << TLOG_ENDL;
00093 for (auto& metric : metric_plugins_)
00094 {
00095 try
00096 {
00097 metric->startMetrics();
00098 TLOG_INFO("MetricManager") << "Metric Plugin " << metric->getLibName() << " started." << TLOG_ENDL;
00099 active_ = true;
00100 }
00101 catch (...)
00102 {
00103 TLOG_ERROR("MetricManager") <<
00104 "Exception caught in MetricManager::do_start(), error starting plugin with name " <<
00105 metric->getLibName() << TLOG_ENDL;
00106 }
00107 }
00108 running_ = true;
00109 startMetricLoop_();
00110 }
00111 }
00112
00113 void artdaq::MetricManager::do_stop()
00114 {
00115 TLOG_DEBUG("MetricManager") << "Stopping Metrics" << TLOG_ENDL;
00116 running_ = false;
00117 metric_cv_.notify_all();
00118 TLOG_DEBUG("MetricManager") << "Joining Metric-Sending thread" << TLOG_ENDL;
00119 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00120 TLOG_DEBUG("MetricManager") << "do_stop Complete" << TLOG_ENDL;
00121 }
00122
00123 void artdaq::MetricManager::do_pause() { }
00124 void artdaq::MetricManager::do_resume() { }
00125
00126 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string prefix)
00127 {
00128 shutdown();
00129 initialize(pset, prefix);
00130 }
00131
00132 void artdaq::MetricManager::shutdown()
00133 {
00134 TLOG_DEBUG("MetricManager") << "MetricManager is shutting down..." << TLOG_ENDL;
00135 do_stop();
00136
00137 if (initialized_)
00138 {
00139 for (auto& i : metric_plugins_)
00140 {
00141 try
00142 {
00143 std::string name = i->getLibName();
00144 i.reset(nullptr);
00145 TLOG_DEBUG("MetricManager") << "Metric Plugin " << name << " shutdown." << TLOG_ENDL;
00146 }
00147 catch (...)
00148 {
00149 TLOG_ERROR("MetricManager") <<
00150 "Exception caught in MetricManager::shutdown(), error shutting down metric with name " <<
00151 i->getLibName() << TLOG_ENDL;
00152 }
00153 }
00154 initialized_ = false;
00155 }
00156 }
00157
00158 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00159 {
00160 if (!initialized_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager has not yet been initialized!" << TLOG_ENDL; }
00161 else if (!running_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager stopped!" << TLOG_ENDL; }
00162 else if (active_)
00163 {
00164 if (!metric_queue_.count(name)) {
00165 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00166 }
00167 auto entry = &(metric_queue_[name]);
00168
00169 auto size = entry->first;
00170 if (size < metric_queue_max_size_)
00171 {
00172 if (size >= metric_queue_notify_size_) TLOG_ARB(9, "MetricManager") << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." << TLOG_ENDL;
00173 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00174 {
00175 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00176 entry->first++;
00177 entry->second.emplace_back(std::move(metric));
00178 }
00179 }
00180 else
00181 {
00182 TLOG_ARB(10, "MetricManager") << "Rejecting metric because queue full" << TLOG_ENDL;
00183 missed_metric_calls_++;
00184 }
00185 metric_cv_.notify_all();
00186 }
00187 }
00188
00189 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00190 {
00191 if (!initialized_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager has not yet been initialized!" << TLOG_ENDL; }
00192 else if (!running_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager stopped!" << TLOG_ENDL; }
00193 else if (active_)
00194 {
00195 if (!metric_queue_.count(name)) {
00196 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00197 }
00198 auto entry = &(metric_queue_[name]);
00199
00200 auto size = entry->first;
00201 if (size < metric_queue_max_size_)
00202 {
00203 if (size >= metric_queue_notify_size_) TLOG_ARB(9, "MetricManager") << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." << TLOG_ENDL;
00204 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00205 {
00206 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00207 entry->first++;
00208 entry->second.emplace_back(std::move(metric));
00209 }
00210 }
00211 else
00212 {
00213 TLOG_ARB(10, "MetricManager") << "Rejecting metric because queue full" << TLOG_ENDL;
00214 missed_metric_calls_++;
00215 }
00216 metric_cv_.notify_all();
00217 }
00218 }
00219
00220 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00221 {
00222 if (!initialized_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager has not yet been initialized!" << TLOG_ENDL; }
00223 else if (!running_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager stopped!" << TLOG_ENDL; }
00224 else if (active_)
00225 {
00226 if (!metric_queue_.count(name)) {
00227 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00228 }
00229 auto entry = &(metric_queue_[name]);
00230
00231 auto size = entry->first;
00232 if (size < metric_queue_max_size_)
00233 {
00234 if (size >= metric_queue_notify_size_) TLOG_ARB(9, "MetricManager") << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." << TLOG_ENDL;
00235 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00236 {
00237 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00238 entry->first++;
00239 entry->second.emplace_back(std::move(metric));
00240 }
00241 }
00242 else
00243 {
00244 TLOG_ARB(10, "MetricManager") << "Rejecting metric because queue full" << TLOG_ENDL;
00245 missed_metric_calls_++;
00246 }
00247 metric_cv_.notify_all();
00248 }
00249 }
00250
00251 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00252 {
00253 if (!initialized_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager has not yet been initialized!" << TLOG_ENDL; }
00254 else if (!running_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager stopped!" << TLOG_ENDL; }
00255 else if (active_)
00256 {
00257 if (!metric_queue_.count(name)) {
00258 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00259 }
00260 auto entry = &(metric_queue_[name]);
00261
00262 auto size = entry->first;
00263 if (size < metric_queue_max_size_)
00264 {
00265 if (size >= metric_queue_notify_size_) TLOG_ARB(9, "MetricManager") << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." << TLOG_ENDL;
00266 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00267 {
00268 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00269 entry->first++;
00270 entry->second.emplace_back(std::move(metric));
00271 }
00272 }
00273 else
00274 {
00275 TLOG_ARB(10, "MetricManager") << "Rejecting metric because queue full" << TLOG_ENDL;
00276 missed_metric_calls_++;
00277 }
00278 metric_cv_.notify_all();
00279 }
00280 }
00281
00282 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, int level, MetricMode mode, std::string const& metricPrefix, bool useNameOverride)
00283 {
00284 if (!initialized_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager has not yet been initialized!" << TLOG_ENDL; }
00285 else if (!running_) { TLOG_WARNING("MetricManager") << "Attempted to send metric when MetricManager stopped!" << TLOG_ENDL; }
00286 else if (active_)
00287 {
00288 if (!metric_queue_.count(name)) {
00289 metric_queue_[name] = std::make_pair<size_t, std::list<metric_data_ptr>>(0, std::list<metric_data_ptr>());
00290 }
00291 auto entry = &(metric_queue_[name]);
00292
00293 auto size = entry->first;
00294 if (size < metric_queue_max_size_)
00295 {
00296 if (size >= metric_queue_notify_size_) TLOG_ARB(9, "MetricManager") << "Metric queue is at size " << size << " of " << metric_queue_max_size_ << "." << TLOG_ENDL;
00297 std::unique_ptr<MetricData> metric(new MetricData(name, value, unit, level, mode, metricPrefix, useNameOverride));
00298 {
00299 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00300 entry->first++;
00301 entry->second.emplace_back(std::move(metric));
00302 }
00303 }
00304 else
00305 {
00306 TLOG_ARB(10, "MetricManager") << "Rejecting metric because queue full" << TLOG_ENDL;
00307 missed_metric_calls_++;
00308 }
00309 metric_cv_.notify_all();
00310 }
00311 }
00312
00313 void artdaq::MetricManager::startMetricLoop_()
00314 {
00315 if (metric_sending_thread_.joinable()) metric_sending_thread_.join();
00316 TLOG_INFO("MetricManager") << "Starting Metric Sending Thread" << TLOG_ENDL;
00317 boost::thread::attributes attrs;
00318 attrs.set_stack_size(4096 * 200);
00319 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this));
00320 }
00321
00322 bool artdaq::MetricManager::metricQueueEmpty()
00323 {
00324 for (auto& q : metric_queue_)
00325 {
00326 if (q.second.first != 0) return false;
00327 }
00328 return true;
00329 }
00330
00331 size_t artdaq::MetricManager::metricQueueSize(std::string name)
00332 {
00333 size_t size = 0;
00334 if (name == "") {
00335
00336 }
00337 else {
00338 if (metric_queue_.count(name)) size = metric_queue_[name].first;
00339 }
00340
00341 return size;
00342 }
00343
00344 void artdaq::MetricManager::sendMetricLoop_()
00345 {
00346 auto last_send_time = std::chrono::steady_clock::time_point();
00347 while (running_)
00348 {
00349 while (metricQueueEmpty() && running_)
00350 {
00351 std::unique_lock<std::mutex> lk(metric_mutex_);
00352 metric_cv_.wait_for(lk, std::chrono::milliseconds(100));
00353 auto now = std::chrono::steady_clock::now();
00354 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > metric_send_interval_ms_)
00355 {
00356 for (auto& metric : metric_plugins_) { metric->sendMetrics(); }
00357 last_send_time = now;
00358 }
00359 }
00360
00361 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00362 {
00363 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00364
00365 for (auto& q : metric_queue_)
00366 {
00367 temp_list.splice(temp_list.end(), q.second.second);
00368 q.second.first = 0;
00369 }
00370
00371 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::Accumulate, "", false));
00372 auto missed = missed_metric_calls_.exchange(0);
00373
00374 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::Accumulate, "", false));
00375 TLOG_TRACE("MetricManager") << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" << TLOG_ENDL;
00376 }
00377
00378 while (temp_list.size() > 0)
00379 {
00380 auto data_ = std::move(temp_list.front());
00381 temp_list.pop_front();
00382 if (data_->Type == MetricType::InvalidMetric) continue;
00383 if (!data_->UseNameOverride)
00384 {
00385 if (data_->MetricPrefix.size() > 0)
00386 {
00387 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00388 }
00389 else
00390 {
00391 data_->Name = prefix_ + "." + data_->Name;
00392 }
00393 }
00394
00395 for (auto& metric : metric_plugins_)
00396 {
00397 if (metric->getRunLevel() >= data_->Level)
00398 {
00399 try
00400 {
00401 metric->addMetricData(*data_);
00402 last_send_time = std::chrono::steady_clock::now();
00403 }
00404 catch (...)
00405 {
00406 TLOG_ERROR("MetricManager") <<
00407 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00408 << metric->getLibName() << TLOG_ENDL;
00409 }
00410 }
00411 }
00412 }
00413
00414 for (auto& metric : metric_plugins_)
00415 {
00416 metric->sendMetrics();
00417 }
00418 }
00419
00420 auto temp_list = std::list<std::unique_ptr<MetricData>>();
00421 {
00422 std::unique_lock<std::mutex> lk(metric_queue_mutex_);
00423
00424 for (auto& q : metric_queue_)
00425 {
00426 temp_list.splice(temp_list.end(), q.second.second);
00427 }
00428 metric_queue_.clear();
00429
00430 temp_list.emplace_back(new MetricData("Metric Calls", temp_list.size(), "metrics", 4, MetricMode::Accumulate, "", false));
00431 auto missed = missed_metric_calls_.exchange(0);
00432
00433 temp_list.emplace_back(new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::Accumulate, "", false));
00434 TLOG_TRACE("MetricManager") << "There are " << temp_list.size() << " Metric Calls to process (missed " << missed << ")" << TLOG_ENDL;
00435 }
00436
00437 while (temp_list.size() > 0)
00438 {
00439 auto data_ = std::move(temp_list.front());
00440 temp_list.pop_front();
00441 if (data_->Type == MetricType::InvalidMetric) continue;
00442 if (!data_->UseNameOverride)
00443 {
00444 if (data_->MetricPrefix.size() > 0)
00445 {
00446 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name;
00447 }
00448 else
00449 {
00450 data_->Name = prefix_ + "." + data_->Name;
00451 }
00452 }
00453
00454 for (auto& metric : metric_plugins_)
00455 {
00456 if (metric->getRunLevel() >= data_->Level)
00457 {
00458 try
00459 {
00460 metric->addMetricData(*data_);
00461 last_send_time = std::chrono::steady_clock::now();
00462 }
00463 catch (...)
00464 {
00465 TLOG_ERROR("MetricManager") <<
00466 "Error in MetricManager::sendMetric: error sending value to metric plugin with name "
00467 << metric->getLibName() << TLOG_ENDL;
00468 }
00469 }
00470 }
00471 }
00472
00473 for (auto& metric : metric_plugins_)
00474 {
00475 try
00476 {
00477 metric->stopMetrics();
00478 TLOG_DEBUG("MetricManager") << "Metric Plugin " << metric->getLibName() << " stopped." << TLOG_ENDL;
00479 }
00480 catch (...)
00481 {
00482 TLOG_ERROR("MetricManager") <<
00483 "Exception caught in MetricManager::do_stop(), error stopping plugin with name " <<
00484 metric->getLibName() << TLOG_ENDL;
00485 }
00486 }
00487 TLOG_DEBUG("MetricManager") << "MetricManager has been stopped." << TLOG_ENDL;
00488 }