$treeview $search $mathjax $extrastylesheet
artdaq_utilities
v1_04_10
$projectbrief
|
$projectbrief
|
$searchbox |
00001 // MetricManager.cc: MetricManager class implementation file 00002 // Author: Eric Flumerfelt 00003 // Last Modified: 11/14/2014 00004 // 00005 // MetricManager loads a user-specified set of plugins, sends them their configuration, 00006 // and sends them data as it is recieved. It also maintains the state of the plugins 00007 // relative to the application state. 00008 00009 #define TRACE_NAME "MetricManager" 00010 #include "artdaq-utilities/Plugins/MetricManager.hh" 00011 #include "artdaq-utilities/Plugins/makeMetricPlugin.hh" 00012 #include "fhiclcpp/ParameterSet.h" 00013 #include "tracemf.h" 00014 00015 #include <boost/exception/all.hpp> 00016 #include <chrono> 00017 00018 artdaq::MetricManager::MetricManager() 00019 : metric_plugins_(0), 00020 metric_send_interval_ms_(15000), 00021 initialized_(false), 00022 running_(false), 00023 active_(false), 00024 missed_metric_calls_(0), 00025 metric_calls_(0), 00026 metric_cache_max_size_(1000), 00027 metric_cache_notify_size_(10) {} 00028 00029 artdaq::MetricManager::~MetricManager() noexcept { shutdown(); } 00030 00031 void artdaq::MetricManager::initialize(fhicl::ParameterSet const& pset, std::string const& prefix) { 00032 prefix_ = prefix; 00033 if (initialized_) { 00034 shutdown(); 00035 } 00036 TLOG(TLVL_INFO) << "Configuring metrics with parameter set: " << pset.to_string() ; 00037 00038 std::vector<std::string> names = pset.get_pset_names(); 00039 00040 metric_plugins_.clear(); 00041 00042 for (auto name : names) { 00043 if (name == "metric_queue_size") { 00044 metric_cache_max_size_ = pset.get<size_t>("metric_queue_size"); 00045 } else if (name == "metric_queue_notify_size") { 00046 metric_cache_notify_size_ = pset.get<size_t>("metric_queue_notify_size"); 00047 } else if (name == "metric_cache_size") { 00048 metric_cache_max_size_ = pset.get<size_t>("metric_cache_size"); 00049 } else if (name == "metric_cache_notify_size") { 00050 metric_cache_notify_size_ = pset.get<size_t>("metric_cache_notify_size"); 00051 } else if (name == "metric_send_maximum_delay_ms") { 00052 metric_send_interval_ms_ = pset.get<int>("metric_send_maximum_delay_ms"); 00053 } else { 00054 try { 00055 TLOG(TLVL_DEBUG) << "Constructing metric plugin with name " << name ; 00056 fhicl::ParameterSet plugin_pset = pset.get<fhicl::ParameterSet>(name); 00057 metric_plugins_.push_back( 00058 makeMetricPlugin(plugin_pset.get<std::string>("metricPluginType", ""), plugin_pset, prefix_)); 00059 } catch (const cet::exception& e) { 00060 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name 00061 << ", cet::exception object caught:" << e.explain_self(); 00062 } catch (const boost::exception& e) { 00063 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name 00064 << ", boost::exception object caught: " << boost::diagnostic_information(e); 00065 } catch (const std::exception& e) { 00066 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::initialize, error loading plugin with name " << name 00067 << ", std::exception caught: " << e.what(); 00068 } catch (...) { 00069 TLOG(TLVL_ERROR) << "Unknown Exception caught in MetricManager::initialize, error loading plugin with name " 00070 << name; 00071 } 00072 } 00073 } 00074 00075 initialized_ = true; 00076 } 00077 00078 void artdaq::MetricManager::do_start() { 00079 auto lk = std::unique_lock<std::mutex>(metric_mutex_); 00080 if (!running_) { 00081 TLOG(TLVL_DEBUG) << "Starting MetricManager" ; 00082 for (auto& metric : metric_plugins_) { 00083 if (!metric) continue; 00084 try { 00085 metric->startMetrics(); 00086 TLOG(TLVL_INFO) << "Metric Plugin " << metric->getLibName() << " started." ; 00087 active_ = true; 00088 } catch (...) { 00089 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::do_start(), error starting plugin with name " 00090 << metric->getLibName(); 00091 } 00092 } 00093 running_ = true; 00094 startMetricLoop_(); 00095 } 00096 } 00097 00098 void artdaq::MetricManager::do_stop() { 00099 auto lk = std::unique_lock<std::mutex>(metric_mutex_); 00100 TLOG(TLVL_DEBUG) << "Stopping Metrics" ; 00101 running_ = false; 00102 metric_cv_.notify_all(); 00103 TLOG(TLVL_DEBUG) << "Joining Metric-Sending thread" ; 00104 lk.unlock(); 00105 if (metric_sending_thread_.joinable()) metric_sending_thread_.join(); 00106 TLOG(TLVL_DEBUG) << "do_stop Complete" ; 00107 } 00108 00109 void artdaq::MetricManager::do_pause() { /*do_stop();*/ 00110 } 00111 void artdaq::MetricManager::do_resume() { /*do_start();*/ 00112 } 00113 00114 void artdaq::MetricManager::reinitialize(fhicl::ParameterSet const& pset, std::string const& prefix) { 00115 shutdown(); 00116 initialize(pset, prefix); 00117 } 00118 00119 void artdaq::MetricManager::shutdown() { 00120 TLOG(TLVL_DEBUG) << "MetricManager is shutting down..." ; 00121 do_stop(); 00122 00123 auto lk = std::unique_lock<std::mutex>(metric_mutex_); 00124 if (initialized_) { 00125 for (auto& i : metric_plugins_) { 00126 try { 00127 std::string name = i->getLibName(); 00128 i.reset(nullptr); 00129 TLOG(TLVL_DEBUG) << "Metric Plugin " << name << " shutdown." ; 00130 } catch (...) { 00131 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::shutdown(), error shutting down metric with name " 00132 << i->getLibName(); 00133 } 00134 } 00135 metric_plugins_.clear(); 00136 initialized_ = false; 00137 } 00138 } 00139 00140 void artdaq::MetricManager::sendMetric(std::string const& name, std::string const& value, std::string const& unit, 00141 int level, MetricMode mode, std::string const& metricPrefix, 00142 bool useNameOverride) { 00143 if (!initialized_) { 00144 TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!"; 00145 } else if (!running_) { 00146 TLOG(TLVL_INFO) << "Attempted to send metric when MetricManager stopped!"; 00147 } else if (active_) { 00148 { 00149 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00150 metric_calls_++; 00151 if (!metric_cache_.count(name) || metric_cache_[name] == nullptr) { 00152 metric_cache_[name] = 00153 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride); 00154 } else { 00155 auto size = metric_cache_[name]->DataPointCount; 00156 if (size < metric_cache_max_size_) { 00157 if (size >= metric_cache_notify_size_) { 00158 TLOG(9) << "Metric cache is at size " << size << " of " << metric_cache_max_size_ << " for metric " << name 00159 << "."; 00160 } 00161 if (mode == MetricMode::LastPoint) { 00162 metric_cache_[name]->StringValue = value; 00163 metric_cache_[name]->DataPointCount = 1; 00164 } else { 00165 metric_cache_[name]->StringValue += " " + value; 00166 metric_cache_[name]->DataPointCount++; 00167 } 00168 } else { 00169 TLOG(10) << "Rejecting metric because queue full" ; 00170 missed_metric_calls_++; 00171 } 00172 } 00173 } 00174 metric_cv_.notify_all(); 00175 } 00176 } 00177 00178 void artdaq::MetricManager::sendMetric(std::string const& name, int const& value, std::string const& unit, int level, 00179 MetricMode mode, std::string const& metricPrefix, bool useNameOverride) { 00180 if (!initialized_) { 00181 TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!"; 00182 } else if (!running_) { 00183 TLOG(TLVL_INFO) << "Attempted to send metric when MetricManager stopped!"; 00184 } else if (active_) { 00185 { 00186 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00187 metric_calls_++; 00188 if (!metric_cache_.count(name) || metric_cache_[name] == nullptr) { 00189 metric_cache_[name] = 00190 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride); 00191 } else { 00192 auto size = metric_cache_[name]->DataPointCount; 00193 if (size < metric_cache_max_size_) { 00194 if (size >= metric_cache_notify_size_) { 00195 TLOG(9) << "Metric cache is at size " << size << " of " << metric_cache_max_size_ << " for metric " << name 00196 << "."; 00197 } 00198 if (mode == MetricMode::LastPoint) { 00199 metric_cache_[name]->IntValue = value; 00200 metric_cache_[name]->DataPointCount = 1; 00201 } else { 00202 metric_cache_[name]->IntValue += value; 00203 metric_cache_[name]->DataPointCount++; 00204 } 00205 } else { 00206 TLOG(10) << "Rejecting metric because queue full" ; 00207 missed_metric_calls_++; 00208 } 00209 } 00210 } 00211 metric_cv_.notify_all(); 00212 } 00213 } 00214 00215 void artdaq::MetricManager::sendMetric(std::string const& name, double const& value, std::string const& unit, int level, 00216 MetricMode mode, std::string const& metricPrefix, bool useNameOverride) { 00217 if (!initialized_) { 00218 TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!"; 00219 } else if (!running_) { 00220 TLOG(TLVL_INFO) << "Attempted to send metric when MetricManager stopped!"; 00221 } else if (active_) { 00222 { 00223 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00224 metric_calls_++; 00225 if (!metric_cache_.count(name) || metric_cache_[name] == nullptr) { 00226 metric_cache_[name] = 00227 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride); 00228 } else { 00229 auto size = metric_cache_[name]->DataPointCount; 00230 if (size < metric_cache_max_size_) { 00231 if (size >= metric_cache_notify_size_) { 00232 TLOG(9) << "Metric cache is at size " << size << " of " << metric_cache_max_size_ << " for metric " << name 00233 << "."; 00234 } 00235 if (mode == MetricMode::LastPoint) { 00236 metric_cache_[name]->DoubleValue = value; 00237 metric_cache_[name]->DataPointCount = 1; 00238 } else { 00239 metric_cache_[name]->DoubleValue += value; 00240 metric_cache_[name]->DataPointCount++; 00241 } 00242 } else { 00243 TLOG(10) << "Rejecting metric because queue full" ; 00244 missed_metric_calls_++; 00245 } 00246 } 00247 } 00248 metric_cv_.notify_all(); 00249 } 00250 } 00251 00252 void artdaq::MetricManager::sendMetric(std::string const& name, float const& value, std::string const& unit, int level, 00253 MetricMode mode, std::string const& metricPrefix, bool useNameOverride) { 00254 if (!initialized_) { 00255 TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!"; 00256 } else if (!running_) { 00257 TLOG(TLVL_INFO) << "Attempted to send metric when MetricManager stopped!"; 00258 } else if (active_) { 00259 { 00260 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00261 metric_calls_++; 00262 if (!metric_cache_.count(name) || metric_cache_[name] == nullptr) { 00263 metric_cache_[name] = 00264 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride); 00265 } else { 00266 auto size = metric_cache_[name]->DataPointCount; 00267 if (size < metric_cache_max_size_) { 00268 if (size >= metric_cache_notify_size_) { 00269 TLOG(9) << "Metric cache is at size " << size << " of " << metric_cache_max_size_ << " for metric " << name 00270 << "."; 00271 } 00272 if (mode == MetricMode::LastPoint) { 00273 metric_cache_[name]->FloatValue = value; 00274 metric_cache_[name]->DataPointCount = 1; 00275 } else { 00276 metric_cache_[name]->FloatValue += value; 00277 metric_cache_[name]->DataPointCount++; 00278 } 00279 } else { 00280 TLOG(10) << "Rejecting metric because queue full" ; 00281 missed_metric_calls_++; 00282 } 00283 } 00284 } 00285 metric_cv_.notify_all(); 00286 } 00287 } 00288 00289 void artdaq::MetricManager::sendMetric(std::string const& name, long unsigned int const& value, std::string const& unit, 00290 int level, MetricMode mode, std::string const& metricPrefix, 00291 bool useNameOverride) { 00292 if (!initialized_) { 00293 TLOG(TLVL_WARNING) << "Attempted to send metric when MetricManager has not yet been initialized!"; 00294 } else if (!running_) { 00295 TLOG(TLVL_INFO) << "Attempted to send metric when MetricManager stopped!"; 00296 } else if (active_) { 00297 { 00298 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00299 metric_calls_++; 00300 if (!metric_cache_.count(name) || metric_cache_[name] == nullptr) { 00301 metric_cache_[name] = 00302 std::make_unique<MetricData>(name, value, unit, level, mode, metricPrefix, useNameOverride); 00303 } else { 00304 auto size = metric_cache_[name]->DataPointCount; 00305 if (size < metric_cache_max_size_) { 00306 if (size >= metric_cache_notify_size_) { 00307 TLOG(9) << "Metric cache is at size " << size << " of " << metric_cache_max_size_ << " for metric " << name 00308 << "."; 00309 } 00310 if (mode == MetricMode::LastPoint) { 00311 metric_cache_[name]->UnsignedValue = value; 00312 metric_cache_[name]->DataPointCount = 1; 00313 } else { 00314 metric_cache_[name]->UnsignedValue += value; 00315 metric_cache_[name]->DataPointCount++; 00316 } 00317 } else { 00318 TLOG(10) << "Rejecting metric because queue full" ; 00319 missed_metric_calls_++; 00320 } 00321 } 00322 } 00323 metric_cv_.notify_all(); 00324 } 00325 } 00326 00327 void artdaq::MetricManager::startMetricLoop_() { 00328 if (metric_sending_thread_.joinable()) metric_sending_thread_.join(); 00329 boost::thread::attributes attrs; 00330 attrs.set_stack_size(4096 * 2000); // 8000 KB 00331 TLOG(TLVL_INFO) << "Starting Metric Sending Thread" ; 00332 try { 00333 metric_sending_thread_ = boost::thread(attrs, boost::bind(&MetricManager::sendMetricLoop_, this)); 00334 } catch (const boost::exception& e) { 00335 TLOG(TLVL_ERROR) << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) 00336 << ", errno=" << errno; 00337 std::cerr << "Caught boost::exception starting Metric Sending thread: " << boost::diagnostic_information(e) 00338 << ", errno=" << errno << std::endl; 00339 exit(5); 00340 } 00341 TLOG(TLVL_INFO) << "Metric Sending thread started"; 00342 } 00343 00344 bool artdaq::MetricManager::metricQueueEmpty() { 00345 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00346 return metric_cache_.size() == 0; 00347 } 00348 00349 size_t artdaq::MetricManager::metricQueueSize(std::string const& name) { 00350 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00351 size_t size = 0; 00352 if (name == "") { 00353 for (auto& q : metric_cache_) { 00354 size += q.second->DataPointCount; 00355 } 00356 } else { 00357 if (metric_cache_.count(name)) size = metric_cache_[name]->DataPointCount; 00358 } 00359 00360 return size; 00361 } 00362 00363 void artdaq::MetricManager::sendMetricLoop_() { 00364 TLOG(TLVL_INFO) << "sendMetricLoop_ START"; 00365 auto last_send_time = std::chrono::steady_clock::time_point(); 00366 while (running_) { 00367 while (metricQueueEmpty() && running_) { 00368 std::unique_lock<std::mutex> lk(metric_mutex_); 00369 metric_cv_.wait_for(lk, std::chrono::milliseconds(100)); 00370 auto now = std::chrono::steady_clock::now(); 00371 if (std::chrono::duration_cast<std::chrono::milliseconds>(now - last_send_time).count() > 00372 metric_send_interval_ms_) { 00373 for (auto& metric : metric_plugins_) { 00374 if (metric) metric->sendMetrics(); 00375 } 00376 last_send_time = now; 00377 } 00378 } 00379 00380 auto processing_start = std::chrono::steady_clock::now(); 00381 auto temp_list = std::list<std::unique_ptr<MetricData>>(); 00382 { 00383 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00384 00385 for (auto& q : metric_cache_) { 00386 temp_list.emplace_back(std::move(q.second)); 00387 } 00388 metric_cache_.clear(); 00389 00390 auto calls = metric_calls_.exchange(0); 00391 temp_list.emplace_back( 00392 new MetricData("Metric Calls", calls, "metrics", 4, MetricMode::AccumulateAndRate, "", false)); 00393 00394 auto missed = missed_metric_calls_.exchange(0); 00395 temp_list.emplace_back( 00396 new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false)); 00397 00398 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metrics to process (" << calls << " calls, " << missed 00399 << " missed)"; 00400 } 00401 00402 while (temp_list.size() > 0) { 00403 auto data_ = std::move(temp_list.front()); 00404 temp_list.pop_front(); 00405 if (data_->Type == MetricType::InvalidMetric) continue; 00406 if (!data_->UseNameOverride) { 00407 if (data_->MetricPrefix.size() > 0) { 00408 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name; 00409 } else { 00410 data_->Name = prefix_ + "." + data_->Name; 00411 } 00412 } 00413 00414 for (auto& metric : metric_plugins_) { 00415 if (!metric) continue; 00416 if (metric->getRunLevel() >= data_->Level) { 00417 try { 00418 metric->addMetricData(data_); 00419 last_send_time = std::chrono::steady_clock::now(); 00420 } catch (...) { 00421 TLOG(TLVL_ERROR) << "Error in MetricManager::sendMetric: error sending value to metric plugin with name " 00422 << metric->getLibName() ; 00423 } 00424 } 00425 } 00426 } 00427 00428 for (auto& metric : metric_plugins_) { 00429 if (!metric) continue; 00430 metric->sendMetrics(false, processing_start); 00431 } 00432 00433 // Limit rate of metrics going to plugins 00434 usleep(10000); 00435 } 00436 00437 auto temp_list = std::list<std::unique_ptr<MetricData>>(); 00438 { 00439 std::unique_lock<std::mutex> lk(metric_cache_mutex_); 00440 00441 for (auto& q : metric_cache_) { 00442 temp_list.emplace_back(std::move(q.second)); 00443 } 00444 metric_cache_.clear(); 00445 00446 auto calls = metric_calls_.exchange(0); 00447 temp_list.emplace_back( 00448 new MetricData("Metric Calls", calls, "metrics", 4, MetricMode::AccumulateAndRate, "", false)); 00449 00450 auto missed = missed_metric_calls_.exchange(0); 00451 temp_list.emplace_back( 00452 new MetricData("Missed Metric Calls", missed, "metrics", 4, MetricMode::AccumulateAndRate, "", false)); 00453 00454 TLOG(TLVL_TRACE) << "There are " << temp_list.size() << " Metrics to process (" << calls << " calls, " << missed 00455 << " missed)"; 00456 } 00457 00458 while (temp_list.size() > 0) { 00459 auto data_ = std::move(temp_list.front()); 00460 temp_list.pop_front(); 00461 if (data_->Type == MetricType::InvalidMetric) continue; 00462 if (!data_->UseNameOverride) { 00463 if (data_->MetricPrefix.size() > 0) { 00464 data_->Name = prefix_ + "." + data_->MetricPrefix + "." + data_->Name; 00465 } else { 00466 data_->Name = prefix_ + "." + data_->Name; 00467 } 00468 } 00469 00470 for (auto& metric : metric_plugins_) { 00471 if (!metric) continue; 00472 if (metric->getRunLevel() >= data_->Level) { 00473 try { 00474 metric->addMetricData(data_); 00475 last_send_time = std::chrono::steady_clock::now(); 00476 } catch (...) { 00477 TLOG(TLVL_ERROR) << "Error in MetricManager::sendMetric: error sending value to metric plugin with name " 00478 << metric->getLibName() ; 00479 } 00480 } 00481 } 00482 } 00483 00484 for (auto& metric : metric_plugins_) { 00485 if (!metric) continue; 00486 try { 00487 metric->stopMetrics(); 00488 TLOG(TLVL_DEBUG) << "Metric Plugin " << metric->getLibName() << " stopped." ; 00489 } catch (...) { 00490 TLOG(TLVL_ERROR) << "Exception caught in MetricManager::do_stop(), error stopping plugin with name " 00491 << metric->getLibName(); 00492 } 00493 } 00494 TLOG(TLVL_DEBUG) << "MetricManager has been stopped." ; 00495 }