00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib/exception.h"
00012
00013 #include "artdaq-core/Data/Fragment.hh"
00014 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00015
00016 #include "artdaq/Application/RoutingMasterCore.hh"
00017 #include "artdaq/DAQdata/Globals.hh"
00018 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00019 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00020 #include "artdaq/DAQdata/TCPConnect.hh"
00021
00022 #define TRACE_NAME "RoutingMasterCore"
00023
00024 const std::string artdaq::RoutingMasterCore::
00025 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
00026 const std::string artdaq::RoutingMasterCore::
00027 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00028
00029 artdaq::RoutingMasterCore::RoutingMasterCore(int rank, std::string name) :
00030 name_(name)
00031 , received_token_counter_()
00032 , shutdown_requested_(false)
00033 , stop_requested_(false)
00034 , pause_requested_(false)
00035 , token_socket_(-1)
00036 , table_socket_(-1)
00037 , ack_socket_(-1)
00038 {
00039 TLOG_DEBUG(name_) << "Constructor" << TLOG_ENDL;
00040 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00041 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00042 metricMan = &metricMan_;
00043 my_rank = rank;
00044 }
00045
00046 artdaq::RoutingMasterCore::~RoutingMasterCore()
00047 {
00048 TLOG_DEBUG(name_) << "Destructor" << TLOG_ENDL;
00049 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00050 }
00051
00052 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00053 {
00054 TLOG_DEBUG(name_) << "initialize method called with "
00055 << "ParameterSet = \"" << pset.to_string()
00056 << "\"." << TLOG_ENDL;
00057
00058
00059 fhicl::ParameterSet daq_pset;
00060 try
00061 {
00062 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00063 }
00064 catch (...)
00065 {
00066 TLOG_ERROR(name_)
00067 << "Unable to find the DAQ parameters in the initialization "
00068 << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
00069 return false;
00070 }
00071 try
00072 {
00073 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00074 }
00075 catch (...)
00076 {
00077 TLOG_ERROR(name_)
00078 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
00079 return false;
00080 }
00081
00082
00083 fhicl::ParameterSet metric_pset;
00084 try
00085 {
00086 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00087 }
00088 catch (...) {}
00089
00090 if (metric_pset.is_empty())
00091 {
00092 TLOG_INFO(name_) << "No metric plugins appear to be defined" << TLOG_ENDL;
00093 }
00094 try
00095 {
00096 metricMan_.initialize(metric_pset, name_);
00097 }
00098 catch (...)
00099 {
00100 ExceptionHandler(ExceptionHandlerRethrow::no,
00101 "Error loading metrics in RoutingMasterCore::initialize()");
00102 }
00103
00104
00105 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00106 if (policy_plugin_spec.length() == 0)
00107 {
00108 TLOG_ERROR(name_)
00109 << "No fragment generator (parameter name = \"policy\") was "
00110 << "specified in the policy ParameterSet. The "
00111 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." << TLOG_ENDL;
00112 return false;
00113 }
00114 try
00115 {
00116 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00117 }
00118 catch (...)
00119 {
00120 std::stringstream exception_string;
00121 exception_string << "Exception thrown during initialization of policy of type \""
00122 << policy_plugin_spec << "\"";
00123
00124 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00125
00126 TLOG_DEBUG(name_) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() << TLOG_ENDL;
00127
00128 return false;
00129 }
00130
00131 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00132 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00133 num_receivers_ = policy_->GetReceiverCount();
00134
00135 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00136 receive_token_events_ = std::vector<epoll_event>(num_receivers_);
00137
00138 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00139 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00140 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00141 current_table_interval_ms_ = max_table_update_interval_ms_;
00142 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00143 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00144 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00145 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00146 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00147 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00148
00149
00150 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00151
00152 shutdown_requested_.store(false);
00153 start_recieve_token_thread_();
00154 return true;
00155 }
00156
00157 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00158 {
00159 stop_requested_.store(false);
00160 pause_requested_.store(false);
00161
00162 statsHelper_.resetStatistics();
00163 policy_->Reset();
00164
00165 metricMan_.do_start();
00166 run_id_ = id;
00167 table_update_count_ = 0;
00168 received_token_count_ = 0;
00169
00170 TLOG_DEBUG(name_) << "Started run " << std::to_string(run_id_.run()) << TLOG_ENDL;
00171 return true;
00172 }
00173
00174 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00175 {
00176 TLOG_DEBUG(name_) << "Stopping run " << std::to_string(run_id_.run())
00177 << " after " << std::to_string(table_update_count_) << " table updates."
00178 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00179 stop_requested_.store(true);
00180 return true;
00181 }
00182
00183 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00184 {
00185 TLOG_DEBUG(name_) << "Pausing run " << std::to_string(run_id_.run())
00186 << " after " << table_update_count_ << " table updates."
00187 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00188 pause_requested_.store(true);
00189 return true;
00190 }
00191
00192 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00193 {
00194 TLOG_DEBUG(name_) << "Resuming run " << run_id_.run() << TLOG_ENDL;
00195 policy_->Reset();
00196 pause_requested_.store(false);
00197 metricMan_.do_start();
00198 return true;
00199 }
00200
00201 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00202 {
00203 policy_.reset(nullptr);
00204 metricMan_.shutdown();
00205 shutdown_requested_.store(true);
00206 return true;
00207 }
00208
00209 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00210 {
00211 TLOG_DEBUG(name_) << "soft_initialize method called with "
00212 << "ParameterSet = \"" << pset.to_string()
00213 << "\"." << TLOG_ENDL;
00214 return initialize(pset, e, f);
00215 }
00216
00217 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00218 {
00219 TLOG_DEBUG(name_) << "reinitialize method called with "
00220 << "ParameterSet = \"" << pset.to_string()
00221 << "\"." << TLOG_ENDL;
00222 return initialize(pset, e, f);
00223 }
00224
00225 size_t artdaq::RoutingMasterCore::process_event_table()
00226 {
00227 if (rt_priority_ > 0)
00228 {
00229 #pragma GCC diagnostic push
00230 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00231 sched_param s_param = {};
00232 s_param.sched_priority = rt_priority_;
00233 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00234 TLOG_WARNING(name_) << "setting realtime priority failed" << TLOG_ENDL;
00235 #pragma GCC diagnostic pop
00236 }
00237
00238
00239
00240
00241 if (rt_priority_ > 0)
00242 {
00243 #pragma GCC diagnostic push
00244 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00245 sched_param s_param = {};
00246 s_param.sched_priority = rt_priority_;
00247 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00248 if (status != 0)
00249 {
00250 TLOG_ERROR(name_)
00251 << "Failed to set realtime priority to " << std::to_string(rt_priority_)
00252 << ", return code = " << status << TLOG_ENDL;
00253 }
00254 #pragma GCC diagnostic pop
00255 }
00256
00257
00258
00259 TLOG_DEBUG(name_) << "Sending initial table." << TLOG_ENDL;
00260 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00261 auto nextSendTime = startTime;
00262 double delta_time;
00263 while (true)
00264 {
00265 if (stop_requested_ || pause_requested_) { break; }
00266 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00267
00268 if (startTime >= nextSendTime)
00269 {
00270 auto table = policy_->GetCurrentTable();
00271 if (table.size() > 0) {
00272 send_event_table(table);
00273 ++table_update_count_;
00274 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00275 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00276 TRACE(16, "%s::process_fragments TABLE_UPDATES_STAT_KEY=%f", name_.c_str(), delta_time);
00277 }
00278 else
00279 {
00280 TLOG_WARNING(name_) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" << TLOG_ENDL;
00281 }
00282 auto max_tokens = policy_->GetMaxNumberOfTokens();
00283 if (max_tokens > 0) {
00284 auto frac = table.size() / static_cast<double>(max_tokens);
00285 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00286 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00287 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00288 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00289 }
00290 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00291 TLOG_DEBUG(name_) << "current_table_interval_ms is now " << current_table_interval_ms_ << TLOG_ENDL;
00292 }
00293 else
00294 {
00295 usleep(current_table_interval_ms_ * 10);
00296 }
00297 }
00298
00299 metricMan_.do_stop();
00300
00301 policy_.reset(nullptr);
00302 return table_update_count_;
00303 }
00304
00305 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00306 {
00307
00308 if (table_socket_ == -1)
00309 {
00310 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00311 if (!table_socket_)
00312 {
00313 TLOG_ERROR(name_) << "I failed to create the socket for sending Data Requests! Errno: " << std::to_string(errno) << TLOG_ENDL;
00314 exit(1);
00315 }
00316 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00317 if (sts == -1)
00318 {
00319 TLOG_ERROR(name_) << "Unable to resolve table_update_address" << TLOG_ENDL;
00320 exit(1);
00321 }
00322
00323 auto yes = 1;
00324 if (receive_address_ != "localhost") {
00325 TLOG_DEBUG(name_) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ << TLOG_ENDL;
00326 struct in_addr addr;
00327 sts = ResolveHost(receive_address_.c_str(), addr);
00328 if (sts == -1)
00329 {
00330 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00331 }
00332
00333 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00334 {
00335 throw art::Exception(art::errors::Configuration) <<
00336 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00337 exit(1);
00338 }
00339
00340 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00341 {
00342 TLOG_ERROR(name_) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) << TLOG_ENDL;
00343 exit(1);
00344 }
00345 }
00346 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00347 {
00348 TLOG_ERROR(name_) << "Cannot set request socket to broadcast. Errno: " << std::to_string(errno) << TLOG_ENDL;
00349 exit(1);
00350 }
00351 }
00352
00353
00354 if (ack_socket_ == -1)
00355 {
00356 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00357 if (!ack_socket_)
00358 {
00359 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00360 exit(1);
00361 }
00362
00363 struct sockaddr_in si_me_request;
00364
00365 auto yes = 1;
00366 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00367 {
00368 throw art::Exception(art::errors::Configuration) <<
00369 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00370 exit(1);
00371 }
00372 memset(&si_me_request, 0, sizeof(si_me_request));
00373 si_me_request.sin_family = AF_INET;
00374 si_me_request.sin_port = htons(receive_acks_port_);
00375 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00376 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00377 {
00378 throw art::Exception(art::errors::Configuration) <<
00379 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00380 exit(1);
00381 }
00382 TLOG_DEBUG(name_) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ << TLOG_ENDL;
00383 }
00384
00385 auto acks = std::unordered_map<int, bool>();
00386 for (auto& r : sender_ranks_)
00387 {
00388 acks[r] = false;
00389 }
00390 auto counter = 0U;
00391 auto start_time = std::chrono::steady_clock::now();
00392 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00393 {
00394
00395 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00396 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00397
00398 TLOG_DEBUG(name_) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << TLOG_ENDL;
00399 if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00400 {
00401 TLOG_ERROR(name_) << "Error sending request message header" << TLOG_ENDL;
00402 }
00403 if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00404 {
00405 TLOG_ERROR(name_) << "Error sending request message data" << TLOG_ENDL;
00406 }
00407
00408
00409
00410 auto first = packet[0].sequence_id;
00411 auto last = packet.rbegin()->sequence_id;
00412 TLOG_DEBUG(name_) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) << TLOG_ENDL;
00413
00414
00415 auto startTime = std::chrono::steady_clock::now();
00416 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00417 {
00418 auto currentTime = std::chrono::steady_clock::now();
00419 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00420 if (static_cast<size_t>(std::chrono::duration_cast<std::chrono::milliseconds>(currentTime - startTime).count()) > table_ack_wait_time_ms)
00421 {
00422 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00423 {
00424 TLOG_ERROR(name_) << "Did not receive acks from all senders after resending table " << std::to_string(counter) << " times during the table_update_interval. Check the status of the senders!" << TLOG_ENDL;
00425 break;
00426 }
00427 TLOG_WARNING(name_) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" << TLOG_ENDL;
00428 break;
00429 }
00430
00431 TLOG_ARB(20, name_) << "send_event_table: Polling Request socket for new requests" << TLOG_ENDL;
00432 auto ready = true;
00433 while (ready) {
00434 detail::RoutingAckPacket buffer;
00435 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00436 {
00437 if (errno == EWOULDBLOCK || errno == EAGAIN) {
00438 TLOG_ARB(20, name_) << "send_event_table: No more ack datagrams on ack socket." << TLOG_ENDL;
00439 ready = false;
00440 }
00441 else
00442 {
00443 TLOG_ERROR(name_) << "An unexpected error occurred during ack packet receive" << TLOG_ENDL;
00444 exit(2);
00445 }
00446 }
00447 else
00448 {
00449 TLOG_DEBUG(name_) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id) << " and last= " << std::to_string(buffer.last_sequence_id) << TLOG_ENDL;
00450 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00451 {
00452 TLOG_DEBUG(name_) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." << TLOG_ENDL;
00453 acks[buffer.rank] = true;
00454 TLOG_DEBUG(name_) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) << " acks outstanding" << TLOG_ENDL;
00455 }
00456 else
00457 {
00458 if (!acks.count(buffer.rank)) { TLOG_ERROR(name_) << "Received acknowledgement from invalid rank " << buffer.rank << "! Cross-talk between RoutingMasters means there's a configuration error!" << TLOG_ENDL; }
00459 else { TLOG_WARNING(name_) << "Received acknowledgement from rank " << buffer.rank << " that had incorrect sequence ID information. Discarding." << TLOG_ENDL; }
00460 }
00461 }
00462 }
00463 usleep(table_ack_wait_time_ms * 1000 / 10);
00464 }
00465 }
00466 if (metricMan)
00467 {
00468 std::chrono::duration<double, std::ratio<1>> delta = std::chrono::steady_clock::now() - start_time;
00469 metricMan->sendMetric("Table Acknowledge Time", delta.count(), "seconds", 3);
00470 }
00471 }
00472
00473 void artdaq::RoutingMasterCore::receive_tokens_()
00474 {
00475 while (!shutdown_requested_) {
00476 TLOG_DEBUG(name_) << "Receive Token loop start" << TLOG_ENDL;
00477 if (token_socket_ == -1)
00478 {
00479 TLOG_DEBUG(name_) << "Opening token listener socket" << TLOG_ENDL;
00480 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00481
00482 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00483 struct epoll_event ev;
00484 token_epoll_fd_ = epoll_create1(0);
00485 ev.events = EPOLLIN | EPOLLPRI;
00486 ev.data.fd = token_socket_;
00487 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00488 {
00489 TLOG_ERROR(name_) << "Could not register listen socket to epoll fd" << TLOG_ENDL;
00490 exit(3);
00491 }
00492 }
00493 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00494 {
00495 TLOG_DEBUG(name_) << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
00496 return;
00497 }
00498
00499 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00500 if (nfds == -1) {
00501 perror("epoll_wait");
00502 exit(EXIT_FAILURE);
00503 }
00504
00505 TLOG_DEBUG(name_) << "Received " << std::to_string(nfds) << " events" << TLOG_ENDL;
00506 for (auto n = 0; n < nfds; ++n) {
00507 if (receive_token_events_[n].data.fd == token_socket_) {
00508 TLOG_DEBUG(name_) << "Accepting new connection on token_socket" << TLOG_ENDL;
00509 sockaddr_in addr;
00510 socklen_t arglen = sizeof(addr);
00511 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00512
00513 if (conn_sock == -1) {
00514 perror("accept");
00515 exit(EXIT_FAILURE);
00516 }
00517
00518 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00519 struct epoll_event ev;
00520 ev.events = EPOLLIN | EPOLLET;
00521 ev.data.fd = conn_sock;
00522 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1) {
00523 perror("epoll_ctl: conn_sock");
00524 exit(EXIT_FAILURE);
00525 }
00526 }
00527 else {
00528 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00529 detail::RoutingToken buff;
00530 auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
00531 if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
00532 {
00533 TLOG_ERROR(name_) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << TLOG_ENDL;
00534 }
00535 else
00536 {
00537 TLOG_DEBUG(name_) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." << TLOG_ENDL;
00538 received_token_count_ += buff.new_slots_free;
00539 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
00540 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00541 }
00542 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00543 {
00544 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00545 received_token_counter_[buff.rank] += buff.new_slots_free;
00546 TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." << TLOG_ENDL;
00547 while (received_token_counter_[buff.rank] >= sender_ranks_.size()) {
00548 TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00549 << "... Sending token to policy" << TLOG_ENDL;
00550 policy_->AddReceiverToken(buff.rank, 1);
00551 received_token_counter_[buff.rank] -= sender_ranks_.size();
00552 }
00553 }
00554 }
00555 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00556 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00557
00558 }
00559 }
00560 }
00561 }
00562
00563 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00564 {
00565 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00566 TLOG_INFO(name_) << "Starting Token Reception Thread" << TLOG_ENDL;
00567 ev_token_receive_thread_ = std::thread(&RoutingMasterCore::receive_tokens_, this);
00568 }
00569
00570 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00571 {
00572 std::string resultString;
00573
00574
00575 auto tmpString = name_ + " run number = " + std::to_string(run_id_.run())
00576 + ", table updates sent = " + std::to_string(table_update_count_)
00577 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00578 return tmpString;
00579 }
00580
00581 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00582 {
00583 std::ostringstream oss;
00584 oss << name_ << " statistics:" << std::endl;
00585
00586 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00587 if (mqPtr.get() != nullptr)
00588 {
00589 artdaq::MonitoredQuantityStats stats;
00590 mqPtr->getStats(stats);
00591 oss << " Table Update statistics: "
00592 << stats.recentSampleCount << " table updates sent at "
00593 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00594 << stats.recentDuration << " sec" << std::endl;
00595 oss << " Average times per table update: ";
00596 if (stats.recentSampleRate > 0.0)
00597 {
00598 oss << " elapsed time = "
00599 << (1.0 / stats.recentSampleRate) << " sec";
00600 }
00601 oss << ", avg table acknowledgement wait time = "
00602 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00603 }
00604
00605 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00606 if (mqPtr.get() != nullptr)
00607 {
00608 artdaq::MonitoredQuantityStats stats;
00609 mqPtr->getStats(stats);
00610 oss << " Received Token statistics: "
00611 << stats.recentSampleCount << " tokens received at "
00612 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00613 << stats.recentDuration << " sec" << std::endl;
00614 oss << " Average times per token: ";
00615 if (stats.recentSampleRate > 0.0)
00616 {
00617 oss << " elapsed time = "
00618 << (1.0 / stats.recentSampleRate) << " sec";
00619 }
00620 oss << ", input token wait time = "
00621 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00622 }
00623
00624 return oss.str();
00625 }
00626
00627 void artdaq::RoutingMasterCore::sendMetrics_()
00628 {
00629 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00630 if (mqPtr.get() != nullptr)
00631 {
00632 artdaq::MonitoredQuantityStats stats;
00633 mqPtr->getStats(stats);
00634 metricMan_.sendMetric("Table Update Count",
00635 static_cast<unsigned long>(stats.fullSampleCount),
00636 "updates", 1);
00637 metricMan_.sendMetric("Table Update Rate",
00638 stats.recentSampleRate, "updates/sec", 1);
00639
00640 metricMan_.sendMetric("Average Sender Acknowledgement Time",
00641 (mqPtr->getRecentValueSum() / sender_ranks_.size()),
00642 "seconds", 3, false);
00643 }
00644
00645 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00646 if (mqPtr.get() != nullptr)
00647 {
00648 artdaq::MonitoredQuantityStats stats;
00649 mqPtr->getStats(stats);
00650 metricMan_.sendMetric("Receiver Token Count",
00651 static_cast<unsigned long>(stats.fullSampleCount),
00652 "updates", 1);
00653 metricMan_.sendMetric("Receiver Token Rate",
00654 stats.recentSampleRate, "updates/sec", 1);
00655 metricMan_.sendMetric("Total Receiver Token Wait Time",
00656 mqPtr->getRecentValueSum(),
00657 "seconds", 3, false);
00658 }
00659 }