00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib/exception.h"
00012
00013 #include "artdaq-core/Data/Fragment.hh"
00014 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00015
00016 #include "artdaq/Application/RoutingMasterCore.hh"
00017 #include "artdaq/DAQdata/Globals.hh"
00018 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00019 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00020 #include "artdaq/DAQdata/TCPConnect.hh"
00021
00022 #define TRACE_NAME "RoutingMasterCore"
00023
00024 const std::string artdaq::RoutingMasterCore::
00025 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
00026 const std::string artdaq::RoutingMasterCore::
00027 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00028
00029 artdaq::RoutingMasterCore::RoutingMasterCore(int rank, std::string name) :
00030 name_(name)
00031 , received_token_counter_()
00032 , shutdown_requested_(false)
00033 , stop_requested_(false)
00034 , pause_requested_(false)
00035 , token_socket_(-1)
00036 , table_socket_(-1)
00037 , ack_socket_(-1)
00038 {
00039 TLOG_DEBUG(name_) << "Constructor" << TLOG_ENDL;
00040 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00041 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00042 metricMan = &metricMan_;
00043 my_rank = rank;
00044 }
00045
00046 artdaq::RoutingMasterCore::~RoutingMasterCore()
00047 {
00048 TLOG_DEBUG(name_) << "Destructor" << TLOG_ENDL;
00049 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00050 }
00051
00052 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00053 {
00054 TLOG_DEBUG(name_) << "initialize method called with "
00055 << "ParameterSet = \"" << pset.to_string()
00056 << "\"." << TLOG_ENDL;
00057
00058
00059 fhicl::ParameterSet daq_pset;
00060 try
00061 {
00062 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00063 }
00064 catch (...)
00065 {
00066 TLOG_ERROR(name_)
00067 << "Unable to find the DAQ parameters in the initialization "
00068 << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
00069 return false;
00070 }
00071 try
00072 {
00073 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00074 }
00075 catch (...)
00076 {
00077 TLOG_ERROR(name_)
00078 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
00079 return false;
00080 }
00081
00082
00083 fhicl::ParameterSet metric_pset;
00084 try
00085 {
00086 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00087 }
00088 catch (...) {}
00089
00090 if (metric_pset.is_empty())
00091 {
00092 TLOG_INFO(name_) << "No metric plugins appear to be defined" << TLOG_ENDL;
00093 }
00094 try
00095 {
00096 metricMan_.initialize(metric_pset, name_);
00097 }
00098 catch (...)
00099 {
00100 ExceptionHandler(ExceptionHandlerRethrow::no,
00101 "Error loading metrics in RoutingMasterCore::initialize()");
00102 }
00103
00104
00105 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00106 if (policy_plugin_spec.length() == 0)
00107 {
00108 TLOG_ERROR(name_)
00109 << "No fragment generator (parameter name = \"policy\") was "
00110 << "specified in the policy ParameterSet. The "
00111 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." << TLOG_ENDL;
00112 return false;
00113 }
00114 try
00115 {
00116 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00117 }
00118 catch (...)
00119 {
00120 std::stringstream exception_string;
00121 exception_string << "Exception thrown during initialization of policy of type \""
00122 << policy_plugin_spec << "\"";
00123
00124 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00125
00126 TLOG_DEBUG(name_) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() << TLOG_ENDL;
00127
00128 return false;
00129 }
00130
00131 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00132 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00133 num_receivers_ = policy_->GetReceiverCount();
00134
00135 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00136 receive_token_events_ = std::vector<epoll_event>(num_receivers_);
00137
00138 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00139 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00140 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00141 current_table_interval_ms_ = max_table_update_interval_ms_;
00142 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00143 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00144 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00145 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00146 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00147 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00148
00149
00150 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00151
00152 shutdown_requested_.store(false);
00153 start_recieve_token_thread_();
00154 return true;
00155 }
00156
00157 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00158 {
00159 stop_requested_.store(false);
00160 pause_requested_.store(false);
00161
00162 statsHelper_.resetStatistics();
00163 policy_->Reset();
00164
00165 metricMan_.do_start();
00166 run_id_ = id;
00167 table_update_count_ = 0;
00168 received_token_count_ = 0;
00169
00170 TLOG_DEBUG(name_) << "Started run " << std::to_string(run_id_.run()) << TLOG_ENDL;
00171 return true;
00172 }
00173
00174 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00175 {
00176 TLOG_DEBUG(name_) << "Stopping run " << std::to_string(run_id_.run())
00177 << " after " << std::to_string(table_update_count_) << " table updates."
00178 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00179 stop_requested_.store(true);
00180 return true;
00181 }
00182
00183 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00184 {
00185 TLOG_DEBUG(name_) << "Pausing run " << std::to_string(run_id_.run())
00186 << " after " << table_update_count_ << " table updates."
00187 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00188 pause_requested_.store(true);
00189 return true;
00190 }
00191
00192 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00193 {
00194 TLOG_DEBUG(name_) << "Resuming run " << run_id_.run() << TLOG_ENDL;
00195 policy_->Reset();
00196 pause_requested_.store(false);
00197 metricMan_.do_start();
00198 return true;
00199 }
00200
00201 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00202 {
00203 policy_.reset(nullptr);
00204 metricMan_.shutdown();
00205 shutdown_requested_.store(true);
00206 return true;
00207 }
00208
00209 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00210 {
00211 TLOG_DEBUG(name_) << "soft_initialize method called with "
00212 << "ParameterSet = \"" << pset.to_string()
00213 << "\"." << TLOG_ENDL;
00214 return initialize(pset, e, f);
00215 }
00216
00217 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00218 {
00219 TLOG_DEBUG(name_) << "reinitialize method called with "
00220 << "ParameterSet = \"" << pset.to_string()
00221 << "\"." << TLOG_ENDL;
00222 return initialize(pset, e, f);
00223 }
00224
00225 size_t artdaq::RoutingMasterCore::process_event_table()
00226 {
00227 if (rt_priority_ > 0)
00228 {
00229 #pragma GCC diagnostic push
00230 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00231 sched_param s_param = {};
00232 s_param.sched_priority = rt_priority_;
00233 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00234 TLOG_WARNING(name_) << "setting realtime priority failed" << TLOG_ENDL;
00235 #pragma GCC diagnostic pop
00236 }
00237
00238
00239
00240
00241 if (rt_priority_ > 0)
00242 {
00243 #pragma GCC diagnostic push
00244 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00245 sched_param s_param = {};
00246 s_param.sched_priority = rt_priority_;
00247 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00248 if (status != 0)
00249 {
00250 TLOG_ERROR(name_)
00251 << "Failed to set realtime priority to " << std::to_string(rt_priority_)
00252 << ", return code = " << status << TLOG_ENDL;
00253 }
00254 #pragma GCC diagnostic pop
00255 }
00256
00257
00258
00259 TLOG_DEBUG(name_) << "Sending initial table." << TLOG_ENDL;
00260 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00261 auto nextSendTime = startTime;
00262 double delta_time;
00263 while (true)
00264 {
00265 if (stop_requested_ || pause_requested_) { break; }
00266 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00267
00268 if (startTime >= nextSendTime)
00269 {
00270 auto table = policy_->GetCurrentTable();
00271 if (table.size() > 0)
00272 {
00273 send_event_table(table);
00274 ++table_update_count_;
00275 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00276 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00277 TRACE(16, "%s::process_fragments TABLE_UPDATES_STAT_KEY=%f", name_.c_str(), delta_time);
00278 }
00279 else
00280 {
00281 TLOG_WARNING(name_) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" << TLOG_ENDL;
00282 }
00283 auto max_tokens = policy_->GetMaxNumberOfTokens();
00284 if (max_tokens > 0)
00285 {
00286 auto frac = table.size() / static_cast<double>(max_tokens);
00287 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00288 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00289 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00290 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00291 }
00292 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00293 TLOG_DEBUG(name_) << "current_table_interval_ms is now " << current_table_interval_ms_ << TLOG_ENDL;
00294 }
00295 else
00296 {
00297 usleep(current_table_interval_ms_ * 10);
00298 }
00299 }
00300
00301 metricMan_.do_stop();
00302
00303 policy_.reset(nullptr);
00304 return table_update_count_;
00305 }
00306
00307 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00308 {
00309
00310 if (table_socket_ == -1)
00311 {
00312 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00313 if (!table_socket_)
00314 {
00315 TLOG_ERROR(name_) << "I failed to create the socket for sending Data Requests! Errno: " << std::to_string(errno) << TLOG_ENDL;
00316 exit(1);
00317 }
00318 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00319 if (sts == -1)
00320 {
00321 TLOG_ERROR(name_) << "Unable to resolve table_update_address" << TLOG_ENDL;
00322 exit(1);
00323 }
00324
00325 auto yes = 1;
00326 if (receive_address_ != "localhost")
00327 {
00328 TLOG_DEBUG(name_) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ << TLOG_ENDL;
00329 struct in_addr addr;
00330 sts = ResolveHost(receive_address_.c_str(), addr);
00331 if (sts == -1)
00332 {
00333 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00334 }
00335
00336 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00337 {
00338 throw art::Exception(art::errors::Configuration) <<
00339 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00340 exit(1);
00341 }
00342
00343 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00344 {
00345 TLOG_ERROR(name_) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) << TLOG_ENDL;
00346 exit(1);
00347 }
00348 }
00349 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00350 {
00351 TLOG_ERROR(name_) << "Cannot set request socket to broadcast. Errno: " << std::to_string(errno) << TLOG_ENDL;
00352 exit(1);
00353 }
00354 }
00355
00356
00357 if (ack_socket_ == -1)
00358 {
00359 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00360 if (!ack_socket_)
00361 {
00362 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00363 exit(1);
00364 }
00365
00366 struct sockaddr_in si_me_request;
00367
00368 auto yes = 1;
00369 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00370 {
00371 throw art::Exception(art::errors::Configuration) <<
00372 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00373 exit(1);
00374 }
00375 memset(&si_me_request, 0, sizeof(si_me_request));
00376 si_me_request.sin_family = AF_INET;
00377 si_me_request.sin_port = htons(receive_acks_port_);
00378 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00379 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00380 {
00381 throw art::Exception(art::errors::Configuration) <<
00382 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00383 exit(1);
00384 }
00385 TLOG_DEBUG(name_) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ << TLOG_ENDL;
00386 }
00387
00388 auto acks = std::unordered_map<int, bool>();
00389 for (auto& r : sender_ranks_)
00390 {
00391 acks[r] = false;
00392 }
00393 auto counter = 0U;
00394 auto start_time = std::chrono::steady_clock::now();
00395 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00396 {
00397
00398 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00399 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00400
00401 TLOG_DEBUG(name_) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << TLOG_ENDL;
00402 if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00403 {
00404 TLOG_ERROR(name_) << "Error sending request message header" << TLOG_ENDL;
00405 }
00406 if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00407 {
00408 TLOG_ERROR(name_) << "Error sending request message data" << TLOG_ENDL;
00409 }
00410
00411
00412
00413 auto first = packet[0].sequence_id;
00414 auto last = packet.rbegin()->sequence_id;
00415 TLOG_DEBUG(name_) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) << TLOG_ENDL;
00416
00417
00418 auto startTime = std::chrono::steady_clock::now();
00419 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00420 {
00421 auto currentTime = std::chrono::steady_clock::now();
00422 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00423 if (static_cast<size_t>(std::chrono::duration_cast<std::chrono::milliseconds>(currentTime - startTime).count()) > table_ack_wait_time_ms)
00424 {
00425 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00426 {
00427 TLOG_ERROR(name_) << "Did not receive acks from all senders after resending table " << std::to_string(counter) << " times during the table_update_interval. Check the status of the senders!" << TLOG_ENDL;
00428 break;
00429 }
00430 TLOG_WARNING(name_) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" << TLOG_ENDL;
00431 break;
00432 }
00433
00434 TLOG_ARB(20, name_) << "send_event_table: Polling Request socket for new requests" << TLOG_ENDL;
00435 auto ready = true;
00436 while (ready)
00437 {
00438 detail::RoutingAckPacket buffer;
00439 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00440 {
00441 if (errno == EWOULDBLOCK || errno == EAGAIN)
00442 {
00443 TLOG_ARB(20, name_) << "send_event_table: No more ack datagrams on ack socket." << TLOG_ENDL;
00444 ready = false;
00445 }
00446 else
00447 {
00448 TLOG_ERROR(name_) << "An unexpected error occurred during ack packet receive" << TLOG_ENDL;
00449 exit(2);
00450 }
00451 }
00452 else
00453 {
00454 TLOG_DEBUG(name_) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id) << " and last= " << std::to_string(buffer.last_sequence_id) << TLOG_ENDL;
00455 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00456 {
00457 TLOG_DEBUG(name_) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." << TLOG_ENDL;
00458 acks[buffer.rank] = true;
00459 TLOG_DEBUG(name_) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) << " acks outstanding" << TLOG_ENDL;
00460 }
00461 else
00462 {
00463 if (!acks.count(buffer.rank)) { TLOG_ERROR(name_) << "Received acknowledgement from invalid rank " << buffer.rank << "! Cross-talk between RoutingMasters means there's a configuration error!" << TLOG_ENDL; }
00464 else { TLOG_WARNING(name_) << "Received acknowledgement from rank " << buffer.rank << " that had incorrect sequence ID information. Discarding." << TLOG_ENDL; }
00465 }
00466 }
00467 }
00468 usleep(table_ack_wait_time_ms * 1000 / 10);
00469 }
00470 }
00471 if (metricMan)
00472 {
00473 std::chrono::duration<double, std::ratio<1>> delta = std::chrono::steady_clock::now() - start_time;
00474 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
00475 }
00476 }
00477
00478 void artdaq::RoutingMasterCore::receive_tokens_()
00479 {
00480 while (!shutdown_requested_)
00481 {
00482 TLOG_DEBUG(name_) << "Receive Token loop start" << TLOG_ENDL;
00483 if (token_socket_ == -1)
00484 {
00485 TLOG_DEBUG(name_) << "Opening token listener socket" << TLOG_ENDL;
00486 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00487
00488 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00489 struct epoll_event ev;
00490 token_epoll_fd_ = epoll_create1(0);
00491 ev.events = EPOLLIN | EPOLLPRI;
00492 ev.data.fd = token_socket_;
00493 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00494 {
00495 TLOG_ERROR(name_) << "Could not register listen socket to epoll fd" << TLOG_ENDL;
00496 exit(3);
00497 }
00498 }
00499 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00500 {
00501 TLOG_DEBUG(name_) << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
00502 return;
00503 }
00504
00505 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00506 if (nfds == -1)
00507 {
00508 perror("epoll_wait");
00509 exit(EXIT_FAILURE);
00510 }
00511
00512 TLOG_DEBUG(name_) << "Received " << std::to_string(nfds) << " events" << TLOG_ENDL;
00513 for (auto n = 0; n < nfds; ++n)
00514 {
00515 if (receive_token_events_[n].data.fd == token_socket_)
00516 {
00517 TLOG_DEBUG(name_) << "Accepting new connection on token_socket" << TLOG_ENDL;
00518 sockaddr_in addr;
00519 socklen_t arglen = sizeof(addr);
00520 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00521
00522 if (conn_sock == -1)
00523 {
00524 perror("accept");
00525 exit(EXIT_FAILURE);
00526 }
00527
00528 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00529 struct epoll_event ev;
00530 ev.events = EPOLLIN | EPOLLET;
00531 ev.data.fd = conn_sock;
00532 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
00533 {
00534 perror("epoll_ctl: conn_sock");
00535 exit(EXIT_FAILURE);
00536 }
00537 }
00538 else
00539 {
00540 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00541 detail::RoutingToken buff;
00542 auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
00543 if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
00544 {
00545 TLOG_ERROR(name_) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << TLOG_ENDL;
00546 }
00547 else
00548 {
00549 TLOG_DEBUG(name_) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." << TLOG_ENDL;
00550 received_token_count_ += buff.new_slots_free;
00551 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00552 {
00553 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00554 }
00555 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00556 {
00557 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00558 received_token_counter_[buff.rank] += buff.new_slots_free;
00559 TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." << TLOG_ENDL;
00560 while (received_token_counter_[buff.rank] >= sender_ranks_.size())
00561 {
00562 TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00563 << "... Sending token to policy" << TLOG_ENDL;
00564 policy_->AddReceiverToken(buff.rank, 1);
00565 received_token_counter_[buff.rank] -= sender_ranks_.size();
00566 }
00567 }
00568 }
00569 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00570 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00571
00572 }
00573 }
00574 }
00575 }
00576
00577 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00578 {
00579 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00580 TLOG_INFO(name_) << "Starting Token Reception Thread" << TLOG_ENDL;
00581 ev_token_receive_thread_ = std::thread(&RoutingMasterCore::receive_tokens_, this);
00582 }
00583
00584 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00585 {
00586 std::string resultString;
00587
00588
00589 auto tmpString = name_ + " run number = " + std::to_string(run_id_.run())
00590 + ", table updates sent = " + std::to_string(table_update_count_)
00591 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00592 return tmpString;
00593 }
00594
00595 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00596 {
00597 std::ostringstream oss;
00598 oss << name_ << " statistics:" << std::endl;
00599
00600 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00601 if (mqPtr.get() != nullptr)
00602 {
00603 artdaq::MonitoredQuantityStats stats;
00604 mqPtr->getStats(stats);
00605 oss << " Table Update statistics: "
00606 << stats.recentSampleCount << " table updates sent at "
00607 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00608 << stats.recentDuration << " sec" << std::endl;
00609 oss << " Average times per table update: ";
00610 if (stats.recentSampleRate > 0.0)
00611 {
00612 oss << " elapsed time = "
00613 << (1.0 / stats.recentSampleRate) << " sec";
00614 }
00615 oss << ", avg table acknowledgement wait time = "
00616 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00617 }
00618
00619 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00620 if (mqPtr.get() != nullptr)
00621 {
00622 artdaq::MonitoredQuantityStats stats;
00623 mqPtr->getStats(stats);
00624 oss << " Received Token statistics: "
00625 << stats.recentSampleCount << " tokens received at "
00626 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00627 << stats.recentDuration << " sec" << std::endl;
00628 oss << " Average times per token: ";
00629 if (stats.recentSampleRate > 0.0)
00630 {
00631 oss << " elapsed time = "
00632 << (1.0 / stats.recentSampleRate) << " sec";
00633 }
00634 oss << ", input token wait time = "
00635 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00636 }
00637
00638 return oss.str();
00639 }
00640
00641 void artdaq::RoutingMasterCore::sendMetrics_()
00642 {
00643 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00644 if (mqPtr.get() != nullptr)
00645 {
00646 artdaq::MonitoredQuantityStats stats;
00647 mqPtr->getStats(stats);
00648 metricMan_.sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00649 metricMan_.sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00650 metricMan_.sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
00651 }
00652
00653 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00654 if (mqPtr.get() != nullptr)
00655 {
00656 artdaq::MonitoredQuantityStats stats;
00657 mqPtr->getStats(stats);
00658 metricMan_.sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00659 metricMan_.sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00660 metricMan_.sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
00661 }
00662 }