00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib_except/exception.h"
00012
00013 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // include these 2 first -
00014 #include "artdaq/DAQdata/Globals.hh"
00015 #include "artdaq-core/Data/Fragment.hh"
00016 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00017
00018 #include "artdaq/Application/RoutingMasterCore.hh"
00019 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00020 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00021 #include "artdaq/DAQdata/TCPConnect.hh"
00022
00023 const std::string artdaq::RoutingMasterCore::
00024 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
00025 const std::string artdaq::RoutingMasterCore::
00026 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00027
00028 artdaq::RoutingMasterCore::RoutingMasterCore()
00029 : received_token_counter_()
00030 , shutdown_requested_(false)
00031 , stop_requested_(false)
00032 , pause_requested_(false)
00033 , token_socket_(-1)
00034 , table_socket_(-1)
00035 , ack_socket_(-1)
00036 {
00037 TLOG(TLVL_DEBUG) << "Constructor" ;
00038 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00039 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00040 metricMan = &metricMan_;
00041 }
00042
00043 artdaq::RoutingMasterCore::~RoutingMasterCore()
00044 {
00045 TLOG(TLVL_DEBUG) << "Destructor" ;
00046 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00047 }
00048
00049 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00050 {
00051 TLOG(TLVL_DEBUG) << "initialize method called with "
00052 << "ParameterSet = \"" << pset.to_string()
00053 << "\"." ;
00054
00055
00056 fhicl::ParameterSet daq_pset;
00057 try
00058 {
00059 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00060 }
00061 catch (...)
00062 {
00063 TLOG(TLVL_ERROR)
00064 << "Unable to find the DAQ parameters in the initialization "
00065 << "ParameterSet: \"" + pset.to_string() + "\"." ;
00066 return false;
00067 }
00068
00069 if (daq_pset.has_key("rank"))
00070 {
00071 if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank) {
00072 TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
00073 }
00074 my_rank = daq_pset.get<int>("rank");
00075 }
00076 if (my_rank == -1)
00077 {
00078 TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting";
00079 exit(1);
00080 }
00081
00082 try
00083 {
00084 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00085 }
00086 catch (...)
00087 {
00088 TLOG(TLVL_ERROR)
00089 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." ;
00090 return false;
00091 }
00092
00093
00094 fhicl::ParameterSet metric_pset;
00095 try
00096 {
00097 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00098 }
00099 catch (...) {}
00100
00101 if (metric_pset.is_empty())
00102 {
00103 TLOG(TLVL_INFO) << "No metric plugins appear to be defined" ;
00104 }
00105 try
00106 {
00107 metricMan_.initialize(metric_pset, app_name);
00108 }
00109 catch (...)
00110 {
00111 ExceptionHandler(ExceptionHandlerRethrow::no,
00112 "Error loading metrics in RoutingMasterCore::initialize()");
00113 }
00114
00115
00116 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00117 if (policy_plugin_spec.length() == 0)
00118 {
00119 TLOG(TLVL_ERROR)
00120 << "No fragment generator (parameter name = \"policy\") was "
00121 << "specified in the policy ParameterSet. The "
00122 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." ;
00123 return false;
00124 }
00125 try
00126 {
00127 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00128 }
00129 catch (...)
00130 {
00131 std::stringstream exception_string;
00132 exception_string << "Exception thrown during initialization of policy of type \""
00133 << policy_plugin_spec << "\"";
00134
00135 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00136
00137 TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() ;
00138
00139 return false;
00140 }
00141
00142 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00143 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00144 num_receivers_ = policy_->GetReceiverCount();
00145
00146 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00147 receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
00148
00149 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00150 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00151 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00152 current_table_interval_ms_ = max_table_update_interval_ms_;
00153 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00154 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00155 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00156 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00157 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00158 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00159
00160
00161 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00162
00163 shutdown_requested_.store(false);
00164 start_recieve_token_thread_();
00165 return true;
00166 }
00167
00168 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00169 {
00170 stop_requested_.store(false);
00171 pause_requested_.store(false);
00172
00173 statsHelper_.resetStatistics();
00174 policy_->Reset();
00175
00176 metricMan_.do_start();
00177 run_id_ = id;
00178 table_update_count_ = 0;
00179 received_token_count_ = 0;
00180
00181 TLOG(TLVL_DEBUG) << "Started run " << std::to_string(run_id_.run()) ;
00182 return true;
00183 }
00184
00185 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00186 {
00187 TLOG(TLVL_DEBUG) << "Stopping run " << std::to_string(run_id_.run())
00188 << " after " << std::to_string(table_update_count_) << " table updates."
00189 << " and " << received_token_count_ << " received tokens." ;
00190 stop_requested_.store(true);
00191 return true;
00192 }
00193
00194 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00195 {
00196 TLOG(TLVL_DEBUG) << "Pausing run " << std::to_string(run_id_.run())
00197 << " after " << table_update_count_ << " table updates."
00198 << " and " << received_token_count_ << " received tokens." ;
00199 pause_requested_.store(true);
00200 return true;
00201 }
00202
00203 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00204 {
00205 TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run() ;
00206 policy_->Reset();
00207 pause_requested_.store(false);
00208 metricMan_.do_start();
00209 return true;
00210 }
00211
00212 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00213 {
00214 shutdown_requested_.store(true);
00215 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00216 policy_.reset(nullptr);
00217 metricMan_.shutdown();
00218 return true;
00219 }
00220
00221 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00222 {
00223 TLOG(TLVL_DEBUG) << "soft_initialize method called with "
00224 << "ParameterSet = \"" << pset.to_string()
00225 << "\"." ;
00226 return initialize(pset, e, f);
00227 }
00228
00229 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00230 {
00231 TLOG(TLVL_DEBUG) << "reinitialize method called with "
00232 << "ParameterSet = \"" << pset.to_string()
00233 << "\"." ;
00234 return initialize(pset, e, f);
00235 }
00236
00237 size_t artdaq::RoutingMasterCore::process_event_table()
00238 {
00239 if (rt_priority_ > 0)
00240 {
00241 #pragma GCC diagnostic push
00242 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00243 sched_param s_param = {};
00244 s_param.sched_priority = rt_priority_;
00245 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00246 TLOG(TLVL_WARNING) << "setting realtime priority failed" ;
00247 #pragma GCC diagnostic pop
00248 }
00249
00250
00251
00252
00253 if (rt_priority_ > 0)
00254 {
00255 #pragma GCC diagnostic push
00256 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00257 sched_param s_param = {};
00258 s_param.sched_priority = rt_priority_;
00259 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00260 if (status != 0)
00261 {
00262 TLOG(TLVL_ERROR)
00263 << "Failed to set realtime priority to " << std::to_string(rt_priority_)
00264 << ", return code = " << status ;
00265 }
00266 #pragma GCC diagnostic pop
00267 }
00268
00269
00270
00271 TLOG(TLVL_DEBUG) << "Sending initial table." ;
00272 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00273 auto nextSendTime = startTime;
00274 double delta_time;
00275 while (true)
00276 {
00277 if (stop_requested_ || pause_requested_) { break; }
00278 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00279
00280 if (startTime >= nextSendTime)
00281 {
00282 auto table = policy_->GetCurrentTable();
00283 if (table.size() > 0)
00284 {
00285 send_event_table(table);
00286 ++table_update_count_;
00287 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00288 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00289 TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << std::to_string(delta_time) ;
00290 }
00291 else
00292 {
00293 TLOG(TLVL_WARNING) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" ;
00294 }
00295 auto max_tokens = policy_->GetMaxNumberOfTokens();
00296 if (max_tokens > 0)
00297 {
00298 auto frac = table.size() / static_cast<double>(max_tokens);
00299 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00300 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00301 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00302 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00303 }
00304 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00305 TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_ ;
00306 }
00307 else
00308 {
00309 usleep(current_table_interval_ms_ * 10);
00310 }
00311 }
00312
00313 metricMan_.do_stop();
00314
00315 return table_update_count_;
00316 }
00317
00318 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00319 {
00320
00321 if (table_socket_ == -1)
00322 {
00323 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00324 if (table_socket_ < 0)
00325 {
00326 TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! Errno: " << std::to_string(errno) ;
00327 exit(1);
00328 }
00329 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00330 if (sts == -1)
00331 {
00332 TLOG(TLVL_ERROR) << "Unable to resolve table_update_address" ;
00333 exit(1);
00334 }
00335
00336 auto yes = 1;
00337 if (receive_address_ != "localhost")
00338 {
00339 TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ ;
00340 struct in_addr addr;
00341 sts = ResolveHost(receive_address_.c_str(), addr);
00342 if (sts == -1)
00343 {
00344 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00345 }
00346
00347 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00348 {
00349 throw art::Exception(art::errors::Configuration) <<
00350 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00351 exit(1);
00352 }
00353
00354 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
00355 {
00356 TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket" ;
00357 exit(1);
00358 }
00359 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00360 {
00361 TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) ;
00362 exit(1);
00363 }
00364 }
00365 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00366 {
00367 TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << std::to_string(errno) ;
00368 exit(1);
00369 }
00370 }
00371
00372
00373 if (ack_socket_ == -1)
00374 {
00375 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00376 if (ack_socket_ < 0)
00377 {
00378 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00379 exit(1);
00380 }
00381
00382 struct sockaddr_in si_me_request;
00383
00384 auto yes = 1;
00385 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00386 {
00387 throw art::Exception(art::errors::Configuration) <<
00388 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00389 exit(1);
00390 }
00391 memset(&si_me_request, 0, sizeof(si_me_request));
00392 si_me_request.sin_family = AF_INET;
00393 si_me_request.sin_port = htons(receive_acks_port_);
00394 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00395 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00396 {
00397 throw art::Exception(art::errors::Configuration) <<
00398 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00399 exit(1);
00400 }
00401 TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ ;
00402 }
00403
00404 auto acks = std::unordered_map<int, bool>();
00405 for (auto& r : sender_ranks_)
00406 {
00407 acks[r] = false;
00408 }
00409 auto counter = 0U;
00410 auto start_time = std::chrono::steady_clock::now();
00411 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00412 {
00413
00414 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00415 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00416
00417 TLOG(TLVL_DEBUG) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ ;
00418 if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00419 {
00420 TLOG(TLVL_ERROR) << "Error sending request message header" ;
00421 }
00422 if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00423 {
00424 TLOG(TLVL_ERROR) << "Error sending request message data" ;
00425 }
00426
00427
00428
00429 auto first = packet[0].sequence_id;
00430 auto last = packet.rbegin()->sequence_id;
00431 TLOG(TLVL_DEBUG) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) ;
00432
00433
00434 auto startTime = std::chrono::steady_clock::now();
00435 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00436 {
00437 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00438 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
00439 {
00440 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00441 {
00442 TLOG(TLVL_ERROR) << "Did not receive acks from all senders after resending table " << std::to_string(counter)
00443 << " times during the table_update_interval. Check the status of the senders!" ;
00444 break;
00445 }
00446 TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" ;
00447 break;
00448 }
00449
00450 TLOG(20) << "send_event_table: Polling Request socket for new requests" ;
00451 auto ready = true;
00452 while (ready)
00453 {
00454 detail::RoutingAckPacket buffer;
00455 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00456 {
00457 if (errno == EWOULDBLOCK || errno == EAGAIN)
00458 {
00459 TLOG(20) << "send_event_table: No more ack datagrams on ack socket." ;
00460 ready = false;
00461 }
00462 else
00463 {
00464 TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive" ;
00465 exit(2);
00466 }
00467 }
00468 else
00469 {
00470 TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id)
00471 << " and last= " << std::to_string(buffer.last_sequence_id) ;
00472 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00473 {
00474 TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." ;
00475 acks[buffer.rank] = true;
00476 TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
00477 << " acks outstanding" ;
00478 }
00479 else
00480 {
00481 if (!acks.count(buffer.rank))
00482 {
00483 TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
00484 << " Cross-talk between RoutingMasters means there's a configuration error!" ;
00485 }
00486 else
00487 {
00488 TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
00489 << " that had incorrect sequence ID information. Discarding." ;
00490 }
00491 }
00492 }
00493 }
00494 usleep(table_ack_wait_time_ms * 1000 / 10);
00495 }
00496 }
00497 if (metricMan)
00498 {
00499 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
00500 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
00501 }
00502 }
00503
00504 void artdaq::RoutingMasterCore::receive_tokens_()
00505 {
00506 while (!shutdown_requested_)
00507 {
00508 TLOG(TLVL_DEBUG) << "Receive Token loop start" ;
00509 if (token_socket_ == -1)
00510 {
00511 TLOG(TLVL_DEBUG) << "Opening token listener socket" ;
00512 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00513
00514 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00515 struct epoll_event ev;
00516 token_epoll_fd_ = epoll_create1(0);
00517 ev.events = EPOLLIN | EPOLLPRI;
00518 ev.data.fd = token_socket_;
00519 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00520 {
00521 TLOG(TLVL_ERROR) << "Could not register listen socket to epoll fd" ;
00522 exit(3);
00523 }
00524 }
00525 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00526 {
00527 TLOG(TLVL_DEBUG) << "One of the listen sockets was not opened successfully." ;
00528 return;
00529 }
00530
00531 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00532 if (nfds == -1)
00533 {
00534 perror("epoll_wait");
00535 exit(EXIT_FAILURE);
00536 }
00537
00538 TLOG(TLVL_DEBUG) << "Received " << std::to_string(nfds) << " events" ;
00539 for (auto n = 0; n < nfds; ++n)
00540 {
00541 if (receive_token_events_[n].data.fd == token_socket_)
00542 {
00543 TLOG(TLVL_DEBUG) << "Accepting new connection on token_socket" ;
00544 sockaddr_in addr;
00545 socklen_t arglen = sizeof(addr);
00546 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00547
00548 if (conn_sock == -1)
00549 {
00550 perror("accept");
00551 exit(EXIT_FAILURE);
00552 }
00553
00554 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00555 struct epoll_event ev;
00556 ev.events = EPOLLIN | EPOLLET;
00557 ev.data.fd = conn_sock;
00558 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
00559 {
00560 perror("epoll_ctl: conn_sock");
00561 exit(EXIT_FAILURE);
00562 }
00563 }
00564 else
00565 {
00566 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00567 detail::RoutingToken buff;
00568 auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
00569 if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
00570 {
00571 TLOG(TLVL_ERROR) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] ;
00572 }
00573 else
00574 {
00575 TLOG(TLVL_DEBUG) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." ;
00576 received_token_count_ += buff.new_slots_free;
00577 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00578 {
00579 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00580 }
00581 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00582 {
00583 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00584 received_token_counter_[buff.rank] += buff.new_slots_free;
00585 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." ;
00586 while (received_token_counter_[buff.rank] >= sender_ranks_.size())
00587 {
00588 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00589 << "... Sending token to policy" ;
00590 policy_->AddReceiverToken(buff.rank, 1);
00591 received_token_counter_[buff.rank] -= sender_ranks_.size();
00592 }
00593 }
00594 }
00595 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00596 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00597
00598 }
00599 }
00600 }
00601 }
00602
00603 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00604 {
00605 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00606 TLOG(TLVL_INFO) << "Starting Token Reception Thread" ;
00607 ev_token_receive_thread_ = boost::thread(&RoutingMasterCore::receive_tokens_, this);
00608 }
00609
00610 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00611 {
00612 std::string resultString;
00613
00614
00615 auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
00616 + ", table updates sent = " + std::to_string(table_update_count_)
00617 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00618 return tmpString;
00619 }
00620
00621 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00622 {
00623 std::ostringstream oss;
00624 oss << app_name << " statistics:" << std::endl;
00625
00626 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00627 if (mqPtr.get() != nullptr)
00628 {
00629 artdaq::MonitoredQuantityStats stats;
00630 mqPtr->getStats(stats);
00631 oss << " Table Update statistics: "
00632 << stats.recentSampleCount << " table updates sent at "
00633 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00634 << stats.recentDuration << " sec" << std::endl;
00635 oss << " Average times per table update: ";
00636 if (stats.recentSampleRate > 0.0)
00637 {
00638 oss << " elapsed time = "
00639 << (1.0 / stats.recentSampleRate) << " sec";
00640 }
00641 oss << ", avg table acknowledgement wait time = "
00642 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00643 }
00644
00645 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00646 if (mqPtr.get() != nullptr)
00647 {
00648 artdaq::MonitoredQuantityStats stats;
00649 mqPtr->getStats(stats);
00650 oss << " Received Token statistics: "
00651 << stats.recentSampleCount << " tokens received at "
00652 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00653 << stats.recentDuration << " sec" << std::endl;
00654 oss << " Average times per token: ";
00655 if (stats.recentSampleRate > 0.0)
00656 {
00657 oss << " elapsed time = "
00658 << (1.0 / stats.recentSampleRate) << " sec";
00659 }
00660 oss << ", input token wait time = "
00661 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00662 }
00663
00664 return oss.str();
00665 }
00666
00667 void artdaq::RoutingMasterCore::sendMetrics_()
00668 {
00669 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00670 if (mqPtr.get() != nullptr)
00671 {
00672 artdaq::MonitoredQuantityStats stats;
00673 mqPtr->getStats(stats);
00674 metricMan_.sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00675 metricMan_.sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00676 metricMan_.sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
00677 }
00678
00679 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00680 if (mqPtr.get() != nullptr)
00681 {
00682 artdaq::MonitoredQuantityStats stats;
00683 mqPtr->getStats(stats);
00684 metricMan_.sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00685 metricMan_.sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00686 metricMan_.sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
00687 }
00688 }