00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib_except/exception.h"
00012
00013 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // include these 2 first -
00014 #include "artdaq/DAQdata/Globals.hh"
00015 #include "artdaq-core/Data/Fragment.hh"
00016 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00017
00018 #include "artdaq/Application/RoutingMasterCore.hh"
00019 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00020 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00021 #include "artdaq/DAQdata/TCPConnect.hh"
00022
00023 const std::string artdaq::RoutingMasterCore::
00024 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
00025 const std::string artdaq::RoutingMasterCore::
00026 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00027
00028 artdaq::RoutingMasterCore::RoutingMasterCore()
00029 : received_token_counter_()
00030 , shutdown_requested_(false)
00031 , stop_requested_(false)
00032 , pause_requested_(false)
00033 , token_socket_(-1)
00034 , table_socket_(-1)
00035 , ack_socket_(-1)
00036 {
00037 TLOG(TLVL_DEBUG) << "Constructor" ;
00038 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00039 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00040 }
00041
00042 artdaq::RoutingMasterCore::~RoutingMasterCore()
00043 {
00044 TLOG(TLVL_DEBUG) << "Destructor" ;
00045 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00046 }
00047
00048 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00049 {
00050 TLOG(TLVL_DEBUG) << "initialize method called with "
00051 << "ParameterSet = \"" << pset.to_string()
00052 << "\"." ;
00053
00054
00055 fhicl::ParameterSet daq_pset;
00056 try
00057 {
00058 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00059 }
00060 catch (...)
00061 {
00062 TLOG(TLVL_ERROR)
00063 << "Unable to find the DAQ parameters in the initialization "
00064 << "ParameterSet: \"" + pset.to_string() + "\"." ;
00065 return false;
00066 }
00067
00068 if (daq_pset.has_key("rank"))
00069 {
00070 if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank) {
00071 TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
00072 }
00073 my_rank = daq_pset.get<int>("rank");
00074 }
00075 if (my_rank == -1)
00076 {
00077 TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting";
00078 exit(1);
00079 }
00080
00081 try
00082 {
00083 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00084 }
00085 catch (...)
00086 {
00087 TLOG(TLVL_ERROR)
00088 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." ;
00089 return false;
00090 }
00091
00092
00093 fhicl::ParameterSet metric_pset;
00094 try
00095 {
00096 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00097 }
00098 catch (...) {}
00099
00100 if (metric_pset.is_empty())
00101 {
00102 TLOG(TLVL_INFO) << "No metric plugins appear to be defined" ;
00103 }
00104 try
00105 {
00106 metricMan->initialize(metric_pset, app_name);
00107 }
00108 catch (...)
00109 {
00110 ExceptionHandler(ExceptionHandlerRethrow::no,
00111 "Error loading metrics in RoutingMasterCore::initialize()");
00112 }
00113
00114
00115 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00116 if (policy_plugin_spec.length() == 0)
00117 {
00118 TLOG(TLVL_ERROR)
00119 << "No fragment generator (parameter name = \"policy\") was "
00120 << "specified in the policy ParameterSet. The "
00121 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." ;
00122 return false;
00123 }
00124 try
00125 {
00126 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00127 }
00128 catch (...)
00129 {
00130 std::stringstream exception_string;
00131 exception_string << "Exception thrown during initialization of policy of type \""
00132 << policy_plugin_spec << "\"";
00133
00134 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00135
00136 TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() ;
00137
00138 return false;
00139 }
00140
00141 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00142 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00143 num_receivers_ = policy_->GetReceiverCount();
00144
00145 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00146 receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
00147
00148 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00149 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00150 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00151 current_table_interval_ms_ = max_table_update_interval_ms_;
00152 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00153 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00154 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00155 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00156 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00157 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00158
00159
00160 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00161
00162 shutdown_requested_.store(false);
00163 start_recieve_token_thread_();
00164 return true;
00165 }
00166
00167 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00168 {
00169 stop_requested_.store(false);
00170 pause_requested_.store(false);
00171
00172 statsHelper_.resetStatistics();
00173 policy_->Reset();
00174
00175 metricMan->do_start();
00176 run_id_ = id;
00177 table_update_count_ = 0;
00178 received_token_count_ = 0;
00179
00180 TLOG(TLVL_INFO) << "Started run " << run_id_.run() ;
00181 return true;
00182 }
00183
00184 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00185 {
00186 TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
00187 << " after " << table_update_count_ << " table updates."
00188 << " and " << received_token_count_ << " received tokens." ;
00189 stop_requested_.store(true);
00190 return true;
00191 }
00192
00193 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00194 {
00195 TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
00196 << " after " << table_update_count_ << " table updates."
00197 << " and " << received_token_count_ << " received tokens." ;
00198 pause_requested_.store(true);
00199 return true;
00200 }
00201
00202 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00203 {
00204 TLOG(TLVL_INFO) << "Resuming run " << run_id_.run() ;
00205 policy_->Reset();
00206 pause_requested_.store(false);
00207 metricMan->do_start();
00208 return true;
00209 }
00210
00211 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00212 {
00213 shutdown_requested_.store(true);
00214 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00215 policy_.reset(nullptr);
00216 metricMan->shutdown();
00217 return true;
00218 }
00219
00220 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00221 {
00222 TLOG(TLVL_INFO) << "soft_initialize method called with "
00223 << "ParameterSet = \"" << pset.to_string()
00224 << "\"." ;
00225 return initialize(pset, e, f);
00226 }
00227
00228 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00229 {
00230 TLOG(TLVL_INFO) << "reinitialize method called with "
00231 << "ParameterSet = \"" << pset.to_string()
00232 << "\"." ;
00233 return initialize(pset, e, f);
00234 }
00235
00236 void artdaq::RoutingMasterCore::process_event_table()
00237 {
00238 if (rt_priority_ > 0)
00239 {
00240 #pragma GCC diagnostic push
00241 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00242 sched_param s_param = {};
00243 s_param.sched_priority = rt_priority_;
00244 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00245 TLOG(TLVL_WARNING) << "setting realtime priority failed" ;
00246 #pragma GCC diagnostic pop
00247 }
00248
00249
00250
00251
00252 if (rt_priority_ > 0)
00253 {
00254 #pragma GCC diagnostic push
00255 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00256 sched_param s_param = {};
00257 s_param.sched_priority = rt_priority_;
00258 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00259 if (status != 0)
00260 {
00261 TLOG(TLVL_ERROR)
00262 << "Failed to set realtime priority to " << rt_priority_
00263 << ", return code = " << status ;
00264 }
00265 #pragma GCC diagnostic pop
00266 }
00267
00268
00269
00270 TLOG(TLVL_DEBUG) << "Sending initial table." ;
00271 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00272 auto nextSendTime = startTime;
00273 double delta_time;
00274 while (!stop_requested_ && !pause_requested_)
00275 {
00276 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00277
00278 if (startTime >= nextSendTime)
00279 {
00280 auto table = policy_->GetCurrentTable();
00281 if (table.size() > 0)
00282 {
00283 send_event_table(table);
00284 ++table_update_count_;
00285 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00286 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00287 TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time ;
00288 }
00289 else
00290 {
00291 TLOG(TLVL_DEBUG) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! This most likely means that the receivers are not keeping up!" ;
00292 }
00293 auto max_tokens = policy_->GetMaxNumberOfTokens();
00294 if (max_tokens > 0)
00295 {
00296 auto frac = table.size() / static_cast<double>(max_tokens);
00297 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00298 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00299 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00300 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00301 }
00302 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00303 TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_ ;
00304 }
00305 else
00306 {
00307 usleep(current_table_interval_ms_ * 10);
00308 }
00309 }
00310
00311 metricMan->do_stop();
00312 }
00313
00314 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00315 {
00316
00317 if (table_socket_ == -1)
00318 {
00319 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00320 if (table_socket_ < 0)
00321 {
00322 TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! Errno: " << errno ;
00323 exit(1);
00324 }
00325 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00326 if (sts == -1)
00327 {
00328 TLOG(TLVL_ERROR) << "Unable to resolve table_update_address" ;
00329 exit(1);
00330 }
00331
00332 auto yes = 1;
00333 if (receive_address_ != "localhost")
00334 {
00335 TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ ;
00336 struct in_addr addr;
00337 sts = ResolveHost(receive_address_.c_str(), addr);
00338 if (sts == -1)
00339 {
00340 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00341 }
00342
00343 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00344 {
00345 throw art::Exception(art::errors::Configuration) <<
00346 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00347 exit(1);
00348 }
00349
00350 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
00351 {
00352 TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket" ;
00353 exit(1);
00354 }
00355 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00356 {
00357 TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno ;
00358 exit(1);
00359 }
00360 }
00361 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00362 {
00363 TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno ;
00364 exit(1);
00365 }
00366 }
00367
00368
00369 if (ack_socket_ == -1)
00370 {
00371 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00372 if (ack_socket_ < 0)
00373 {
00374 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00375 exit(1);
00376 }
00377
00378 struct sockaddr_in si_me_request;
00379
00380 auto yes = 1;
00381 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00382 {
00383 throw art::Exception(art::errors::Configuration) <<
00384 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00385 exit(1);
00386 }
00387 memset(&si_me_request, 0, sizeof(si_me_request));
00388 si_me_request.sin_family = AF_INET;
00389 si_me_request.sin_port = htons(receive_acks_port_);
00390 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00391 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00392 {
00393 throw art::Exception(art::errors::Configuration) <<
00394 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00395 exit(1);
00396 }
00397 TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ ;
00398 }
00399
00400 auto acks = std::unordered_map<int, bool>();
00401 for (auto& r : sender_ranks_)
00402 {
00403 acks[r] = false;
00404 }
00405 auto counter = 0U;
00406 auto start_time = std::chrono::steady_clock::now();
00407 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0 && !stop_requested_)
00408 {
00409
00410 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00411 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00412
00413 TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ ;
00414 TRACE(16,"headerData:0x%016lx%016lx packetData:0x%016lx%016lx"
00415 ,((unsigned long*)&header)[0],((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0],((unsigned long*)&packet[0])[1] );
00416 auto hdrsts = sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_));
00417 if (hdrsts != sizeof(detail::RoutingPacketHeader))
00418 {
00419 TLOG(TLVL_ERROR) << "Error sending routing message header. hdrsts=" << hdrsts;
00420 }
00421 auto pktsts = sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_));
00422 if (pktsts != (ssize_t)packetSize)
00423 {
00424 TLOG(TLVL_ERROR) << "Error sending routing message data. hdrsts="<<hdrsts<<" pktsts="<<pktsts;
00425 }
00426
00427
00428
00429 auto first = packet[0].sequence_id;
00430 auto last = packet.rbegin()->sequence_id;
00431 TLOG(TLVL_DEBUG) << "Sent " << hdrsts <<"+"<< pktsts << ". Expecting acks to have first= " << first << ", and last= " << last ;
00432
00433
00434 auto startTime = std::chrono::steady_clock::now();
00435 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00436 {
00437 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00438 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
00439 {
00440 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00441 {
00442 TLOG(TLVL_ERROR) << "Did not receive acks from all senders after resending table " << counter
00443 << " times during the table_update_interval. Check the status of the senders!" ;
00444 break;
00445 }
00446 TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update" ;
00447 break;
00448 }
00449
00450 TLOG(20) << "send_event_table: Polling Request socket for new requests" ;
00451 auto ready = true;
00452 while (ready)
00453 {
00454 detail::RoutingAckPacket buffer;
00455 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00456 {
00457 if (errno == EWOULDBLOCK || errno == EAGAIN)
00458 {
00459 TLOG(20) << "send_event_table: No more ack datagrams on ack socket." ;
00460 ready = false;
00461 }
00462 else
00463 {
00464 TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive" ;
00465 exit(2);
00466 }
00467 }
00468 else
00469 {
00470 TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id
00471 << " and last= " << buffer.last_sequence_id ;
00472 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00473 {
00474 TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << "." ;
00475 acks[buffer.rank] = true;
00476 TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
00477 << " acks outstanding" ;
00478 }
00479 else
00480 {
00481 if (!acks.count(buffer.rank))
00482 {
00483 TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
00484 << " Cross-talk between RoutingMasters means there's a configuration error!" ;
00485 }
00486 else
00487 {
00488 TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
00489 << " that had incorrect sequence ID information. Discarding." ;
00490 }
00491 }
00492 }
00493 }
00494 usleep(table_ack_wait_time_ms * 1000 / 10);
00495 }
00496 }
00497 if (metricMan)
00498 {
00499 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
00500 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
00501 }
00502 }
00503
00504 void artdaq::RoutingMasterCore::receive_tokens_()
00505 {
00506 while (!shutdown_requested_)
00507 {
00508 TLOG(TLVL_DEBUG) << "Receive Token loop start" ;
00509 if (token_socket_ == -1)
00510 {
00511 TLOG(TLVL_DEBUG) << "Opening token listener socket" ;
00512 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00513 fcntl(token_socket_, F_SETFL, O_NONBLOCK);
00514
00515 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00516 struct epoll_event ev;
00517 token_epoll_fd_ = epoll_create1(0);
00518 ev.events = EPOLLIN | EPOLLPRI;
00519 ev.data.fd = token_socket_;
00520 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00521 {
00522 TLOG(TLVL_ERROR) << "Could not register listen socket to epoll fd" ;
00523 exit(3);
00524 }
00525 }
00526 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00527 {
00528 TLOG(TLVL_DEBUG) << "One of the listen sockets was not opened successfully." ;
00529 return;
00530 }
00531
00532 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00533 if (nfds == -1)
00534 {
00535 perror("epoll_wait");
00536 exit(EXIT_FAILURE);
00537 }
00538
00539 TLOG(TLVL_DEBUG) << "Received " << nfds << " events" ;
00540 for (auto n = 0; n < nfds; ++n)
00541 {
00542 if (receive_token_events_[n].data.fd == token_socket_)
00543 {
00544 TLOG(TLVL_DEBUG) << "Accepting new connection on token_socket" ;
00545 sockaddr_in addr;
00546 socklen_t arglen = sizeof(addr);
00547 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00548 fcntl(conn_sock, F_SETFL, O_NONBLOCK);
00549
00550 if (conn_sock == -1)
00551 {
00552 perror("accept");
00553 exit(EXIT_FAILURE);
00554 }
00555
00556 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00557 TLOG(TLVL_DEBUG) << "New fd is " << conn_sock << " for receiver at " << receive_token_addrs_[conn_sock];
00558 struct epoll_event ev;
00559 ev.events = EPOLLIN | EPOLLET;
00560 ev.data.fd = conn_sock;
00561 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
00562 {
00563 perror("epoll_ctl: conn_sock");
00564 exit(EXIT_FAILURE);
00565 }
00566 }
00567 else
00568 {
00569
00570
00571
00572
00573
00574
00575
00576
00577
00578 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00579 bool reading = true;
00580 int sts = 0;
00581 while(reading)
00582 {
00583 detail::RoutingToken buff;
00584 sts += read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken) - sts);
00585 if (sts == 0)
00586 {
00587 TLOG(TLVL_INFO) << "Received 0-size token from " << receive_token_addrs_[receive_token_events_[n].data.fd];
00588 reading = false;
00589 }
00590 else if(sts < 0 && errno == EAGAIN)
00591 {
00592 TLOG(TLVL_DEBUG) << "No more tokens from this rank. Continuing poll loop.";
00593 reading = false;
00594 }
00595 else if(sts < 0)
00596 {
00597 TLOG(TLVL_ERROR) << "Error reading from token socket: sts=" << sts << ", errno=" << errno;
00598 receive_token_addrs_.erase(receive_token_events_[n].data.fd);
00599 close(receive_token_events_[n].data.fd);
00600 epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL);
00601 reading = false;
00602 }
00603 else if (sts == sizeof(detail::RoutingToken) && buff.header != TOKEN_MAGIC)
00604 {
00605 TLOG(TLVL_ERROR) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << " sts=" << sts;
00606 reading = false;
00607 }
00608 else if(sts == sizeof(detail::RoutingToken))
00609 {
00610 sts = 0;
00611 TLOG(TLVL_DEBUG) << "Received token from " << buff.rank << " indicating " << buff.new_slots_free << " slots are free." ;
00612 received_token_count_ += buff.new_slots_free;
00613 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00614 {
00615 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00616 }
00617 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00618 {
00619 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00620 received_token_counter_[buff.rank] += buff.new_slots_free;
00621 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." ;
00622 while (received_token_counter_[buff.rank] >= sender_ranks_.size())
00623 {
00624 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00625 << "... Sending token to policy" ;
00626 policy_->AddReceiverToken(buff.rank, 1);
00627 received_token_counter_[buff.rank] -= sender_ranks_.size();
00628 }
00629 }
00630 }
00631 }
00632 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00633 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00634 bool readyToReport = statsHelper_.readyToReport(delta_time);
00635 if (readyToReport)
00636 {
00637 std::string statString = buildStatisticsString_();
00638 TLOG(TLVL_INFO) << statString;
00639 sendMetrics_();
00640 }
00641 }
00642 }
00643 }
00644 }
00645
00646 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00647 {
00648 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00649 boost::thread::attributes attrs;
00650 attrs.set_stack_size(4096 * 2000);
00651
00652 TLOG(TLVL_INFO) << "Starting Token Reception Thread" ;
00653 try {
00654 ev_token_receive_thread_ = boost::thread(attrs, boost::bind(&RoutingMasterCore::receive_tokens_, this));
00655 }
00656 catch(boost::exception const& e)
00657 {
00658 std::cerr << "Exception encountered starting Token Reception thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00659 exit(3);
00660 }
00661 TLOG(TLVL_INFO) << "Started Token Reception Thread";
00662 }
00663
00664 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00665 {
00666 std::string resultString;
00667
00668
00669 auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
00670 + ", table updates sent = " + std::to_string(table_update_count_)
00671 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00672 return tmpString;
00673 }
00674
00675 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00676 {
00677 std::ostringstream oss;
00678 oss << app_name << " statistics:" << std::endl;
00679
00680 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00681 if (mqPtr.get() != nullptr)
00682 {
00683 artdaq::MonitoredQuantityStats stats;
00684 mqPtr->getStats(stats);
00685 oss << " Table Update statistics: "
00686 << stats.recentSampleCount << " table updates sent at "
00687 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00688 << stats.recentDuration << " sec" << std::endl;
00689 oss << " Average times per table update: ";
00690 if (stats.recentSampleRate > 0.0)
00691 {
00692 oss << " elapsed time = "
00693 << (1.0 / stats.recentSampleRate) << " sec";
00694 }
00695 oss << ", avg table acknowledgement wait time = "
00696 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00697 }
00698
00699 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00700 if (mqPtr.get() != nullptr)
00701 {
00702 artdaq::MonitoredQuantityStats stats;
00703 mqPtr->getStats(stats);
00704 oss << " Received Token statistics: "
00705 << stats.recentSampleCount << " tokens received at "
00706 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00707 << stats.recentDuration << " sec" << std::endl;
00708 oss << " Average times per token: ";
00709 if (stats.recentSampleRate > 0.0)
00710 {
00711 oss << " elapsed time = "
00712 << (1.0 / stats.recentSampleRate) << " sec";
00713 }
00714 oss << ", input token wait time = "
00715 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00716 }
00717
00718 return oss.str();
00719 }
00720
00721 void artdaq::RoutingMasterCore::sendMetrics_()
00722 {
00723 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00724 if (mqPtr.get() != nullptr)
00725 {
00726 artdaq::MonitoredQuantityStats stats;
00727 mqPtr->getStats(stats);
00728 metricMan->sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
00729 metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00730 metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
00731 }
00732
00733 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00734 if (mqPtr.get() != nullptr)
00735 {
00736 artdaq::MonitoredQuantityStats stats;
00737 mqPtr->getStats(stats);
00738 metricMan->sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
00739 metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00740 metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
00741 }
00742 }