// artdaq v3_04_00
00001 #include <sys/un.h> 00002 #include <sys/time.h> 00003 #include <sys/epoll.h> 00004 #include <arpa/inet.h> 00005 #include <netdb.h> 00006 #include <pthread.h> 00007 #include <sched.h> 00008 #include <algorithm> 00009 00010 #include "canvas/Utilities/Exception.h" 00011 #include "cetlib_except/exception.h" 00012 00013 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // include these 2 first - 00014 #include "artdaq/DAQdata/Globals.hh" // to get tracemf.h before trace.h 00015 #include "artdaq-core/Data/Fragment.hh" 00016 #include "artdaq-core/Utilities/ExceptionHandler.hh" 00017 00018 #include "artdaq/Application/RoutingMasterCore.hh" 00019 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh" 00020 #include "artdaq/DAQdata/TCP_listen_fd.hh" 00021 #include "artdaq/DAQdata/TCPConnect.hh" 00022 00023 const std::string artdaq::RoutingMasterCore:: 00024 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates"); 00025 const std::string artdaq::RoutingMasterCore:: 00026 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived"); 00027 00028 artdaq::RoutingMasterCore::RoutingMasterCore() 00029 : received_token_counter_() 00030 , shutdown_requested_(false) 00031 , stop_requested_(true) 00032 , pause_requested_(false) 00033 , token_socket_(-1) 00034 , table_socket_(-1) 00035 , ack_socket_(-1) 00036 { 00037 TLOG(TLVL_DEBUG) << "Constructor" ; 00038 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY); 00039 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY); 00040 } 00041 00042 artdaq::RoutingMasterCore::~RoutingMasterCore() 00043 { 00044 TLOG(TLVL_DEBUG) << "Destructor" ; 00045 artdaq::StatisticsCollection::getInstance().requestStop(); 00046 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join(); 00047 } 00048 00049 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t) 00050 { 00051 TLOG(TLVL_DEBUG) << "initialize method called with " 00052 << "ParameterSet = \"" << 
pset.to_string() 00053 << "\"." ; 00054 00055 // pull out the relevant parts of the ParameterSet 00056 fhicl::ParameterSet daq_pset; 00057 try 00058 { 00059 daq_pset = pset.get<fhicl::ParameterSet>("daq"); 00060 } 00061 catch (...) 00062 { 00063 TLOG(TLVL_ERROR) 00064 << "Unable to find the DAQ parameters in the initialization " 00065 << "ParameterSet: \"" + pset.to_string() + "\"." ; 00066 return false; 00067 } 00068 00069 if (daq_pset.has_key("rank")) 00070 { 00071 if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank) { 00072 TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!"; 00073 } 00074 my_rank = daq_pset.get<int>("rank"); 00075 } 00076 if (my_rank == -1) 00077 { 00078 TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting"; 00079 exit(1); 00080 } 00081 00082 try 00083 { 00084 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy"); 00085 } 00086 catch (...) 00087 { 00088 TLOG(TLVL_ERROR) 00089 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." ; 00090 return false; 00091 } 00092 00093 // pull out the Metric part of the ParameterSet 00094 fhicl::ParameterSet metric_pset; 00095 try 00096 { 00097 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics"); 00098 } 00099 catch (...) {} // OK if there's no metrics table defined in the FHiCL 00100 00101 if (metric_pset.is_empty()) 00102 { 00103 TLOG(TLVL_INFO) << "No metric plugins appear to be defined" ; 00104 } 00105 try 00106 { 00107 metricMan->initialize(metric_pset, app_name); 00108 } 00109 catch (...) 
00110 { 00111 ExceptionHandler(ExceptionHandlerRethrow::no, 00112 "Error loading metrics in RoutingMasterCore::initialize()"); 00113 } 00114 00115 // create the requested CommandableFragmentGenerator 00116 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", ""); 00117 if (policy_plugin_spec.length() == 0) 00118 { 00119 TLOG(TLVL_ERROR) 00120 << "No fragment generator (parameter name = \"policy\") was " 00121 << "specified in the policy ParameterSet. The " 00122 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." ; 00123 return false; 00124 } 00125 try 00126 { 00127 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_); 00128 } 00129 catch (...) 00130 { 00131 std::stringstream exception_string; 00132 exception_string << "Exception thrown during initialization of policy of type \"" 00133 << policy_plugin_spec << "\""; 00134 00135 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str()); 00136 00137 TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() ; 00138 00139 return false; 00140 } 00141 00142 rt_priority_ = daq_pset.get<int>("rt_priority", 0); 00143 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks"); 00144 num_receivers_ = policy_->GetReceiverCount(); 00145 00146 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size()); 00147 receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1); 00148 00149 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false); 00150 routing_mode_ = mode ? 
detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID; 00151 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000); 00152 current_table_interval_ms_ = max_table_update_interval_ms_; 00153 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5); 00154 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555); 00155 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556); 00156 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557); 00157 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28"); 00158 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost"); 00159 00160 // fetch the monitoring parameters and create the MonitoredQuantity instances 00161 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY); 00162 00163 shutdown_requested_.store(false); 00164 start_recieve_token_thread_(); 00165 return true; 00166 } 00167 00168 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t) 00169 { 00170 run_id_ = id; 00171 stop_requested_.store(false); 00172 pause_requested_.store(false); 00173 00174 statsHelper_.resetStatistics(); 00175 00176 metricMan->do_start(); 00177 table_update_count_ = 0; 00178 received_token_count_ = 0; 00179 00180 TLOG(TLVL_INFO) << "Started run " << run_id_.run() ; 00181 return true; 00182 } 00183 00184 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t) 00185 { 00186 TLOG(TLVL_INFO) << "Stopping run " << run_id_.run() 00187 << " after " << table_update_count_ << " table updates." 00188 << " and " << received_token_count_ << " received tokens." 
; 00189 stop_requested_.store(true); 00190 run_id_ = art::RunID::flushRun(); 00191 return true; 00192 } 00193 00194 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t) 00195 { 00196 TLOG(TLVL_INFO) << "Pausing run " << run_id_.run() 00197 << " after " << table_update_count_ << " table updates." 00198 << " and " << received_token_count_ << " received tokens." ; 00199 pause_requested_.store(true); 00200 return true; 00201 } 00202 00203 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t) 00204 { 00205 TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run(); 00206 pause_requested_.store(false); 00207 metricMan->do_start(); 00208 return true; 00209 } 00210 00211 bool artdaq::RoutingMasterCore::shutdown(uint64_t) 00212 { 00213 shutdown_requested_.store(true); 00214 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join(); 00215 policy_.reset(nullptr); 00216 metricMan->shutdown(); 00217 return true; 00218 } 00219 00220 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f) 00221 { 00222 TLOG(TLVL_INFO) << "soft_initialize method called with " 00223 << "ParameterSet = \"" << pset.to_string() 00224 << "\"." ; 00225 return initialize(pset, e, f); 00226 } 00227 00228 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f) 00229 { 00230 TLOG(TLVL_INFO) << "reinitialize method called with " 00231 << "ParameterSet = \"" << pset.to_string() 00232 << "\"." 
; 00233 return initialize(pset, e, f); 00234 } 00235 00236 void artdaq::RoutingMasterCore::process_event_table() 00237 { 00238 if (rt_priority_ > 0) 00239 { 00240 #pragma GCC diagnostic push 00241 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" 00242 sched_param s_param = {}; 00243 s_param.sched_priority = rt_priority_; 00244 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param)) 00245 TLOG(TLVL_WARNING) << "setting realtime priority failed" ; 00246 #pragma GCC diagnostic pop 00247 } 00248 00249 // try-catch block here? 00250 00251 // how to turn RT PRI off? 00252 if (rt_priority_ > 0) 00253 { 00254 #pragma GCC diagnostic push 00255 #pragma GCC diagnostic ignored "-Wmissing-field-initializers" 00256 sched_param s_param = {}; 00257 s_param.sched_priority = rt_priority_; 00258 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param); 00259 if (status != 0) 00260 { 00261 TLOG(TLVL_ERROR) 00262 << "Failed to set realtime priority to " << rt_priority_ 00263 << ", return code = " << status ; 00264 } 00265 #pragma GCC diagnostic pop 00266 } 00267 00268 //MPI_Barrier(local_group_comm_); 00269 00270 TLOG(TLVL_DEBUG) << "Sending initial table." ; 00271 auto startTime = artdaq::MonitoredQuantity::getCurrentTime(); 00272 auto nextSendTime = startTime; 00273 double delta_time; 00274 while (!stop_requested_ && !pause_requested_) 00275 { 00276 startTime = artdaq::MonitoredQuantity::getCurrentTime(); 00277 00278 if (startTime >= nextSendTime) 00279 { 00280 auto table = policy_->GetCurrentTable(); 00281 if (table.size() > 0) 00282 { 00283 send_event_table(table); 00284 ++table_update_count_; 00285 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime; 00286 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time); 00287 TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time ; 00288 } 00289 else 00290 { 00291 TLOG(TLVL_DEBUG) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! 
This most likely means that the receivers are not keeping up!" ; 00292 } 00293 auto max_tokens = policy_->GetMaxNumberOfTokens(); 00294 if (max_tokens > 0) 00295 { 00296 auto frac = table.size() / static_cast<double>(max_tokens); 00297 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10; 00298 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10; 00299 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_; 00300 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1; 00301 } 00302 nextSendTime = startTime + current_table_interval_ms_ / 1000.0; 00303 TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_ ; 00304 } 00305 else 00306 { 00307 usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval 00308 } 00309 } 00310 00311 policy_->Reset(); 00312 metricMan->do_stop(); 00313 } 00314 00315 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet) 00316 { 00317 // Reconnect table socket, if necessary 00318 if (table_socket_ == -1) 00319 { 00320 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00321 if (table_socket_ < 0) 00322 { 00323 TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! 
Errno: " << errno ; 00324 exit(1); 00325 } 00326 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_); 00327 if (sts == -1) 00328 { 00329 TLOG(TLVL_ERROR) << "Unable to resolve table_update_address" ; 00330 exit(1); 00331 } 00332 00333 auto yes = 1; 00334 if (receive_address_ != "localhost") 00335 { 00336 TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ ; 00337 struct in_addr addr; 00338 sts = ResolveHost(receive_address_.c_str(), addr); 00339 if (sts == -1) 00340 { 00341 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;; 00342 } 00343 00344 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00345 { 00346 throw art::Exception(art::errors::Configuration) << 00347 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl; 00348 exit(1); 00349 } 00350 00351 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0) 00352 { 00353 TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket" ; 00354 exit(1); 00355 } 00356 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1) 00357 { 00358 TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno ; 00359 exit(1); 00360 } 00361 } 00362 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1) 00363 { 00364 TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno ; 00365 exit(1); 00366 } 00367 } 00368 00369 // Reconnect ack socket, if necessary 00370 if (ack_socket_ == -1) 00371 { 00372 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00373 if (ack_socket_ < 0) 00374 { 00375 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" 
<< std::endl; 00376 exit(1); 00377 } 00378 00379 struct sockaddr_in si_me_request; 00380 00381 auto yes = 1; 00382 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00383 { 00384 throw art::Exception(art::errors::Configuration) << 00385 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl; 00386 exit(1); 00387 } 00388 memset(&si_me_request, 0, sizeof(si_me_request)); 00389 si_me_request.sin_family = AF_INET; 00390 si_me_request.sin_port = htons(receive_acks_port_); 00391 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY); 00392 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1) 00393 { 00394 throw art::Exception(art::errors::Configuration) << 00395 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl; 00396 exit(1); 00397 } 00398 TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ ; 00399 } 00400 00401 auto acks = std::unordered_map<int, bool>(); 00402 for (auto& r : sender_ranks_) 00403 { 00404 acks[r] = false; 00405 } 00406 auto counter = 0U; 00407 auto start_time = std::chrono::steady_clock::now(); 00408 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0 && !stop_requested_) 00409 { 00410 // Send table update 00411 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size()); 00412 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size(); 00413 00414 assert(packetSize + sizeof(header) < MAX_ROUTING_TABLE_SIZE); 00415 std::vector<uint8_t> buffer(packetSize + sizeof(header)); 00416 memcpy(&buffer[0], &header, sizeof(detail::RoutingPacketHeader)); 00417 memcpy(&buffer[sizeof(detail::RoutingPacketHeader)], &packet[0], packetSize); 00418 00419 TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ ; 00420 
TRACE(16,"headerData:0x%016lx%016lx packetData:0x%016lx%016lx" 00421 ,((unsigned long*)&header)[0],((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0],((unsigned long*)&packet[0])[1] ); 00422 auto sts = sendto(table_socket_, &buffer[0], buffer.size(), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)); 00423 if (sts != static_cast<ssize_t>(buffer.size())) 00424 { 00425 TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts; 00426 } 00427 00428 // Collect acks 00429 00430 auto first = packet[0].sequence_id; 00431 auto last = packet.rbegin()->sequence_id; 00432 TLOG(TLVL_DEBUG) << "Sent " << sts << " bytes. Expecting acks to have first= " << first << ", and last= " << last ; 00433 00434 00435 auto startTime = std::chrono::steady_clock::now(); 00436 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0) 00437 { 00438 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_; 00439 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms) 00440 { 00441 if (counter > max_ack_cycle_count_ && table_update_count_ > 0) 00442 { 00443 TLOG(TLVL_ERROR) << "Did not receive acks from all senders after resending table " << counter 00444 << " times during the table_update_interval. Check the status of the senders!" ; 00445 break; 00446 } 00447 TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update" ; 00448 break; 00449 } 00450 00451 TLOG(20) << "send_event_table: Polling Request socket for new requests" ; 00452 auto ready = true; 00453 while (ready) 00454 { 00455 detail::RoutingAckPacket buffer; 00456 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0) 00457 { 00458 if (errno == EWOULDBLOCK || errno == EAGAIN) 00459 { 00460 TLOG(20) << "send_event_table: No more ack datagrams on ack socket." 
; 00461 ready = false; 00462 } 00463 else 00464 { 00465 TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive" ; 00466 exit(2); 00467 } 00468 } 00469 else 00470 { 00471 TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id 00472 << " and last= " << buffer.last_sequence_id ; 00473 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last) 00474 { 00475 TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << "." ; 00476 acks[buffer.rank] = true; 00477 TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) 00478 << " acks outstanding" ; 00479 } 00480 else 00481 { 00482 if (!acks.count(buffer.rank)) 00483 { 00484 TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!" 00485 << " Cross-talk between RoutingMasters means there's a configuration error!" ; 00486 } 00487 else 00488 { 00489 TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank 00490 << " that had incorrect sequence ID information. Discarding." 
00491 << " Expected first/last=" << first <<"/"<< last 00492 << " recvd=" << buffer.first_sequence_id <<"/"<< buffer.last_sequence_id; 00493 } 00494 } 00495 } 00496 } 00497 usleep(table_ack_wait_time_ms * 1000 / 10); 00498 } 00499 } 00500 if (metricMan) 00501 { 00502 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time; 00503 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average); 00504 } 00505 } 00506 00507 void artdaq::RoutingMasterCore::receive_tokens_() 00508 { 00509 while (!shutdown_requested_) 00510 { 00511 TLOG(TLVL_DEBUG) << "Receive Token loop start" ; 00512 if (token_socket_ == -1) 00513 { 00514 TLOG(TLVL_DEBUG) << "Opening token listener socket" ; 00515 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken)); 00516 fcntl(token_socket_, F_SETFL, O_NONBLOCK); // set O_NONBLOCK 00517 00518 if (token_epoll_fd_ != -1) close(token_epoll_fd_); 00519 struct epoll_event ev; 00520 token_epoll_fd_ = epoll_create1(0); 00521 ev.events = EPOLLIN | EPOLLPRI; 00522 ev.data.fd = token_socket_; 00523 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1) 00524 { 00525 TLOG(TLVL_ERROR) << "Could not register listen socket to epoll fd" ; 00526 exit(3); 00527 } 00528 } 00529 if (token_socket_ == -1 || token_epoll_fd_ == -1) 00530 { 00531 TLOG(TLVL_DEBUG) << "One of the listen sockets was not opened successfully." 
; 00532 return; 00533 } 00534 00535 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_); 00536 if (nfds == -1) 00537 { 00538 perror("epoll_wait"); 00539 exit(EXIT_FAILURE); 00540 } 00541 00542 while (stop_requested_ && !shutdown_requested_) 00543 { 00544 usleep(10000); 00545 } 00546 00547 TLOG(TLVL_DEBUG) << "Received " << nfds << " events" ; 00548 for (auto n = 0; n < nfds; ++n) 00549 { 00550 if (receive_token_events_[n].data.fd == token_socket_) 00551 { 00552 TLOG(TLVL_DEBUG) << "Accepting new connection on token_socket" ; 00553 sockaddr_in addr; 00554 socklen_t arglen = sizeof(addr); 00555 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen); 00556 fcntl(conn_sock, F_SETFL, O_NONBLOCK); // set O_NONBLOCK 00557 00558 if (conn_sock == -1) 00559 { 00560 perror("accept"); 00561 exit(EXIT_FAILURE); 00562 } 00563 00564 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr)); 00565 TLOG(TLVL_DEBUG) << "New fd is " << conn_sock << " for receiver at " << receive_token_addrs_[conn_sock]; 00566 struct epoll_event ev; 00567 ev.events = EPOLLIN | EPOLLET; 00568 ev.data.fd = conn_sock; 00569 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1) 00570 { 00571 perror("epoll_ctl: conn_sock"); 00572 exit(EXIT_FAILURE); 00573 } 00574 } 00575 else 00576 { 00577 /*if (receive_token_events_[n].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)) 00578 { 00579 TLOG(TLVL_DEBUG) << "Closing connection on fd " << receive_token_events_[n].data.fd << " (" << receive_token_addrs_[receive_token_events_[n].data.fd] << ")"; 00580 receive_token_addrs_.erase(receive_token_events_[n].data.fd); 00581 close(receive_token_events_[n].data.fd); 00582 epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL); 00583 continue; 00584 }*/ 00585 00586 auto startTime = artdaq::MonitoredQuantity::getCurrentTime(); 00587 bool reading = true; 00588 int sts = 0; 00589 
while(reading) 00590 { 00591 detail::RoutingToken buff; 00592 sts += read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken) - sts); 00593 if (sts == 0) 00594 { 00595 TLOG(TLVL_INFO) << "Received 0-size token from " << receive_token_addrs_[receive_token_events_[n].data.fd]; 00596 reading = false; 00597 } 00598 else if(sts < 0 && errno == EAGAIN) 00599 { 00600 TLOG(TLVL_DEBUG) << "No more tokens from this rank. Continuing poll loop."; 00601 reading = false; 00602 } 00603 else if(sts < 0) 00604 { 00605 TLOG(TLVL_ERROR) << "Error reading from token socket: sts=" << sts << ", errno=" << errno; 00606 receive_token_addrs_.erase(receive_token_events_[n].data.fd); 00607 close(receive_token_events_[n].data.fd); 00608 epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL); 00609 reading = false; 00610 } 00611 else if (sts == sizeof(detail::RoutingToken) && buff.header != TOKEN_MAGIC) 00612 { 00613 TLOG(TLVL_ERROR) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << " sts=" << sts; 00614 reading = false; 00615 } 00616 else if(sts == sizeof(detail::RoutingToken)) 00617 { 00618 sts = 0; 00619 TLOG(TLVL_DEBUG) << "Received token from " << buff.rank << " indicating " << buff.new_slots_free << " slots are free. (run=" << buff.run_number << ")" ; 00620 if (buff.run_number != run_id_.run()) 00621 { 00622 TLOG(TLVL_DEBUG) << "Received token from a different run number! 
Current = " << run_id_.run() << ", token = " << buff.run_number << ", ignoring (n=" << buff.new_slots_free << ")"; 00623 } 00624 else 00625 { 00626 received_token_count_ += buff.new_slots_free; 00627 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID) 00628 { 00629 policy_->AddReceiverToken(buff.rank, buff.new_slots_free); 00630 } 00631 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount) 00632 { 00633 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0; 00634 received_token_counter_[buff.rank] += buff.new_slots_free; 00635 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." ; 00636 while (received_token_counter_[buff.rank] >= sender_ranks_.size()) 00637 { 00638 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() 00639 << "... 
Sending token to policy" ; 00640 policy_->AddReceiverToken(buff.rank, 1); 00641 received_token_counter_[buff.rank] -= sender_ranks_.size(); 00642 } 00643 } 00644 } 00645 } 00646 } 00647 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime; 00648 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time); 00649 bool readyToReport = statsHelper_.readyToReport(delta_time); 00650 if (readyToReport) 00651 { 00652 std::string statString = buildStatisticsString_(); 00653 TLOG(TLVL_INFO) << statString; 00654 sendMetrics_(); 00655 } 00656 } 00657 } 00658 } 00659 } 00660 00661 void artdaq::RoutingMasterCore::start_recieve_token_thread_() 00662 { 00663 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join(); 00664 boost::thread::attributes attrs; 00665 attrs.set_stack_size(4096 * 2000); // 8000 KB 00666 00667 TLOG(TLVL_INFO) << "Starting Token Reception Thread" ; 00668 try { 00669 ev_token_receive_thread_ = boost::thread(attrs, boost::bind(&RoutingMasterCore::receive_tokens_, this)); 00670 } 00671 catch(boost::exception const& e) 00672 { 00673 std::cerr << "Exception encountered starting Token Reception thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00674 exit(3); 00675 } 00676 TLOG(TLVL_INFO) << "Started Token Reception Thread"; 00677 } 00678 00679 std::string artdaq::RoutingMasterCore::report(std::string const&) const 00680 { 00681 std::string resultString; 00682 00683 // if we haven't been able to come up with any report so far, say so 00684 auto tmpString = app_name + " run number = " + std::to_string(run_id_.run()) 00685 + ", table updates sent = " + std::to_string(table_update_count_) 00686 + ", Receiver tokens received = " + std::to_string(received_token_count_); 00687 return tmpString; 00688 } 00689 00690 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const 00691 { 00692 std::ostringstream oss; 00693 oss << app_name << " statistics:" << std::endl; 00694 00695 auto mqPtr = 
artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY); 00696 if (mqPtr.get() != nullptr) 00697 { 00698 artdaq::MonitoredQuantityStats stats; 00699 mqPtr->getStats(stats); 00700 oss << " Table Update statistics: " 00701 << stats.recentSampleCount << " table updates sent at " 00702 << stats.recentSampleRate << " table updates/sec, , monitor window = " 00703 << stats.recentDuration << " sec" << std::endl; 00704 oss << " Average times per table update: "; 00705 if (stats.recentSampleRate > 0.0) 00706 { 00707 oss << " elapsed time = " 00708 << (1.0 / stats.recentSampleRate) << " sec"; 00709 } 00710 oss << ", avg table acknowledgement wait time = " 00711 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl; 00712 } 00713 00714 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY); 00715 if (mqPtr.get() != nullptr) 00716 { 00717 artdaq::MonitoredQuantityStats stats; 00718 mqPtr->getStats(stats); 00719 oss << " Received Token statistics: " 00720 << stats.recentSampleCount << " tokens received at " 00721 << stats.recentSampleRate << " tokens/sec, , monitor window = " 00722 << stats.recentDuration << " sec" << std::endl; 00723 oss << " Average times per token: "; 00724 if (stats.recentSampleRate > 0.0) 00725 { 00726 oss << " elapsed time = " 00727 << (1.0 / stats.recentSampleRate) << " sec"; 00728 } 00729 oss << ", input token wait time = " 00730 << mqPtr->getRecentValueSum() << " sec" << std::endl; 00731 } 00732 00733 return oss.str(); 00734 } 00735 00736 void artdaq::RoutingMasterCore::sendMetrics_() 00737 { 00738 if (metricMan) 00739 { 00740 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY); 00741 if (mqPtr.get() != nullptr) 00742 { 00743 artdaq::MonitoredQuantityStats stats; 00744 mqPtr->getStats(stats); 00745 metricMan->sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, 
MetricMode::LastPoint); 00746 metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average); 00747 metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average); 00748 } 00749 00750 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY); 00751 if (mqPtr.get() != nullptr) 00752 { 00753 artdaq::MonitoredQuantityStats stats; 00754 mqPtr->getStats(stats); 00755 metricMan->sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint); 00756 metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average); 00757 metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average); 00758 } 00759 } 00760 }