00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib_except/exception.h"
00012
#define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // NOTE: TRACE_NAME must be defined before the Globals.hh include below so the TRACE headers pick it up
00014 #include "artdaq/DAQdata/Globals.hh"
00015 #include "artdaq-core/Data/Fragment.hh"
00016 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00017
00018 #include "artdaq/Application/RoutingMasterCore.hh"
00019 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00020 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00021 #include "artdaq/DAQdata/TCPConnect.hh"
00022
// Names of the monitored quantities registered with the StatisticsHelper;
// samples are added in process_event_table() and receive_tokens_() and
// read back in buildStatisticsString_() and sendMetrics_().
const std::string artdaq::RoutingMasterCore::
TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
const std::string artdaq::RoutingMasterCore::
TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00027
00028 artdaq::RoutingMasterCore::RoutingMasterCore()
00029 : received_token_counter_()
00030 , shutdown_requested_(false)
00031 , stop_requested_(true)
00032 , pause_requested_(false)
00033 , token_socket_(-1)
00034 , table_socket_(-1)
00035 , ack_socket_(-1)
00036 {
00037 TLOG(TLVL_DEBUG) << "Constructor" ;
00038 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00039 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00040 }
00041
// Destructor: joins the token-reception thread (if it was ever started) so
// that receive_tokens_() can never touch members of a destroyed object.
artdaq::RoutingMasterCore::~RoutingMasterCore()
{
	TLOG(TLVL_DEBUG) << "Destructor" ;
	// NOTE(review): join without first setting shutdown_requested_ presumably
	// relies on shutdown() having been called earlier — confirm lifecycle.
	if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
}
00047
00048 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00049 {
00050 TLOG(TLVL_DEBUG) << "initialize method called with "
00051 << "ParameterSet = \"" << pset.to_string()
00052 << "\"." ;
00053
00054
00055 fhicl::ParameterSet daq_pset;
00056 try
00057 {
00058 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00059 }
00060 catch (...)
00061 {
00062 TLOG(TLVL_ERROR)
00063 << "Unable to find the DAQ parameters in the initialization "
00064 << "ParameterSet: \"" + pset.to_string() + "\"." ;
00065 return false;
00066 }
00067
00068 if (daq_pset.has_key("rank"))
00069 {
00070 if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank) {
00071 TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
00072 }
00073 my_rank = daq_pset.get<int>("rank");
00074 }
00075 if (my_rank == -1)
00076 {
00077 TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting";
00078 exit(1);
00079 }
00080
00081 try
00082 {
00083 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00084 }
00085 catch (...)
00086 {
00087 TLOG(TLVL_ERROR)
00088 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." ;
00089 return false;
00090 }
00091
00092
00093 fhicl::ParameterSet metric_pset;
00094 try
00095 {
00096 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00097 }
00098 catch (...) {}
00099
00100 if (metric_pset.is_empty())
00101 {
00102 TLOG(TLVL_INFO) << "No metric plugins appear to be defined" ;
00103 }
00104 try
00105 {
00106 metricMan->initialize(metric_pset, app_name);
00107 }
00108 catch (...)
00109 {
00110 ExceptionHandler(ExceptionHandlerRethrow::no,
00111 "Error loading metrics in RoutingMasterCore::initialize()");
00112 }
00113
00114
00115 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00116 if (policy_plugin_spec.length() == 0)
00117 {
00118 TLOG(TLVL_ERROR)
00119 << "No fragment generator (parameter name = \"policy\") was "
00120 << "specified in the policy ParameterSet. The "
00121 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." ;
00122 return false;
00123 }
00124 try
00125 {
00126 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00127 }
00128 catch (...)
00129 {
00130 std::stringstream exception_string;
00131 exception_string << "Exception thrown during initialization of policy of type \""
00132 << policy_plugin_spec << "\"";
00133
00134 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00135
00136 TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() ;
00137
00138 return false;
00139 }
00140
00141 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00142 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00143 num_receivers_ = policy_->GetReceiverCount();
00144
00145 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00146 receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
00147
00148 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00149 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00150 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00151 current_table_interval_ms_ = max_table_update_interval_ms_;
00152 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00153 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00154 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00155 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00156 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00157 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00158
00159
00160 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00161
00162 shutdown_requested_.store(false);
00163 start_recieve_token_thread_();
00164 return true;
00165 }
00166
00167 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00168 {
00169 run_id_ = id;
00170 stop_requested_.store(false);
00171 pause_requested_.store(false);
00172
00173 statsHelper_.resetStatistics();
00174
00175 metricMan->do_start();
00176 table_update_count_ = 0;
00177 received_token_count_ = 0;
00178
00179 TLOG(TLVL_INFO) << "Started run " << run_id_.run() ;
00180 return true;
00181 }
00182
00183 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00184 {
00185 TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
00186 << " after " << table_update_count_ << " table updates."
00187 << " and " << received_token_count_ << " received tokens." ;
00188 stop_requested_.store(true);
00189 run_id_ = art::RunID::flushRun();
00190 return true;
00191 }
00192
00193 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00194 {
00195 TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
00196 << " after " << table_update_count_ << " table updates."
00197 << " and " << received_token_count_ << " received tokens." ;
00198 pause_requested_.store(true);
00199 return true;
00200 }
00201
00202 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00203 {
00204 TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run();
00205 pause_requested_.store(false);
00206 metricMan->do_start();
00207 return true;
00208 }
00209
00210 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00211 {
00212 shutdown_requested_.store(true);
00213 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00214 policy_.reset(nullptr);
00215 metricMan->shutdown();
00216 return true;
00217 }
00218
00219 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00220 {
00221 TLOG(TLVL_INFO) << "soft_initialize method called with "
00222 << "ParameterSet = \"" << pset.to_string()
00223 << "\"." ;
00224 return initialize(pset, e, f);
00225 }
00226
00227 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00228 {
00229 TLOG(TLVL_INFO) << "reinitialize method called with "
00230 << "ParameterSet = \"" << pset.to_string()
00231 << "\"." ;
00232 return initialize(pset, e, f);
00233 }
00234
00235 void artdaq::RoutingMasterCore::process_event_table()
00236 {
00237 if (rt_priority_ > 0)
00238 {
00239 #pragma GCC diagnostic push
00240 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00241 sched_param s_param = {};
00242 s_param.sched_priority = rt_priority_;
00243 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00244 TLOG(TLVL_WARNING) << "setting realtime priority failed" ;
00245 #pragma GCC diagnostic pop
00246 }
00247
00248
00249
00250
00251 if (rt_priority_ > 0)
00252 {
00253 #pragma GCC diagnostic push
00254 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00255 sched_param s_param = {};
00256 s_param.sched_priority = rt_priority_;
00257 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00258 if (status != 0)
00259 {
00260 TLOG(TLVL_ERROR)
00261 << "Failed to set realtime priority to " << rt_priority_
00262 << ", return code = " << status ;
00263 }
00264 #pragma GCC diagnostic pop
00265 }
00266
00267
00268
00269 TLOG(TLVL_DEBUG) << "Sending initial table." ;
00270 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00271 auto nextSendTime = startTime;
00272 double delta_time;
00273 while (!stop_requested_ && !pause_requested_)
00274 {
00275 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00276
00277 if (startTime >= nextSendTime)
00278 {
00279 auto table = policy_->GetCurrentTable();
00280 if (table.size() > 0)
00281 {
00282 send_event_table(table);
00283 ++table_update_count_;
00284 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00285 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00286 TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time ;
00287 }
00288 else
00289 {
00290 TLOG(TLVL_DEBUG) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! This most likely means that the receivers are not keeping up!" ;
00291 }
00292 auto max_tokens = policy_->GetMaxNumberOfTokens();
00293 if (max_tokens > 0)
00294 {
00295 auto frac = table.size() / static_cast<double>(max_tokens);
00296 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00297 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00298 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00299 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00300 }
00301 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00302 TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_ ;
00303 }
00304 else
00305 {
00306 usleep(current_table_interval_ms_ * 10);
00307 }
00308 }
00309
00310 policy_->Reset();
00311 metricMan->do_stop();
00312 }
00313
00314 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00315 {
00316
00317 if (table_socket_ == -1)
00318 {
00319 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00320 if (table_socket_ < 0)
00321 {
00322 TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! Errno: " << errno ;
00323 exit(1);
00324 }
00325 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00326 if (sts == -1)
00327 {
00328 TLOG(TLVL_ERROR) << "Unable to resolve table_update_address" ;
00329 exit(1);
00330 }
00331
00332 auto yes = 1;
00333 if (receive_address_ != "localhost")
00334 {
00335 TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ ;
00336 struct in_addr addr;
00337 sts = ResolveHost(receive_address_.c_str(), addr);
00338 if (sts == -1)
00339 {
00340 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00341 }
00342
00343 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00344 {
00345 throw art::Exception(art::errors::Configuration) <<
00346 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00347 exit(1);
00348 }
00349
00350 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
00351 {
00352 TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket" ;
00353 exit(1);
00354 }
00355 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00356 {
00357 TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno ;
00358 exit(1);
00359 }
00360 }
00361 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00362 {
00363 TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno ;
00364 exit(1);
00365 }
00366 }
00367
00368
00369 if (ack_socket_ == -1)
00370 {
00371 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00372 if (ack_socket_ < 0)
00373 {
00374 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00375 exit(1);
00376 }
00377
00378 struct sockaddr_in si_me_request;
00379
00380 auto yes = 1;
00381 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00382 {
00383 throw art::Exception(art::errors::Configuration) <<
00384 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00385 exit(1);
00386 }
00387 memset(&si_me_request, 0, sizeof(si_me_request));
00388 si_me_request.sin_family = AF_INET;
00389 si_me_request.sin_port = htons(receive_acks_port_);
00390 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00391 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00392 {
00393 throw art::Exception(art::errors::Configuration) <<
00394 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00395 exit(1);
00396 }
00397 TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ ;
00398 }
00399
00400 auto acks = std::unordered_map<int, bool>();
00401 for (auto& r : sender_ranks_)
00402 {
00403 acks[r] = false;
00404 }
00405 auto counter = 0U;
00406 auto start_time = std::chrono::steady_clock::now();
00407 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0 && !stop_requested_)
00408 {
00409
00410 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00411 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00412
00413 assert(packetSize + sizeof(header) < MAX_ROUTING_TABLE_SIZE);
00414 std::vector<uint8_t> buffer(packetSize + sizeof(header));
00415 memcpy(&buffer[0], &header, sizeof(detail::RoutingPacketHeader));
00416 memcpy(&buffer[sizeof(detail::RoutingPacketHeader)], &packet[0], packetSize);
00417
00418 TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ ;
00419 TRACE(16,"headerData:0x%016lx%016lx packetData:0x%016lx%016lx"
00420 ,((unsigned long*)&header)[0],((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0],((unsigned long*)&packet[0])[1] );
00421 auto sts = sendto(table_socket_, &buffer[0], buffer.size(), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_));
00422 if (sts != static_cast<ssize_t>(buffer.size()))
00423 {
00424 TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts;
00425 }
00426
00427
00428
00429 auto first = packet[0].sequence_id;
00430 auto last = packet.rbegin()->sequence_id;
00431 TLOG(TLVL_DEBUG) << "Sent " << sts << " bytes. Expecting acks to have first= " << first << ", and last= " << last ;
00432
00433
00434 auto startTime = std::chrono::steady_clock::now();
00435 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00436 {
00437 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00438 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
00439 {
00440 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00441 {
00442 TLOG(TLVL_ERROR) << "Did not receive acks from all senders after resending table " << counter
00443 << " times during the table_update_interval. Check the status of the senders!" ;
00444 break;
00445 }
00446 TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update" ;
00447 break;
00448 }
00449
00450 TLOG(20) << "send_event_table: Polling Request socket for new requests" ;
00451 auto ready = true;
00452 while (ready)
00453 {
00454 detail::RoutingAckPacket buffer;
00455 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00456 {
00457 if (errno == EWOULDBLOCK || errno == EAGAIN)
00458 {
00459 TLOG(20) << "send_event_table: No more ack datagrams on ack socket." ;
00460 ready = false;
00461 }
00462 else
00463 {
00464 TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive" ;
00465 exit(2);
00466 }
00467 }
00468 else
00469 {
00470 TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id
00471 << " and last= " << buffer.last_sequence_id ;
00472 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00473 {
00474 TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << "." ;
00475 acks[buffer.rank] = true;
00476 TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
00477 << " acks outstanding" ;
00478 }
00479 else
00480 {
00481 if (!acks.count(buffer.rank))
00482 {
00483 TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
00484 << " Cross-talk between RoutingMasters means there's a configuration error!" ;
00485 }
00486 else
00487 {
00488 TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
00489 << " that had incorrect sequence ID information. Discarding."
00490 << " Expected first/last=" << first <<"/"<< last
00491 << " recvd=" << buffer.first_sequence_id <<"/"<< buffer.last_sequence_id;
00492 }
00493 }
00494 }
00495 }
00496 usleep(table_ack_wait_time_ms * 1000 / 10);
00497 }
00498 }
00499 if (metricMan)
00500 {
00501 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
00502 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
00503 }
00504 }
00505
00506 void artdaq::RoutingMasterCore::receive_tokens_()
00507 {
00508 while (!shutdown_requested_)
00509 {
00510 TLOG(TLVL_DEBUG) << "Receive Token loop start" ;
00511 if (token_socket_ == -1)
00512 {
00513 TLOG(TLVL_DEBUG) << "Opening token listener socket" ;
00514 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00515 fcntl(token_socket_, F_SETFL, O_NONBLOCK);
00516
00517 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00518 struct epoll_event ev;
00519 token_epoll_fd_ = epoll_create1(0);
00520 ev.events = EPOLLIN | EPOLLPRI;
00521 ev.data.fd = token_socket_;
00522 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00523 {
00524 TLOG(TLVL_ERROR) << "Could not register listen socket to epoll fd" ;
00525 exit(3);
00526 }
00527 }
00528 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00529 {
00530 TLOG(TLVL_DEBUG) << "One of the listen sockets was not opened successfully." ;
00531 return;
00532 }
00533
00534 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00535 if (nfds == -1)
00536 {
00537 perror("epoll_wait");
00538 exit(EXIT_FAILURE);
00539 }
00540
00541 while (stop_requested_ && !shutdown_requested_)
00542 {
00543 usleep(10000);
00544 }
00545
00546 TLOG(TLVL_DEBUG) << "Received " << nfds << " events" ;
00547 for (auto n = 0; n < nfds; ++n)
00548 {
00549 if (receive_token_events_[n].data.fd == token_socket_)
00550 {
00551 TLOG(TLVL_DEBUG) << "Accepting new connection on token_socket" ;
00552 sockaddr_in addr;
00553 socklen_t arglen = sizeof(addr);
00554 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00555 fcntl(conn_sock, F_SETFL, O_NONBLOCK);
00556
00557 if (conn_sock == -1)
00558 {
00559 perror("accept");
00560 exit(EXIT_FAILURE);
00561 }
00562
00563 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00564 TLOG(TLVL_DEBUG) << "New fd is " << conn_sock << " for receiver at " << receive_token_addrs_[conn_sock];
00565 struct epoll_event ev;
00566 ev.events = EPOLLIN | EPOLLET;
00567 ev.data.fd = conn_sock;
00568 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
00569 {
00570 perror("epoll_ctl: conn_sock");
00571 exit(EXIT_FAILURE);
00572 }
00573 }
00574 else
00575 {
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00586 bool reading = true;
00587 int sts = 0;
00588 while(reading)
00589 {
00590 detail::RoutingToken buff;
00591 sts += read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken) - sts);
00592 if (sts == 0)
00593 {
00594 TLOG(TLVL_INFO) << "Received 0-size token from " << receive_token_addrs_[receive_token_events_[n].data.fd];
00595 reading = false;
00596 }
00597 else if(sts < 0 && errno == EAGAIN)
00598 {
00599 TLOG(TLVL_DEBUG) << "No more tokens from this rank. Continuing poll loop.";
00600 reading = false;
00601 }
00602 else if(sts < 0)
00603 {
00604 TLOG(TLVL_ERROR) << "Error reading from token socket: sts=" << sts << ", errno=" << errno;
00605 receive_token_addrs_.erase(receive_token_events_[n].data.fd);
00606 close(receive_token_events_[n].data.fd);
00607 epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL);
00608 reading = false;
00609 }
00610 else if (sts == sizeof(detail::RoutingToken) && buff.header != TOKEN_MAGIC)
00611 {
00612 TLOG(TLVL_ERROR) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << " sts=" << sts;
00613 reading = false;
00614 }
00615 else if(sts == sizeof(detail::RoutingToken))
00616 {
00617 sts = 0;
00618 TLOG(TLVL_DEBUG) << "Received token from " << buff.rank << " indicating " << buff.new_slots_free << " slots are free. (run=" << buff.run_number << ")" ;
00619 if (buff.run_number != run_id_.run())
00620 {
00621 TLOG(TLVL_DEBUG) << "Received token from a different run number! Current = " << run_id_.run() << ", token = " << buff.run_number << ", ignoring (n=" << buff.new_slots_free << ")";
00622 }
00623 else
00624 {
00625 received_token_count_ += buff.new_slots_free;
00626 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00627 {
00628 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00629 }
00630 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00631 {
00632 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00633 received_token_counter_[buff.rank] += buff.new_slots_free;
00634 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." ;
00635 while (received_token_counter_[buff.rank] >= sender_ranks_.size())
00636 {
00637 TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00638 << "... Sending token to policy" ;
00639 policy_->AddReceiverToken(buff.rank, 1);
00640 received_token_counter_[buff.rank] -= sender_ranks_.size();
00641 }
00642 }
00643 }
00644 }
00645 }
00646 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00647 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00648 bool readyToReport = statsHelper_.readyToReport(delta_time);
00649 if (readyToReport)
00650 {
00651 std::string statString = buildStatisticsString_();
00652 TLOG(TLVL_INFO) << statString;
00653 sendMetrics_();
00654 }
00655 }
00656 }
00657 }
00658 }
00659
00660 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00661 {
00662 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00663 boost::thread::attributes attrs;
00664 attrs.set_stack_size(4096 * 2000);
00665
00666 TLOG(TLVL_INFO) << "Starting Token Reception Thread" ;
00667 try {
00668 ev_token_receive_thread_ = boost::thread(attrs, boost::bind(&RoutingMasterCore::receive_tokens_, this));
00669 }
00670 catch(boost::exception const& e)
00671 {
00672 std::cerr << "Exception encountered starting Token Reception thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00673 exit(3);
00674 }
00675 TLOG(TLVL_INFO) << "Started Token Reception Thread";
00676 }
00677
00678 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00679 {
00680 std::string resultString;
00681
00682
00683 auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
00684 + ", table updates sent = " + std::to_string(table_update_count_)
00685 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00686 return tmpString;
00687 }
00688
00689 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00690 {
00691 std::ostringstream oss;
00692 oss << app_name << " statistics:" << std::endl;
00693
00694 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00695 if (mqPtr.get() != nullptr)
00696 {
00697 artdaq::MonitoredQuantityStats stats;
00698 mqPtr->getStats(stats);
00699 oss << " Table Update statistics: "
00700 << stats.recentSampleCount << " table updates sent at "
00701 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00702 << stats.recentDuration << " sec" << std::endl;
00703 oss << " Average times per table update: ";
00704 if (stats.recentSampleRate > 0.0)
00705 {
00706 oss << " elapsed time = "
00707 << (1.0 / stats.recentSampleRate) << " sec";
00708 }
00709 oss << ", avg table acknowledgement wait time = "
00710 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00711 }
00712
00713 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00714 if (mqPtr.get() != nullptr)
00715 {
00716 artdaq::MonitoredQuantityStats stats;
00717 mqPtr->getStats(stats);
00718 oss << " Received Token statistics: "
00719 << stats.recentSampleCount << " tokens received at "
00720 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00721 << stats.recentDuration << " sec" << std::endl;
00722 oss << " Average times per token: ";
00723 if (stats.recentSampleRate > 0.0)
00724 {
00725 oss << " elapsed time = "
00726 << (1.0 / stats.recentSampleRate) << " sec";
00727 }
00728 oss << ", input token wait time = "
00729 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00730 }
00731
00732 return oss.str();
00733 }
00734
00735 void artdaq::RoutingMasterCore::sendMetrics_()
00736 {
00737 if (metricMan)
00738 {
00739 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00740 if (mqPtr.get() != nullptr)
00741 {
00742 artdaq::MonitoredQuantityStats stats;
00743 mqPtr->getStats(stats);
00744 metricMan->sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
00745 metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00746 metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
00747 }
00748
00749 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00750 if (mqPtr.get() != nullptr)
00751 {
00752 artdaq::MonitoredQuantityStats stats;
00753 mqPtr->getStats(stats);
00754 metricMan->sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
00755 metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00756 metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
00757 }
00758 }
00759 }