00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib/exception.h"
00012
00013 #define TRACE_NAME "RoutingMasterCore" // include these 2 first -
00014 #include "artdaq/DAQdata/Globals.hh"
00015 #include "artdaq-core/Data/Fragment.hh"
00016 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00017
00018 #include "artdaq/Application/RoutingMasterCore.hh"
00019 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00020 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00021 #include "artdaq/DAQdata/TCPConnect.hh"
00022
00023 const std::string artdaq::RoutingMasterCore::
00024 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
00025 const std::string artdaq::RoutingMasterCore::
00026 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00027
00028 artdaq::RoutingMasterCore::RoutingMasterCore()
00029 : received_token_counter_()
00030 , shutdown_requested_(false)
00031 , stop_requested_(false)
00032 , pause_requested_(false)
00033 , token_socket_(-1)
00034 , table_socket_(-1)
00035 , ack_socket_(-1)
00036 {
00037 TLOG_DEBUG(app_name) << "Constructor" << TLOG_ENDL;
00038 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00039 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00040 metricMan = &metricMan_;
00041 }
00042
00043 artdaq::RoutingMasterCore::~RoutingMasterCore()
00044 {
00045 TLOG_DEBUG(app_name) << "Destructor" << TLOG_ENDL;
00046 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00047 }
00048
00049 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00050 {
00051 TLOG_DEBUG(app_name) << "initialize method called with "
00052 << "ParameterSet = \"" << pset.to_string()
00053 << "\"." << TLOG_ENDL;
00054
00055
00056 fhicl::ParameterSet daq_pset;
00057 try
00058 {
00059 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00060 }
00061 catch (...)
00062 {
00063 TLOG_ERROR(app_name)
00064 << "Unable to find the DAQ parameters in the initialization "
00065 << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
00066 return false;
00067 }
00068 try
00069 {
00070 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00071 }
00072 catch (...)
00073 {
00074 TLOG_ERROR(app_name)
00075 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
00076 return false;
00077 }
00078
00079
00080 fhicl::ParameterSet metric_pset;
00081 try
00082 {
00083 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00084 }
00085 catch (...) {}
00086
00087 if (metric_pset.is_empty())
00088 {
00089 TLOG_INFO(app_name) << "No metric plugins appear to be defined" << TLOG_ENDL;
00090 }
00091 try
00092 {
00093 metricMan_.initialize(metric_pset, app_name);
00094 }
00095 catch (...)
00096 {
00097 ExceptionHandler(ExceptionHandlerRethrow::no,
00098 "Error loading metrics in RoutingMasterCore::initialize()");
00099 }
00100
00101
00102 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00103 if (policy_plugin_spec.length() == 0)
00104 {
00105 TLOG_ERROR(app_name)
00106 << "No fragment generator (parameter name = \"policy\") was "
00107 << "specified in the policy ParameterSet. The "
00108 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." << TLOG_ENDL;
00109 return false;
00110 }
00111 try
00112 {
00113 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00114 }
00115 catch (...)
00116 {
00117 std::stringstream exception_string;
00118 exception_string << "Exception thrown during initialization of policy of type \""
00119 << policy_plugin_spec << "\"";
00120
00121 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00122
00123 TLOG_DEBUG(app_name) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() << TLOG_ENDL;
00124
00125 return false;
00126 }
00127
00128 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00129 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00130 num_receivers_ = policy_->GetReceiverCount();
00131
00132 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00133 receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
00134
00135 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00136 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00137 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00138 current_table_interval_ms_ = max_table_update_interval_ms_;
00139 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00140 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00141 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00142 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00143 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00144 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00145
00146
00147 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00148
00149 shutdown_requested_.store(false);
00150 start_recieve_token_thread_();
00151 return true;
00152 }
00153
00154 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00155 {
00156 stop_requested_.store(false);
00157 pause_requested_.store(false);
00158
00159 statsHelper_.resetStatistics();
00160 policy_->Reset();
00161
00162 metricMan_.do_start();
00163 run_id_ = id;
00164 table_update_count_ = 0;
00165 received_token_count_ = 0;
00166
00167 TLOG_DEBUG(app_name) << "Started run " << std::to_string(run_id_.run()) << TLOG_ENDL;
00168 return true;
00169 }
00170
00171 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00172 {
00173 TLOG_DEBUG(app_name) << "Stopping run " << std::to_string(run_id_.run())
00174 << " after " << std::to_string(table_update_count_) << " table updates."
00175 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00176 stop_requested_.store(true);
00177 return true;
00178 }
00179
00180 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00181 {
00182 TLOG_DEBUG(app_name) << "Pausing run " << std::to_string(run_id_.run())
00183 << " after " << table_update_count_ << " table updates."
00184 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00185 pause_requested_.store(true);
00186 return true;
00187 }
00188
00189 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00190 {
00191 TLOG_DEBUG(app_name) << "Resuming run " << run_id_.run() << TLOG_ENDL;
00192 policy_->Reset();
00193 pause_requested_.store(false);
00194 metricMan_.do_start();
00195 return true;
00196 }
00197
00198 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00199 {
00200 shutdown_requested_.store(true);
00201 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00202 policy_.reset(nullptr);
00203 metricMan_.shutdown();
00204 return true;
00205 }
00206
00207 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00208 {
00209 TLOG_DEBUG(app_name) << "soft_initialize method called with "
00210 << "ParameterSet = \"" << pset.to_string()
00211 << "\"." << TLOG_ENDL;
00212 return initialize(pset, e, f);
00213 }
00214
00215 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00216 {
00217 TLOG_DEBUG(app_name) << "reinitialize method called with "
00218 << "ParameterSet = \"" << pset.to_string()
00219 << "\"." << TLOG_ENDL;
00220 return initialize(pset, e, f);
00221 }
00222
00223 size_t artdaq::RoutingMasterCore::process_event_table()
00224 {
00225 if (rt_priority_ > 0)
00226 {
00227 #pragma GCC diagnostic push
00228 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00229 sched_param s_param = {};
00230 s_param.sched_priority = rt_priority_;
00231 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00232 TLOG_WARNING(app_name) << "setting realtime priority failed" << TLOG_ENDL;
00233 #pragma GCC diagnostic pop
00234 }
00235
00236
00237
00238
00239 if (rt_priority_ > 0)
00240 {
00241 #pragma GCC diagnostic push
00242 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00243 sched_param s_param = {};
00244 s_param.sched_priority = rt_priority_;
00245 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00246 if (status != 0)
00247 {
00248 TLOG_ERROR(app_name)
00249 << "Failed to set realtime priority to " << std::to_string(rt_priority_)
00250 << ", return code = " << status << TLOG_ENDL;
00251 }
00252 #pragma GCC diagnostic pop
00253 }
00254
00255
00256
00257 TLOG_DEBUG(app_name) << "Sending initial table." << TLOG_ENDL;
00258 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00259 auto nextSendTime = startTime;
00260 double delta_time;
00261 while (true)
00262 {
00263 if (stop_requested_ || pause_requested_) { break; }
00264 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00265
00266 if (startTime >= nextSendTime)
00267 {
00268 auto table = policy_->GetCurrentTable();
00269 if (table.size() > 0)
00270 {
00271 send_event_table(table);
00272 ++table_update_count_;
00273 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00274 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00275 TLOG_ARB(16, app_name) << "process_fragments TABLE_UPDATES_STAT_KEY=" << std::to_string(delta_time) << TLOG_ENDL;
00276 }
00277 else
00278 {
00279 TLOG_WARNING(app_name) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" << TLOG_ENDL;
00280 }
00281 auto max_tokens = policy_->GetMaxNumberOfTokens();
00282 if (max_tokens > 0)
00283 {
00284 auto frac = table.size() / static_cast<double>(max_tokens);
00285 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00286 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00287 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00288 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00289 }
00290 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00291 TLOG_DEBUG(app_name) << "current_table_interval_ms is now " << current_table_interval_ms_ << TLOG_ENDL;
00292 }
00293 else
00294 {
00295 usleep(current_table_interval_ms_ * 10);
00296 }
00297 }
00298
00299 metricMan_.do_stop();
00300
00301 return table_update_count_;
00302 }
00303
00304 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00305 {
00306
00307 if (table_socket_ == -1)
00308 {
00309 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00310 if (table_socket_ < 0)
00311 {
00312 TLOG_ERROR(app_name) << "I failed to create the socket for sending Data Requests! Errno: " << std::to_string(errno) << TLOG_ENDL;
00313 exit(1);
00314 }
00315 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00316 if (sts == -1)
00317 {
00318 TLOG_ERROR(app_name) << "Unable to resolve table_update_address" << TLOG_ENDL;
00319 exit(1);
00320 }
00321
00322 auto yes = 1;
00323 if (receive_address_ != "localhost")
00324 {
00325 TLOG_DEBUG(app_name) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ << TLOG_ENDL;
00326 struct in_addr addr;
00327 sts = ResolveHost(receive_address_.c_str(), addr);
00328 if (sts == -1)
00329 {
00330 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00331 }
00332
00333 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00334 {
00335 throw art::Exception(art::errors::Configuration) <<
00336 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00337 exit(1);
00338 }
00339
00340 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
00341 {
00342 TLOG_ERROR("RequestSender") << "Unable to enable multicast loopback on table socket" << TLOG_ENDL;
00343 exit(1);
00344 }
00345 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00346 {
00347 TLOG_ERROR(app_name) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) << TLOG_ENDL;
00348 exit(1);
00349 }
00350 }
00351 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00352 {
00353 TLOG_ERROR(app_name) << "Cannot set request socket to broadcast. Errno: " << std::to_string(errno) << TLOG_ENDL;
00354 exit(1);
00355 }
00356 }
00357
00358
00359 if (ack_socket_ == -1)
00360 {
00361 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00362 if (ack_socket_ < 0)
00363 {
00364 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00365 exit(1);
00366 }
00367
00368 struct sockaddr_in si_me_request;
00369
00370 auto yes = 1;
00371 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00372 {
00373 throw art::Exception(art::errors::Configuration) <<
00374 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00375 exit(1);
00376 }
00377 memset(&si_me_request, 0, sizeof(si_me_request));
00378 si_me_request.sin_family = AF_INET;
00379 si_me_request.sin_port = htons(receive_acks_port_);
00380 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00381 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00382 {
00383 throw art::Exception(art::errors::Configuration) <<
00384 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00385 exit(1);
00386 }
00387 TLOG_DEBUG(app_name) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ << TLOG_ENDL;
00388 }
00389
00390 auto acks = std::unordered_map<int, bool>();
00391 for (auto& r : sender_ranks_)
00392 {
00393 acks[r] = false;
00394 }
00395 auto counter = 0U;
00396 auto start_time = std::chrono::steady_clock::now();
00397 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00398 {
00399
00400 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00401 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00402
00403 TLOG_DEBUG(app_name) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << TLOG_ENDL;
00404 if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00405 {
00406 TLOG_ERROR(app_name) << "Error sending request message header" << TLOG_ENDL;
00407 }
00408 if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00409 {
00410 TLOG_ERROR(app_name) << "Error sending request message data" << TLOG_ENDL;
00411 }
00412
00413
00414
00415 auto first = packet[0].sequence_id;
00416 auto last = packet.rbegin()->sequence_id;
00417 TLOG_DEBUG(app_name) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) << TLOG_ENDL;
00418
00419
00420 auto startTime = std::chrono::steady_clock::now();
00421 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00422 {
00423 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00424 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
00425 {
00426 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00427 {
00428 TLOG_ERROR(app_name) << "Did not receive acks from all senders after resending table " << std::to_string(counter)
00429 << " times during the table_update_interval. Check the status of the senders!" << TLOG_ENDL;
00430 break;
00431 }
00432 TLOG_WARNING(app_name) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" << TLOG_ENDL;
00433 break;
00434 }
00435
00436 TLOG_ARB(20, app_name) << "send_event_table: Polling Request socket for new requests" << TLOG_ENDL;
00437 auto ready = true;
00438 while (ready)
00439 {
00440 detail::RoutingAckPacket buffer;
00441 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00442 {
00443 if (errno == EWOULDBLOCK || errno == EAGAIN)
00444 {
00445 TLOG_ARB(20, app_name) << "send_event_table: No more ack datagrams on ack socket." << TLOG_ENDL;
00446 ready = false;
00447 }
00448 else
00449 {
00450 TLOG_ERROR(app_name) << "An unexpected error occurred during ack packet receive" << TLOG_ENDL;
00451 exit(2);
00452 }
00453 }
00454 else
00455 {
00456 TLOG_DEBUG(app_name) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id)
00457 << " and last= " << std::to_string(buffer.last_sequence_id) << TLOG_ENDL;
00458 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00459 {
00460 TLOG_DEBUG(app_name) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." << TLOG_ENDL;
00461 acks[buffer.rank] = true;
00462 TLOG_DEBUG(app_name) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
00463 << " acks outstanding" << TLOG_ENDL;
00464 }
00465 else
00466 {
00467 if (!acks.count(buffer.rank))
00468 {
00469 TLOG_ERROR(app_name) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
00470 << " Cross-talk between RoutingMasters means there's a configuration error!" << TLOG_ENDL;
00471 }
00472 else
00473 {
00474 TLOG_WARNING(app_name) << "Received acknowledgement from rank " << buffer.rank
00475 << " that had incorrect sequence ID information. Discarding." << TLOG_ENDL;
00476 }
00477 }
00478 }
00479 }
00480 usleep(table_ack_wait_time_ms * 1000 / 10);
00481 }
00482 }
00483 if (metricMan)
00484 {
00485 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
00486 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
00487 }
00488 }
00489
00490 void artdaq::RoutingMasterCore::receive_tokens_()
00491 {
00492 while (!shutdown_requested_)
00493 {
00494 TLOG_DEBUG(app_name) << "Receive Token loop start" << TLOG_ENDL;
00495 if (token_socket_ == -1)
00496 {
00497 TLOG_DEBUG(app_name) << "Opening token listener socket" << TLOG_ENDL;
00498 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00499
00500 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00501 struct epoll_event ev;
00502 token_epoll_fd_ = epoll_create1(0);
00503 ev.events = EPOLLIN | EPOLLPRI;
00504 ev.data.fd = token_socket_;
00505 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00506 {
00507 TLOG_ERROR(app_name) << "Could not register listen socket to epoll fd" << TLOG_ENDL;
00508 exit(3);
00509 }
00510 }
00511 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00512 {
00513 TLOG_DEBUG(app_name) << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
00514 return;
00515 }
00516
00517 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00518 if (nfds == -1)
00519 {
00520 perror("epoll_wait");
00521 exit(EXIT_FAILURE);
00522 }
00523
00524 TLOG_DEBUG(app_name) << "Received " << std::to_string(nfds) << " events" << TLOG_ENDL;
00525 for (auto n = 0; n < nfds; ++n)
00526 {
00527 if (receive_token_events_[n].data.fd == token_socket_)
00528 {
00529 TLOG_DEBUG(app_name) << "Accepting new connection on token_socket" << TLOG_ENDL;
00530 sockaddr_in addr;
00531 socklen_t arglen = sizeof(addr);
00532 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00533
00534 if (conn_sock == -1)
00535 {
00536 perror("accept");
00537 exit(EXIT_FAILURE);
00538 }
00539
00540 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00541 struct epoll_event ev;
00542 ev.events = EPOLLIN | EPOLLET;
00543 ev.data.fd = conn_sock;
00544 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
00545 {
00546 perror("epoll_ctl: conn_sock");
00547 exit(EXIT_FAILURE);
00548 }
00549 }
00550 else
00551 {
00552 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00553 detail::RoutingToken buff;
00554 auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
00555 if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
00556 {
00557 TLOG_ERROR(app_name) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << TLOG_ENDL;
00558 }
00559 else
00560 {
00561 TLOG_DEBUG(app_name) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." << TLOG_ENDL;
00562 received_token_count_ += buff.new_slots_free;
00563 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00564 {
00565 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00566 }
00567 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00568 {
00569 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00570 received_token_counter_[buff.rank] += buff.new_slots_free;
00571 TLOG_DEBUG(app_name) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." << TLOG_ENDL;
00572 while (received_token_counter_[buff.rank] >= sender_ranks_.size())
00573 {
00574 TLOG_DEBUG(app_name) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00575 << "... Sending token to policy" << TLOG_ENDL;
00576 policy_->AddReceiverToken(buff.rank, 1);
00577 received_token_counter_[buff.rank] -= sender_ranks_.size();
00578 }
00579 }
00580 }
00581 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00582 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00583
00584 }
00585 }
00586 }
00587 }
00588
00589 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00590 {
00591 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00592 TLOG_INFO(app_name) << "Starting Token Reception Thread" << TLOG_ENDL;
00593 ev_token_receive_thread_ = boost::thread(&RoutingMasterCore::receive_tokens_, this);
00594 }
00595
00596 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00597 {
00598 std::string resultString;
00599
00600
00601 auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
00602 + ", table updates sent = " + std::to_string(table_update_count_)
00603 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00604 return tmpString;
00605 }
00606
00607 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00608 {
00609 std::ostringstream oss;
00610 oss << app_name << " statistics:" << std::endl;
00611
00612 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00613 if (mqPtr.get() != nullptr)
00614 {
00615 artdaq::MonitoredQuantityStats stats;
00616 mqPtr->getStats(stats);
00617 oss << " Table Update statistics: "
00618 << stats.recentSampleCount << " table updates sent at "
00619 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00620 << stats.recentDuration << " sec" << std::endl;
00621 oss << " Average times per table update: ";
00622 if (stats.recentSampleRate > 0.0)
00623 {
00624 oss << " elapsed time = "
00625 << (1.0 / stats.recentSampleRate) << " sec";
00626 }
00627 oss << ", avg table acknowledgement wait time = "
00628 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00629 }
00630
00631 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00632 if (mqPtr.get() != nullptr)
00633 {
00634 artdaq::MonitoredQuantityStats stats;
00635 mqPtr->getStats(stats);
00636 oss << " Received Token statistics: "
00637 << stats.recentSampleCount << " tokens received at "
00638 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00639 << stats.recentDuration << " sec" << std::endl;
00640 oss << " Average times per token: ";
00641 if (stats.recentSampleRate > 0.0)
00642 {
00643 oss << " elapsed time = "
00644 << (1.0 / stats.recentSampleRate) << " sec";
00645 }
00646 oss << ", input token wait time = "
00647 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00648 }
00649
00650 return oss.str();
00651 }
00652
00653 void artdaq::RoutingMasterCore::sendMetrics_()
00654 {
00655 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00656 if (mqPtr.get() != nullptr)
00657 {
00658 artdaq::MonitoredQuantityStats stats;
00659 mqPtr->getStats(stats);
00660 metricMan_.sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00661 metricMan_.sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00662 metricMan_.sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
00663 }
00664
00665 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00666 if (mqPtr.get() != nullptr)
00667 {
00668 artdaq::MonitoredQuantityStats stats;
00669 mqPtr->getStats(stats);
00670 metricMan_.sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00671 metricMan_.sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00672 metricMan_.sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
00673 }
00674 }