00001 #include <sys/un.h>
00002 #include <sys/time.h>
00003 #include <sys/epoll.h>
00004 #include <arpa/inet.h>
00005 #include <netdb.h>
00006 #include <pthread.h>
00007 #include <sched.h>
00008 #include <algorithm>
00009
00010 #include "canvas/Utilities/Exception.h"
00011 #include "cetlib/exception.h"
00012
00013 #define TRACE_NAME "RoutingMasterCore" // include these 2 first -
00014 #include "artdaq/DAQdata/Globals.hh"
00015 #include "artdaq-core/Data/Fragment.hh"
00016 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00017
00018 #include "artdaq/Application/RoutingMasterCore.hh"
00019 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
00020 #include "artdaq/DAQdata/TCP_listen_fd.hh"
00021 #include "artdaq/DAQdata/TCPConnect.hh"
00022
00023 const std::string artdaq::RoutingMasterCore::
00024 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
00025 const std::string artdaq::RoutingMasterCore::
00026 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
00027
00028 artdaq::RoutingMasterCore::RoutingMasterCore(int rank, std::string name) :
00029 name_(name)
00030 , received_token_counter_()
00031 , shutdown_requested_(false)
00032 , stop_requested_(false)
00033 , pause_requested_(false)
00034 , token_socket_(-1)
00035 , table_socket_(-1)
00036 , ack_socket_(-1)
00037 {
00038 TLOG_DEBUG(name_) << "Constructor" << TLOG_ENDL;
00039 statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
00040 statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
00041 metricMan = &metricMan_;
00042 my_rank = rank;
00043 }
00044
00045 artdaq::RoutingMasterCore::~RoutingMasterCore()
00046 {
00047 TLOG_DEBUG(name_) << "Destructor" << TLOG_ENDL;
00048 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00049 }
00050
00051 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
00052 {
00053 TLOG_DEBUG(name_) << "initialize method called with "
00054 << "ParameterSet = \"" << pset.to_string()
00055 << "\"." << TLOG_ENDL;
00056
00057
00058 fhicl::ParameterSet daq_pset;
00059 try
00060 {
00061 daq_pset = pset.get<fhicl::ParameterSet>("daq");
00062 }
00063 catch (...)
00064 {
00065 TLOG_ERROR(name_)
00066 << "Unable to find the DAQ parameters in the initialization "
00067 << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
00068 return false;
00069 }
00070 try
00071 {
00072 policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
00073 }
00074 catch (...)
00075 {
00076 TLOG_ERROR(name_)
00077 << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
00078 return false;
00079 }
00080
00081
00082 fhicl::ParameterSet metric_pset;
00083 try
00084 {
00085 metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
00086 }
00087 catch (...) {}
00088
00089 if (metric_pset.is_empty())
00090 {
00091 TLOG_INFO(name_) << "No metric plugins appear to be defined" << TLOG_ENDL;
00092 }
00093 try
00094 {
00095 metricMan_.initialize(metric_pset, name_);
00096 }
00097 catch (...)
00098 {
00099 ExceptionHandler(ExceptionHandlerRethrow::no,
00100 "Error loading metrics in RoutingMasterCore::initialize()");
00101 }
00102
00103
00104 auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
00105 if (policy_plugin_spec.length() == 0)
00106 {
00107 TLOG_ERROR(name_)
00108 << "No fragment generator (parameter name = \"policy\") was "
00109 << "specified in the policy ParameterSet. The "
00110 << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." << TLOG_ENDL;
00111 return false;
00112 }
00113 try
00114 {
00115 policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
00116 }
00117 catch (...)
00118 {
00119 std::stringstream exception_string;
00120 exception_string << "Exception thrown during initialization of policy of type \""
00121 << policy_plugin_spec << "\"";
00122
00123 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
00124
00125 TLOG_DEBUG(name_) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() << TLOG_ENDL;
00126
00127 return false;
00128 }
00129
00130 rt_priority_ = daq_pset.get<int>("rt_priority", 0);
00131 sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
00132 num_receivers_ = policy_->GetReceiverCount();
00133
00134 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
00135 receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
00136
00137 auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
00138 routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
00139 max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
00140 current_table_interval_ms_ = max_table_update_interval_ms_;
00141 max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
00142 receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
00143 send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
00144 receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
00145 send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
00146 receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
00147
00148
00149 statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
00150
00151 shutdown_requested_.store(false);
00152 start_recieve_token_thread_();
00153 return true;
00154 }
00155
00156 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
00157 {
00158 stop_requested_.store(false);
00159 pause_requested_.store(false);
00160
00161 statsHelper_.resetStatistics();
00162 policy_->Reset();
00163
00164 metricMan_.do_start();
00165 run_id_ = id;
00166 table_update_count_ = 0;
00167 received_token_count_ = 0;
00168
00169 TLOG_DEBUG(name_) << "Started run " << std::to_string(run_id_.run()) << TLOG_ENDL;
00170 return true;
00171 }
00172
00173 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
00174 {
00175 TLOG_DEBUG(name_) << "Stopping run " << std::to_string(run_id_.run())
00176 << " after " << std::to_string(table_update_count_) << " table updates."
00177 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00178 stop_requested_.store(true);
00179 return true;
00180 }
00181
00182 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
00183 {
00184 TLOG_DEBUG(name_) << "Pausing run " << std::to_string(run_id_.run())
00185 << " after " << table_update_count_ << " table updates."
00186 << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
00187 pause_requested_.store(true);
00188 return true;
00189 }
00190
00191 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
00192 {
00193 TLOG_DEBUG(name_) << "Resuming run " << run_id_.run() << TLOG_ENDL;
00194 policy_->Reset();
00195 pause_requested_.store(false);
00196 metricMan_.do_start();
00197 return true;
00198 }
00199
00200 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
00201 {
00202 shutdown_requested_.store(true);
00203 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00204 policy_.reset(nullptr);
00205 metricMan_.shutdown();
00206 return true;
00207 }
00208
00209 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00210 {
00211 TLOG_DEBUG(name_) << "soft_initialize method called with "
00212 << "ParameterSet = \"" << pset.to_string()
00213 << "\"." << TLOG_ENDL;
00214 return initialize(pset, e, f);
00215 }
00216
00217 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
00218 {
00219 TLOG_DEBUG(name_) << "reinitialize method called with "
00220 << "ParameterSet = \"" << pset.to_string()
00221 << "\"." << TLOG_ENDL;
00222 return initialize(pset, e, f);
00223 }
00224
00225 size_t artdaq::RoutingMasterCore::process_event_table()
00226 {
00227 if (rt_priority_ > 0)
00228 {
00229 #pragma GCC diagnostic push
00230 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00231 sched_param s_param = {};
00232 s_param.sched_priority = rt_priority_;
00233 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
00234 TLOG_WARNING(name_) << "setting realtime priority failed" << TLOG_ENDL;
00235 #pragma GCC diagnostic pop
00236 }
00237
00238
00239
00240
00241 if (rt_priority_ > 0)
00242 {
00243 #pragma GCC diagnostic push
00244 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
00245 sched_param s_param = {};
00246 s_param.sched_priority = rt_priority_;
00247 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
00248 if (status != 0)
00249 {
00250 TLOG_ERROR(name_)
00251 << "Failed to set realtime priority to " << std::to_string(rt_priority_)
00252 << ", return code = " << status << TLOG_ENDL;
00253 }
00254 #pragma GCC diagnostic pop
00255 }
00256
00257
00258
00259 TLOG_DEBUG(name_) << "Sending initial table." << TLOG_ENDL;
00260 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00261 auto nextSendTime = startTime;
00262 double delta_time;
00263 while (true)
00264 {
00265 if (stop_requested_ || pause_requested_) { break; }
00266 startTime = artdaq::MonitoredQuantity::getCurrentTime();
00267
00268 if (startTime >= nextSendTime)
00269 {
00270 auto table = policy_->GetCurrentTable();
00271 if (table.size() > 0)
00272 {
00273 send_event_table(table);
00274 ++table_update_count_;
00275 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00276 statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
00277 TLOG_ARB(16, name_) << "process_fragments TABLE_UPDATES_STAT_KEY=" << std::to_string(delta_time) << TLOG_ENDL;
00278 }
00279 else
00280 {
00281 TLOG_WARNING(name_) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" << TLOG_ENDL;
00282 }
00283 auto max_tokens = policy_->GetMaxNumberOfTokens();
00284 if (max_tokens > 0)
00285 {
00286 auto frac = table.size() / static_cast<double>(max_tokens);
00287 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
00288 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
00289 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
00290 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
00291 }
00292 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
00293 TLOG_DEBUG(name_) << "current_table_interval_ms is now " << current_table_interval_ms_ << TLOG_ENDL;
00294 }
00295 else
00296 {
00297 usleep(current_table_interval_ms_ * 10);
00298 }
00299 }
00300
00301 metricMan_.do_stop();
00302
00303 return table_update_count_;
00304 }
00305
00306 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
00307 {
00308
00309 if (table_socket_ == -1)
00310 {
00311 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00312 if (table_socket_ < 0)
00313 {
00314 TLOG_ERROR(name_) << "I failed to create the socket for sending Data Requests! Errno: " << std::to_string(errno) << TLOG_ENDL;
00315 exit(1);
00316 }
00317 auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
00318 if (sts == -1)
00319 {
00320 TLOG_ERROR(name_) << "Unable to resolve table_update_address" << TLOG_ENDL;
00321 exit(1);
00322 }
00323
00324 auto yes = 1;
00325 if (receive_address_ != "localhost")
00326 {
00327 TLOG_DEBUG(name_) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ << TLOG_ENDL;
00328 struct in_addr addr;
00329 sts = ResolveHost(receive_address_.c_str(), addr);
00330 if (sts == -1)
00331 {
00332 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
00333 }
00334
00335 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00336 {
00337 throw art::Exception(art::errors::Configuration) <<
00338 "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
00339 exit(1);
00340 }
00341
00342 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
00343 {
00344 TLOG_ERROR("RequestSender") << "Unable to enable multicast loopback on table socket" << TLOG_ENDL;
00345 exit(1);
00346 }
00347 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
00348 {
00349 TLOG_ERROR(name_) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) << TLOG_ENDL;
00350 exit(1);
00351 }
00352 }
00353 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
00354 {
00355 TLOG_ERROR(name_) << "Cannot set request socket to broadcast. Errno: " << std::to_string(errno) << TLOG_ENDL;
00356 exit(1);
00357 }
00358 }
00359
00360
00361 if (ack_socket_ == -1)
00362 {
00363 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00364 if (ack_socket_ < 0)
00365 {
00366 throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
00367 exit(1);
00368 }
00369
00370 struct sockaddr_in si_me_request;
00371
00372 auto yes = 1;
00373 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00374 {
00375 throw art::Exception(art::errors::Configuration) <<
00376 "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
00377 exit(1);
00378 }
00379 memset(&si_me_request, 0, sizeof(si_me_request));
00380 si_me_request.sin_family = AF_INET;
00381 si_me_request.sin_port = htons(receive_acks_port_);
00382 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00383 if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
00384 {
00385 throw art::Exception(art::errors::Configuration) <<
00386 "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
00387 exit(1);
00388 }
00389 TLOG_DEBUG(name_) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ << TLOG_ENDL;
00390 }
00391
00392 auto acks = std::unordered_map<int, bool>();
00393 for (auto& r : sender_ranks_)
00394 {
00395 acks[r] = false;
00396 }
00397 auto counter = 0U;
00398 auto start_time = std::chrono::steady_clock::now();
00399 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00400 {
00401
00402 auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
00403 auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
00404
00405 TLOG_DEBUG(name_) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << TLOG_ENDL;
00406 if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00407 {
00408 TLOG_ERROR(name_) << "Error sending request message header" << TLOG_ENDL;
00409 }
00410 if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
00411 {
00412 TLOG_ERROR(name_) << "Error sending request message data" << TLOG_ENDL;
00413 }
00414
00415
00416
00417 auto first = packet[0].sequence_id;
00418 auto last = packet.rbegin()->sequence_id;
00419 TLOG_DEBUG(name_) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) << TLOG_ENDL;
00420
00421
00422 auto startTime = std::chrono::steady_clock::now();
00423 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
00424 {
00425 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
00426 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
00427 {
00428 if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
00429 {
00430 TLOG_ERROR(name_) << "Did not receive acks from all senders after resending table " << std::to_string(counter)
00431 << " times during the table_update_interval. Check the status of the senders!" << TLOG_ENDL;
00432 break;
00433 }
00434 TLOG_WARNING(name_) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" << TLOG_ENDL;
00435 break;
00436 }
00437
00438 TLOG_ARB(20, name_) << "send_event_table: Polling Request socket for new requests" << TLOG_ENDL;
00439 auto ready = true;
00440 while (ready)
00441 {
00442 detail::RoutingAckPacket buffer;
00443 if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
00444 {
00445 if (errno == EWOULDBLOCK || errno == EAGAIN)
00446 {
00447 TLOG_ARB(20, name_) << "send_event_table: No more ack datagrams on ack socket." << TLOG_ENDL;
00448 ready = false;
00449 }
00450 else
00451 {
00452 TLOG_ERROR(name_) << "An unexpected error occurred during ack packet receive" << TLOG_ENDL;
00453 exit(2);
00454 }
00455 }
00456 else
00457 {
00458 TLOG_DEBUG(name_) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id)
00459 << " and last= " << std::to_string(buffer.last_sequence_id) << TLOG_ENDL;
00460 if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
00461 {
00462 TLOG_DEBUG(name_) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." << TLOG_ENDL;
00463 acks[buffer.rank] = true;
00464 TLOG_DEBUG(name_) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
00465 << " acks outstanding" << TLOG_ENDL;
00466 }
00467 else
00468 {
00469 if (!acks.count(buffer.rank))
00470 {
00471 TLOG_ERROR(name_) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
00472 << " Cross-talk between RoutingMasters means there's a configuration error!" << TLOG_ENDL;
00473 }
00474 else
00475 {
00476 TLOG_WARNING(name_) << "Received acknowledgement from rank " << buffer.rank
00477 << " that had incorrect sequence ID information. Discarding." << TLOG_ENDL;
00478 }
00479 }
00480 }
00481 }
00482 usleep(table_ack_wait_time_ms * 1000 / 10);
00483 }
00484 }
00485 if (metricMan)
00486 {
00487 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
00488 metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
00489 }
00490 }
00491
00492 void artdaq::RoutingMasterCore::receive_tokens_()
00493 {
00494 while (!shutdown_requested_)
00495 {
00496 TLOG_DEBUG(name_) << "Receive Token loop start" << TLOG_ENDL;
00497 if (token_socket_ == -1)
00498 {
00499 TLOG_DEBUG(name_) << "Opening token listener socket" << TLOG_ENDL;
00500 token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
00501
00502 if (token_epoll_fd_ != -1) close(token_epoll_fd_);
00503 struct epoll_event ev;
00504 token_epoll_fd_ = epoll_create1(0);
00505 ev.events = EPOLLIN | EPOLLPRI;
00506 ev.data.fd = token_socket_;
00507 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
00508 {
00509 TLOG_ERROR(name_) << "Could not register listen socket to epoll fd" << TLOG_ENDL;
00510 exit(3);
00511 }
00512 }
00513 if (token_socket_ == -1 || token_epoll_fd_ == -1)
00514 {
00515 TLOG_DEBUG(name_) << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
00516 return;
00517 }
00518
00519 auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
00520 if (nfds == -1)
00521 {
00522 perror("epoll_wait");
00523 exit(EXIT_FAILURE);
00524 }
00525
00526 TLOG_DEBUG(name_) << "Received " << std::to_string(nfds) << " events" << TLOG_ENDL;
00527 for (auto n = 0; n < nfds; ++n)
00528 {
00529 if (receive_token_events_[n].data.fd == token_socket_)
00530 {
00531 TLOG_DEBUG(name_) << "Accepting new connection on token_socket" << TLOG_ENDL;
00532 sockaddr_in addr;
00533 socklen_t arglen = sizeof(addr);
00534 auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
00535
00536 if (conn_sock == -1)
00537 {
00538 perror("accept");
00539 exit(EXIT_FAILURE);
00540 }
00541
00542 receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
00543 struct epoll_event ev;
00544 ev.events = EPOLLIN | EPOLLET;
00545 ev.data.fd = conn_sock;
00546 if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
00547 {
00548 perror("epoll_ctl: conn_sock");
00549 exit(EXIT_FAILURE);
00550 }
00551 }
00552 else
00553 {
00554 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
00555 detail::RoutingToken buff;
00556 auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
00557 if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
00558 {
00559 TLOG_ERROR(name_) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << TLOG_ENDL;
00560 }
00561 else
00562 {
00563 TLOG_DEBUG(name_) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." << TLOG_ENDL;
00564 received_token_count_ += buff.new_slots_free;
00565 if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00566 {
00567 policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
00568 }
00569 else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00570 {
00571 if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
00572 received_token_counter_[buff.rank] += buff.new_slots_free;
00573 TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." << TLOG_ENDL;
00574 while (received_token_counter_[buff.rank] >= sender_ranks_.size())
00575 {
00576 TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
00577 << "... Sending token to policy" << TLOG_ENDL;
00578 policy_->AddReceiverToken(buff.rank, 1);
00579 received_token_counter_[buff.rank] -= sender_ranks_.size();
00580 }
00581 }
00582 }
00583 auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
00584 statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
00585
00586 }
00587 }
00588 }
00589 }
00590
00591 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
00592 {
00593 if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
00594 TLOG_INFO(name_) << "Starting Token Reception Thread" << TLOG_ENDL;
00595 ev_token_receive_thread_ = boost::thread(&RoutingMasterCore::receive_tokens_, this);
00596 }
00597
00598 std::string artdaq::RoutingMasterCore::report(std::string const&) const
00599 {
00600 std::string resultString;
00601
00602
00603 auto tmpString = name_ + " run number = " + std::to_string(run_id_.run())
00604 + ", table updates sent = " + std::to_string(table_update_count_)
00605 + ", Receiver tokens received = " + std::to_string(received_token_count_);
00606 return tmpString;
00607 }
00608
00609 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
00610 {
00611 std::ostringstream oss;
00612 oss << name_ << " statistics:" << std::endl;
00613
00614 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00615 if (mqPtr.get() != nullptr)
00616 {
00617 artdaq::MonitoredQuantityStats stats;
00618 mqPtr->getStats(stats);
00619 oss << " Table Update statistics: "
00620 << stats.recentSampleCount << " table updates sent at "
00621 << stats.recentSampleRate << " table updates/sec, , monitor window = "
00622 << stats.recentDuration << " sec" << std::endl;
00623 oss << " Average times per table update: ";
00624 if (stats.recentSampleRate > 0.0)
00625 {
00626 oss << " elapsed time = "
00627 << (1.0 / stats.recentSampleRate) << " sec";
00628 }
00629 oss << ", avg table acknowledgement wait time = "
00630 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
00631 }
00632
00633 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00634 if (mqPtr.get() != nullptr)
00635 {
00636 artdaq::MonitoredQuantityStats stats;
00637 mqPtr->getStats(stats);
00638 oss << " Received Token statistics: "
00639 << stats.recentSampleCount << " tokens received at "
00640 << stats.recentSampleRate << " tokens/sec, , monitor window = "
00641 << stats.recentDuration << " sec" << std::endl;
00642 oss << " Average times per token: ";
00643 if (stats.recentSampleRate > 0.0)
00644 {
00645 oss << " elapsed time = "
00646 << (1.0 / stats.recentSampleRate) << " sec";
00647 }
00648 oss << ", input token wait time = "
00649 << mqPtr->getRecentValueSum() << " sec" << std::endl;
00650 }
00651
00652 return oss.str();
00653 }
00654
00655 void artdaq::RoutingMasterCore::sendMetrics_()
00656 {
00657 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
00658 if (mqPtr.get() != nullptr)
00659 {
00660 artdaq::MonitoredQuantityStats stats;
00661 mqPtr->getStats(stats);
00662 metricMan_.sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00663 metricMan_.sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00664 metricMan_.sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
00665 }
00666
00667 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
00668 if (mqPtr.get() != nullptr)
00669 {
00670 artdaq::MonitoredQuantityStats stats;
00671 mqPtr->getStats(stats);
00672 metricMan_.sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
00673 metricMan_.sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
00674 metricMan_.sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
00675 }
00676 }