10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib_except/exception.h"
13 #include "artdaq/DAQdata/Globals.hh"
14 #define TRACE_NAME (app_name + "_RoutingManagerCore").c_str() // before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
18 #include "artdaq/Application/RoutingManagerCore.hh"
21 #include "artdaq/RoutingPolicies/makeRoutingManagerPolicy.hh"
31 : shutdown_requested_(false)
32 , stop_requested_(true)
33 , pause_requested_(false)
36 TLOG(TLVL_DEBUG + 32) <<
"Constructor";
44 TLOG(TLVL_DEBUG + 32) <<
"Destructor";
45 artdaq::StatisticsCollection::getInstance().requestStop();
46 token_receiver_->stopTokenReception(
true);
51 TLOG(TLVL_DEBUG + 32) <<
"initialize method called with "
52 <<
"ParameterSet = \"" << pset.to_string()
56 fhicl::ParameterSet daq_pset;
59 daq_pset = pset.get<fhicl::ParameterSet>(
"daq");
64 <<
"Unable to find the DAQ parameters in the initialization "
65 <<
"ParameterSet: \"" + pset.to_string() +
"\".";
69 if (daq_pset.has_key(
"rank"))
71 if (my_rank >= 0 && daq_pset.get<
int>(
"rank") != my_rank)
73 TLOG(TLVL_WARNING) <<
"Routing Manager rank specified at startup is different than rank specified at configure! Using rank received at configure!";
75 my_rank = daq_pset.get<
int>(
"rank");
79 TLOG(TLVL_ERROR) <<
"Routing Manager rank not specified at startup or in configuration! Aborting";
85 policy_pset_ = daq_pset.get<fhicl::ParameterSet>(
"policy");
90 <<
"Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() +
"\".";
96 token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>(
"token_receiver");
101 <<
"Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() +
"\".";
106 fhicl::ParameterSet metric_pset;
109 metric_pset = daq_pset.get<fhicl::ParameterSet>(
"metrics");
114 if (metric_pset.is_empty())
116 TLOG(TLVL_INFO) <<
"No metric plugins appear to be defined";
120 metricMan->initialize(metric_pset, app_name);
124 ExceptionHandler(ExceptionHandlerRethrow::no,
125 "Error loading metrics in RoutingManagerCore::initialize()");
129 auto policy_plugin_spec = policy_pset_.get<std::string>(
"policy",
"");
130 if (policy_plugin_spec.length() == 0)
133 <<
"No fragment generator (parameter name = \"policy\") was "
134 <<
"specified in the policy ParameterSet. The "
135 <<
"DAQ initialization PSet was \"" << daq_pset.to_string() <<
"\".";
144 std::stringstream exception_string;
145 exception_string <<
"Exception thrown during initialization of policy of type \""
146 << policy_plugin_spec <<
"\"";
148 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
150 TLOG(TLVL_DEBUG + 32) <<
"FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
155 rt_priority_ = daq_pset.get<
int>(
"rt_priority", 0);
156 max_table_update_interval_ms_ = daq_pset.get<
size_t>(
"table_update_interval_ms", 1000);
157 current_table_interval_ms_ = max_table_update_interval_ms_;
158 table_update_high_fraction_ = daq_pset.get<
double>(
"table_update_interval_high_frac", 0.75);
159 table_update_low_fraction_ = daq_pset.get<
double>(
"table_update_interval_low_frac", 0.5);
162 statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
165 token_receiver_ = std::make_unique<TokenReceiver>(token_receiver_pset_, policy_, max_table_update_interval_ms_);
166 token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
167 token_receiver_->startTokenReception();
168 token_receiver_->pauseTokenReception();
170 table_listen_port_ = daq_pset.get<
int>(
"table_update_port", 35556);
172 shutdown_requested_.store(
true);
173 if (listen_thread_ && listen_thread_->joinable())
175 listen_thread_->join();
177 shutdown_requested_.store(
false);
178 TLOG(TLVL_INFO) <<
"Starting Listener Thread";
182 listen_thread_ = std::make_unique<boost::thread>(&RoutingManagerCore::listen_,
this);
184 catch (
const boost::exception& e)
186 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
187 std::cerr <<
"Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
196 stop_requested_.store(
false);
197 pause_requested_.store(
false);
199 statsHelperPtr_->resetStatistics();
201 metricMan->do_start();
202 table_update_count_ = 0;
203 token_receiver_->setRunNumber(run_id_.run());
204 token_receiver_->resumeTokenReception();
206 TLOG(TLVL_INFO) <<
"Started run " << run_id_.run();
212 TLOG(TLVL_INFO) <<
"Stopping run " << run_id_.run()
213 <<
" after " << table_update_count_ <<
" table updates."
214 <<
" and " << token_receiver_->getReceivedTokenCount() <<
" received tokens.";
215 stop_requested_.store(
true);
216 token_receiver_->pauseTokenReception();
217 run_id_ = art::RunID::flushRun();
223 TLOG(TLVL_INFO) <<
"Pausing run " << run_id_.run()
224 <<
" after " << table_update_count_ <<
" table updates."
225 <<
" and " << token_receiver_->getReceivedTokenCount() <<
" received tokens.";
226 pause_requested_.store(
true);
232 TLOG(TLVL_DEBUG + 32) <<
"Resuming run " << run_id_.run();
233 pause_requested_.store(
false);
234 metricMan->do_start();
240 shutdown_requested_.store(
true);
241 if (listen_thread_ && listen_thread_->joinable())
243 listen_thread_->join();
245 token_receiver_->stopTokenReception();
247 metricMan->shutdown();
253 TLOG(TLVL_INFO) <<
"soft_initialize method called with "
254 <<
"ParameterSet = \"" << pset.to_string()
256 return initialize(pset, timeout, timestamp);
261 TLOG(TLVL_INFO) <<
"reinitialize method called with "
262 <<
"ParameterSet = \"" << pset.to_string()
264 return initialize(pset, timeout, timestamp);
269 if (rt_priority_ > 0)
271 #pragma GCC diagnostic push
272 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
273 sched_param s_param = {};
274 s_param.sched_priority = rt_priority_;
275 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param) != 0)
277 TLOG(TLVL_WARNING) <<
"setting realtime priority failed";
279 #pragma GCC diagnostic pop
285 if (rt_priority_ > 0)
287 #pragma GCC diagnostic push
288 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
289 sched_param s_param = {};
290 s_param.sched_priority = rt_priority_;
291 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
295 <<
"Failed to set realtime priority to " << rt_priority_
296 <<
", return code = " << status;
298 #pragma GCC diagnostic pop
303 TLOG(TLVL_DEBUG + 32) <<
"Sending initial table.";
304 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
305 auto nextSendTime = startTime;
307 while (!stop_requested_ && !pause_requested_)
312 startTime = artdaq::MonitoredQuantity::getCurrentTime();
314 if (startTime >= nextSendTime)
316 auto table = policy_->GetCurrentTable();
320 TLOG(TLVL_WARNING) <<
"Routing Policy generated Empty table for this routing interval (" << current_table_interval_ms_ <<
" ms)! This may indicate issues with the receivers, if it persists."
321 <<
" Next seqID=" << policy_->GetNextSequenceID() <<
", Policy held tokens=" << policy_->GetHeldTokenCount();
325 send_event_table(table);
326 ++table_update_count_;
327 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
328 statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
329 TLOG(TLVL_DEBUG + 34) <<
"process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
331 bool readyToReport = statsHelperPtr_->readyToReport();
334 std::string statString = buildStatisticsString_();
335 TLOG(TLVL_INFO) << statString;
340 auto max_tokens = policy_->GetMaxNumberOfTokens();
343 auto frac = policy_->GetTokensUsedSinceLastUpdate() /
static_cast<double>(max_tokens);
344 policy_->ResetTokensUsedSinceLastUpdate();
345 if (frac > table_update_high_fraction_) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
346 if (frac < table_update_low_fraction_) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
347 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
348 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
350 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
351 TLOG(TLVL_DEBUG + 32) <<
"current_table_interval_ms is now " << current_table_interval_ms_;
352 statsHelperPtr_->addSample(CURRENT_TABLE_INTERVAL_STAT_KEY, current_table_interval_ms_ / 1000.0);
356 usleep(current_table_interval_ms_ * 10);
361 TLOG(TLVL_DEBUG + 32) <<
"stop_requested_ is " << stop_requested_ <<
", pause_requested_ is " << pause_requested_ <<
", exiting process_event_table loop";
363 metricMan->do_stop();
368 std::lock_guard<std::mutex> lk(fd_mutex_);
369 for (
auto& dest : connected_fds_)
371 for (
auto& connected_fd : dest.second)
374 TLOG(TLVL_DEBUG + 32) <<
"Sending table information for " << header.
nEntries <<
" events to destination " << dest.first;
375 TRACE(16,
"headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((
unsigned long*)&header)[0], ((
unsigned long*)&header)[1], ((
unsigned long*)&packet[0])[0], ((
unsigned long*)&packet[0])[1]);
376 auto sts = write(connected_fd, &header,
sizeof(header));
377 if (sts !=
sizeof(header))
379 TLOG(TLVL_ERROR) <<
"Error sending routing header to fd " << connected_fd <<
", rank " << dest.first;
386 TLOG(TLVL_ERROR) <<
"Error sending routing table. sts=" << sts <<
"/" << packet.size() *
sizeof(
detail::RoutingPacketEntry) <<
", fd=" << connected_fd <<
", rank=" << dest.first;
395 std::string resultString;
398 auto tmpString = app_name +
" run number = " + std::to_string(run_id_.run()) +
", table updates sent = " + std::to_string(table_update_count_) +
", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
// Builds a human-readable statistics summary for this application by
// streaming text into a local std::ostringstream. Two sections are written,
// one for the TABLE_UPDATES_STAT_KEY quantity and one for the
// TOKENS_RECEIVED_STAT_KEY quantity; each section is emitted only when the
// corresponding MonitoredQuantity lookup is non-null.
// Declared to return std::string; the return statement itself is not among
// the lines visible in this listing.
// NOTE(review): both rate lines emit the text ", , monitor window" (doubled
// comma) -- looks like a leftover from a removed field; confirm before changing.
402 std::string artdaq::RoutingManagerCore::buildStatisticsString_()
const
404 std::ostringstream oss;
405 oss << app_name <<
" statistics:" << std::endl;
// Table-update section: recent sample count, rate and monitoring window.
407 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
408 if (mqPtr !=
nullptr)
410 artdaq::MonitoredQuantityStats stats;
411 mqPtr->getStats(stats);
412 oss <<
" Table Update statistics: "
413 << stats.recentSampleCount <<
" table updates sent at "
414 << stats.recentSampleRate <<
" table updates/sec, , monitor window = "
415 << stats.recentDuration <<
" sec" << std::endl;
416 oss <<
" Average times per table update: ";
// The per-update time is the reciprocal of the rate, so it is only printed
// for a strictly positive rate (avoids dividing by zero).
417 if (stats.recentSampleRate > 0.0)
419 oss <<
" elapsed time = "
420 << (1.0 / stats.recentSampleRate) <<
" sec";
// Received-token section: same layout, using the tokens-received quantity.
424 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
425 if (mqPtr !=
nullptr)
427 artdaq::MonitoredQuantityStats stats;
428 mqPtr->getStats(stats);
429 oss <<
" Received Token statistics: "
430 << stats.recentSampleCount <<
" tokens received at "
431 << stats.recentSampleRate <<
" tokens/sec, , monitor window = "
432 << stats.recentDuration <<
" sec" << std::endl;
433 oss <<
" Average times per token: ";
434 if (stats.recentSampleRate > 0.0)
436 oss <<
" elapsed time = "
437 << (1.0 / stats.recentSampleRate) <<
" sec";
// The recent value sum of the token quantity is reported as the wait time.
439 oss <<
", input token wait time = "
440 << mqPtr->getRecentValueSum() <<
" sec" << std::endl;
// Publishes statistics through metricMan for three monitored quantities:
// table updates, received tokens and the current table interval. Each group
// of metrics is sent only when the matching MonitoredQuantity lookup is
// non-null.
// NOTE(review): the integer passed after the unit string (1 or 3) is
// presumably the metric level -- confirm against metricMan's sendMetric.
446 void artdaq::RoutingManagerCore::sendMetrics_()
// Table updates: total count (LastPoint) and recent rate (Average).
450 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
451 if (mqPtr !=
nullptr)
453 artdaq::MonitoredQuantityStats stats;
454 mqPtr->getStats(stats);
455 metricMan->sendMetric(
"Table Update Count", stats.fullSampleCount,
"updates", 1, MetricMode::LastPoint);
456 metricMan->sendMetric(
"Table Update Rate", stats.recentSampleRate,
"updates/sec", 1, MetricMode::Average);
// Received tokens: count, rate and the recent value sum as total wait time.
// NOTE(review): the token count/rate metrics are labelled with the units
// "updates" and "updates/sec", same as the table metrics above -- possibly a
// copy-paste; confirm whether "tokens" was intended.
459 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
460 if (mqPtr !=
nullptr)
462 artdaq::MonitoredQuantityStats stats;
463 mqPtr->getStats(stats);
464 metricMan->sendMetric(
"Receiver Token Count", stats.fullSampleCount,
"updates", 1, MetricMode::LastPoint);
465 metricMan->sendMetric(
"Receiver Token Rate", stats.recentSampleRate,
"updates/sec", 1, MetricMode::Average);
466 metricMan->sendMetric(
"Total Receiver Token Wait Time", mqPtr->getRecentValueSum(),
"seconds", 3, MetricMode::Average);
// Current table interval: recent average value, reported in seconds.
// This null check goes through mqPtr.get(), unlike the two checks above.
469 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(CURRENT_TABLE_INTERVAL_STAT_KEY);
470 if (mqPtr.get() !=
nullptr)
472 artdaq::MonitoredQuantityStats stats;
473 mqPtr->getStats(stats);
474 metricMan->sendMetric(
"Table Update Interval", stats.recentValueAverage,
"s", 3, MetricMode::Average);
// Connection-listener routine: while shutdown_requested_ is false it waits
// for incoming connections on table_listen_port_, reads a
// detail::RoutingRequest "connect" message from each accepted socket, and
// records the socket in connected_fds_ (keyed by the sender's rank) and in the
// epoll set. When the loop ends it removes every recorded fd from epoll and
// clears connected_fds_.
// Several original lines (conditions, braces, error-path actions) are not
// present in this listing; the comments below describe only what is visible.
479 void artdaq::RoutingManagerCore::listen_()
// Creates the epoll instance; any guard around this call is not visible here.
483 epoll_fd_ = epoll_create1(0);
486 while (shutdown_requested_ ==
false)
488 TLOG(TLVL_DEBUG + 33) <<
"listen_: Listening/accepting new connections on port " << table_listen_port_;
491 TLOG(TLVL_DEBUG + 32) <<
"listen_: Opening listener";
496 TLOG(TLVL_DEBUG + 32) <<
"listen_: Error creating listen_fd!";
// Wait (bounded by tv) for the listening socket to become readable before
// calling accept. The value of tv is set in lines not shown here.
504 FD_SET(listen_fd, &rfds);
506 res = select(listen_fd + 1, &rfds, static_cast<fd_set*>(
nullptr), static_cast<fd_set*>(
nullptr), &tv);
511 socklen_t arglen =
sizeof(un);
513 TLOG(TLVL_DEBUG + 32) <<
"listen_: Calling accept";
514 fd = accept(listen_fd, reinterpret_cast<sockaddr*>(&un), &arglen);
515 TLOG(TLVL_DEBUG + 32) <<
"listen_: Done with accept";
517 TLOG(TLVL_DEBUG + 32) <<
"listen_: Reading connect message";
// tv is installed as the receive timeout (SO_RCVTIMEO) on the accepted socket
// before the connect message is read.
518 socklen_t lenlen =
sizeof(tv);
520 setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, lenlen);
521 detail::RoutingRequest rch;
// The read of the connect message is timed purely for the debug log below.
522 uint64_t mark_us = TimeUtils::gettimeofday_us();
523 sts = read(fd, &rch,
sizeof(rch));
524 uint64_t delta_us = TimeUtils::gettimeofday_us() - mark_us;
525 TLOG(TLVL_DEBUG + 32) <<
"listen_: Read of connect message took " << delta_us <<
" microseconds.";
// A read that did not return exactly sizeof(rch) bytes is logged as a bad
// header length; what else happens on that path is not visible here.
526 if (sts !=
sizeof(rch))
528 TLOG(TLVL_DEBUG + 32) <<
"listen_: Wrong message header length received!";
// The message must carry ROUTING_MAGIC and have mode Connect.
// NOTE(review): the log text mentions only the magic bytes, although a wrong
// mode triggers the same branch.
534 if (rch.header != ROUTING_MAGIC || !(rch.mode == detail::RoutingRequest::RequestMode::Connect))
536 TLOG(TLVL_DEBUG + 32) <<
"listen_: Wrong magic bytes in header! rch.header: " << std::hex << rch.header;
// Register the new connection while holding fd_mutex_.
542 std::lock_guard<std::mutex> lk(fd_mutex_);
543 connected_fds_[rch.rank].insert(fd);
// The fields of ev are filled in on lines missing from this listing.
544 struct epoll_event ev;
547 epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ev);
548 TLOG(TLVL_INFO) <<
"listen_: New fd is " << fd <<
" for table receiver rank " << rch.rank;
552 TLOG(TLVL_DEBUG + 34) <<
"listen_: No connections in timeout interval!";
556 TLOG(TLVL_INFO) <<
"listen_: Shutting down connection listener";
// Teardown under fd_mutex_: deregister every recorded fd from epoll, then
// drop all entries. Note the outer loop variable is named fd_set, which
// shadows the fd_set type used in the select call above.
561 std::lock_guard<std::mutex> lk(fd_mutex_);
562 for (
auto& fd_set : connected_fds_)
564 for (
auto& fd : fd_set.second)
566 epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd,
nullptr);
570 connected_fds_.clear();
// Services requests arriving on the sockets registered with epoll_fd_: waits
// for events, reads a detail::RoutingRequest from each socket reporting
// EPOLLIN, and acts on the request mode. For RequestMode::Request it asks
// policy_ for a route and writes back a RoutingPacketHeader, followed by one
// RoutingPacketEntry when the reply's sequence_id matches the request. For
// RequestMode::Disconnect it forgets and closes the socket.
// Several original lines (loop headers, declarations such as sts, braces) are
// not present in this listing; the comments describe only what is visible.
574 void artdaq::RoutingManagerCore::receive_()
// Creates the epoll instance; any guard around this call is not visible here.
578 epoll_fd_ = epoll_create1(0);
// At most 10 events are fetched per epoll_wait call.
580 std::vector<epoll_event> received_events(10);
// fd_mutex_ is held while waiting; the last epoll_wait argument is a timeout
// of 1 millisecond.
585 std::lock_guard<std::mutex> lk(fd_mutex_);
586 nfds = epoll_wait(epoll_fd_, &received_events[0], received_events.size(), 1);
589 TLOG(TLVL_ERROR) <<
"Error status received from epoll_wait, exiting with code " << EXIT_FAILURE <<
", errno=" << errno <<
" (" << strerror(errno) <<
")";
590 perror(
"epoll_wait");
596 TLOG(TLVL_DEBUG + 35) <<
"Received " << nfds <<
" events on table sockets";
598 for (
auto n = 0; n < nfds; ++n)
604 if ((received_events[n].events & EPOLLIN) != 0)
606 detail::RoutingRequest buff;
// NOTE(review): the read length is sizeof(RoutingRequest) - sts but the
// destination is always the start of buff. sts is declared on a line not
// shown; if it is a running byte count for partial reads, a continued read
// would overwrite the beginning of buff -- confirm.
607 auto stss = read(received_events[n].data.fd, &buff,
sizeof(detail::RoutingRequest) - sts);
611 TLOG(TLVL_INFO) <<
"Received 0-size request from " << find_fd_(received_events[n].data.fd);
// EAGAIN on a failed read is treated as "nothing more to read", not an error.
614 else if (stss < 0 && errno == EAGAIN)
616 TLOG(TLVL_DEBUG + 32) <<
"No more requests from this rank. Continuing poll loop.";
// NOTE(review): this error message prints sts rather than stss (the value
// returned by the read above) -- confirm which was intended.
621 TLOG(TLVL_ERROR) <<
"Error reading from request socket: sts=" << sts <<
", errno=" << errno <<
" (" << strerror(errno) <<
")";
// NOTE(review): EPOLL_CTL_DEL is issued after close() on the same descriptor;
// on Linux that call would be expected to fail with EBADF -- confirm the
// intended order.
622 close(received_events[n].data.fd);
623 epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd,
nullptr);
// A full-size request whose header is not ROUTING_MAGIC is logged as invalid.
626 else if (sts ==
sizeof(detail::RoutingRequest) && buff.header != ROUTING_MAGIC)
628 TLOG(TLVL_ERROR) <<
"Received invalid request from " << find_fd_(received_events[n].data.fd) <<
" sts=" << sts <<
", header=" << std::hex << buff.header;
631 else if (sts ==
sizeof(detail::RoutingRequest))
636 detail::RoutingPacketEntry reply;
// Disconnect: drop the fd from the sender's rank entry, then close it.
// The same close-before-EPOLL_CTL_DEL ordering as above applies here.
640 case detail::RoutingRequest::RequestMode::Disconnect:
641 connected_fds_[buff.rank].erase(received_events[n].data.fd);
642 close(received_events[n].data.fd);
643 epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd,
nullptr);
// Request: a reply whose sequence_id equals the requested one is sent as a
// header constructed with 1 followed by the entry; the branch logging
// "Unable to route request" sends only a header constructed with 0.
// The return values of these write() calls are not checked.
646 case detail::RoutingRequest::RequestMode::Request:
647 reply = policy_->GetRouteForSequenceID(buff.sequence_id, buff.rank);
648 if (reply.sequence_id == buff.sequence_id)
650 TLOG(TLVL_DEBUG + 33) <<
"Reply to request from " << buff.rank <<
" with route to " << reply.destination_rank <<
" for sequence ID " << buff.sequence_id;
651 detail::RoutingPacketHeader hdr(1);
652 write(received_events[n].data.fd, &hdr,
sizeof(hdr));
653 write(received_events[n].data.fd, &reply,
sizeof(detail::RoutingPacketEntry));
657 TLOG(TLVL_DEBUG + 33) <<
"Unable to route request, replying with empty RoutingPacket";
658 detail::RoutingPacketHeader hdr(0);
659 write(received_events[n].data.fd, &hdr,
sizeof(hdr));
// Logs the raw event mask together with the rank that find_fd_ reports for
// the socket.
670 TLOG(TLVL_DEBUG + 32) <<
"Received event mask " << received_events[n].events <<
" from table socket rank " << find_fd_(received_events[n].data.fd);
// Scans connected_fds_ for an entry whose fd collection contains the given
// descriptor. Declared to return int; the return statements (for both the
// found and not-found cases) are not among the lines visible in this listing.
// Callers in this file stream the result into log messages as the rank that
// owns a socket.
677 int artdaq::RoutingManagerCore::find_fd_(
int fd)
const
679 for (
auto& rank : connected_fds_)
// count() != 0 means this entry's fd collection holds the descriptor.
681 if (rank.second.count(fd) != 0)
This class manages MonitoredQuantity instances for the *Core classes.
bool shutdown(uint64_t)
Shuts Down the RoutingManagerCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingManagerCore.
A row of the Routing Table.
static std::string RequestModeToString(RequestMode m)
Convert a RequestMode enumeration value to string.
static const std::string TABLE_UPDATES_STAT_KEY
Key for Table Update count MonitoredQuantity.
bool pause(uint64_t, uint64_t)
Pauses the RoutingManagerCore.
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
Multiple sources sending to a single destination. RoutingManager pushes table updates to all senders...
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
Multiple sources sending to a single destination. Table updates are triggered by senders requesting r...
bool stop(uint64_t, uint64_t)
Stops the RoutingManagerCore.
void process_event_table()
Main loop of the RoutingManagerCore. Determines when to send the next table update, asks the RoutingManagerPolicy for the table to send, and sends it.
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingManagerCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
std::string report(std::string const &) const
Send a report on the current status of the RoutingManagerCore.
bool resume(uint64_t, uint64_t)
Resumes the RoutingManagerCore.
void send_event_table(detail::RoutingPacket packet)
Sends a detail::RoutingPacket to the table receivers.
RoutingManagerCore()
RoutingManagerCore Constructor.
static const std::string CURRENT_TABLE_INTERVAL_STAT_KEY
Key for the Current Table Interval MonitoredQuantity.
std::shared_ptr< RoutingManagerPolicy > makeRoutingManagerPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingManagerPolicy plugin.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingManagerCore.