1 #include "TRACE/tracemf.h"
2 #include "artdaq/DAQdata/Globals.hh"
3 #define TRACE_NAME (app_name + "_RoutingManagerCore").c_str() // before trace.h
5 #include "artdaq/Application/RoutingManagerCore.hh"
7 #include "artdaq-core/Utilities/ExceptionHandler.hh"
9 #include "artdaq/RoutingPolicies/makeRoutingManagerPolicy.hh"
11 #include "fhiclcpp/ParameterSet.h"
13 #include <arpa/inet.h>
30 : shutdown_requested_(false)
31 , stop_requested_(true)
32 , pause_requested_(false)
35 TLOG(TLVL_DEBUG + 32) <<
"Constructor";
43 TLOG(TLVL_DEBUG + 32) <<
"Destructor";
44 artdaq::StatisticsCollection::getInstance().requestStop();
45 token_receiver_->stopTokenReception(
true);
50 TLOG(TLVL_DEBUG + 32) <<
"initialize method called with "
51 <<
"ParameterSet = \"" << pset.to_string()
55 fhicl::ParameterSet daq_pset;
58 daq_pset = pset.get<fhicl::ParameterSet>(
"daq");
63 <<
"Unable to find the DAQ parameters in the initialization "
64 <<
"ParameterSet: \"" + pset.to_string() +
"\".";
68 if (daq_pset.has_key(
"rank"))
70 if (my_rank >= 0 && daq_pset.get<
int>(
"rank") != my_rank)
72 TLOG(TLVL_WARNING) <<
"Routing Manager rank specified at startup is different than rank specified at configure! Using rank received at configure!";
74 my_rank = daq_pset.get<
int>(
"rank");
78 TLOG(TLVL_ERROR) <<
"Routing Manager rank not specified at startup or in configuration! Aborting";
84 policy_pset_ = daq_pset.get<fhicl::ParameterSet>(
"policy");
89 <<
"Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() +
"\".";
95 token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>(
"token_receiver");
100 <<
"Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() +
"\".";
105 fhicl::ParameterSet metric_pset;
108 metric_pset = daq_pset.get<fhicl::ParameterSet>(
"metrics");
113 if (metric_pset.is_empty())
115 TLOG(TLVL_INFO) <<
"No metric plugins appear to be defined";
119 metricMan->initialize(metric_pset, app_name);
123 ExceptionHandler(ExceptionHandlerRethrow::no,
124 "Error loading metrics in RoutingManagerCore::initialize()");
128 auto policy_plugin_spec = policy_pset_.get<std::string>(
"policy",
"");
129 if (policy_plugin_spec.length() == 0)
132 <<
"No fragment generator (parameter name = \"policy\") was "
133 <<
"specified in the policy ParameterSet. The "
134 <<
"DAQ initialization PSet was \"" << daq_pset.to_string() <<
"\".";
143 std::stringstream exception_string;
144 exception_string <<
"Exception thrown during initialization of policy of type \""
145 << policy_plugin_spec <<
"\"";
147 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
149 TLOG(TLVL_DEBUG + 32) <<
"FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
154 rt_priority_ = daq_pset.get<
int>(
"rt_priority", 0);
155 max_table_update_interval_ms_ = daq_pset.get<
size_t>(
"table_update_interval_ms", 1000);
156 current_table_interval_ms_ = max_table_update_interval_ms_;
157 table_update_high_fraction_ = daq_pset.get<
double>(
"table_update_interval_high_frac", 0.75);
158 table_update_low_fraction_ = daq_pset.get<
double>(
"table_update_interval_low_frac", 0.5);
161 statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
164 token_receiver_ = std::make_unique<TokenReceiver>(token_receiver_pset_, policy_, max_table_update_interval_ms_);
165 token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
166 token_receiver_->startTokenReception();
167 token_receiver_->pauseTokenReception();
169 table_listen_port_ = daq_pset.get<
int>(
"table_update_port", 35556);
171 shutdown_requested_.store(
true);
172 if (listen_thread_ && listen_thread_->joinable())
174 listen_thread_->join();
176 shutdown_requested_.store(
false);
177 TLOG(TLVL_INFO) <<
"Starting Listener Thread";
181 listen_thread_ = std::make_unique<boost::thread>(&RoutingManagerCore::listen_,
this);
183 catch (
const boost::exception& e)
185 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
186 std::cerr <<
"Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
195 stop_requested_.store(
false);
196 pause_requested_.store(
false);
198 statsHelperPtr_->resetStatistics();
200 metricMan->do_start();
201 table_update_count_ = 0;
202 token_receiver_->setRunNumber(run_id_.run());
203 token_receiver_->resumeTokenReception();
205 TLOG(TLVL_INFO) <<
"Started run " << run_id_.run();
211 TLOG(TLVL_INFO) <<
"Stopping run " << run_id_.run()
212 <<
" after " << table_update_count_ <<
" table updates."
213 <<
" and " << token_receiver_->getReceivedTokenCount() <<
" received tokens.";
214 stop_requested_.store(
true);
215 token_receiver_->pauseTokenReception();
216 run_id_ = art::RunID::flushRun();
222 TLOG(TLVL_INFO) <<
"Pausing run " << run_id_.run()
223 <<
" after " << table_update_count_ <<
" table updates."
224 <<
" and " << token_receiver_->getReceivedTokenCount() <<
" received tokens.";
225 pause_requested_.store(
true);
231 TLOG(TLVL_DEBUG + 32) <<
"Resuming run " << run_id_.run();
232 pause_requested_.store(
false);
233 metricMan->do_start();
239 shutdown_requested_.store(
true);
240 if (listen_thread_ && listen_thread_->joinable())
242 listen_thread_->join();
244 token_receiver_->stopTokenReception();
246 metricMan->shutdown();
252 TLOG(TLVL_INFO) <<
"soft_initialize method called with "
253 <<
"ParameterSet = \"" << pset.to_string()
255 return initialize(pset, timeout, timestamp);
260 TLOG(TLVL_INFO) <<
"reinitialize method called with "
261 <<
"ParameterSet = \"" << pset.to_string()
263 return initialize(pset, timeout, timestamp);
268 if (rt_priority_ > 0)
270 #pragma GCC diagnostic push
271 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
272 sched_param s_param = {};
273 s_param.sched_priority = rt_priority_;
274 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param) != 0)
276 TLOG(TLVL_WARNING) <<
"setting realtime priority failed";
278 #pragma GCC diagnostic pop
284 if (rt_priority_ > 0)
286 #pragma GCC diagnostic push
287 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
288 sched_param s_param = {};
289 s_param.sched_priority = rt_priority_;
290 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
294 <<
"Failed to set realtime priority to " << rt_priority_
295 <<
", return code = " << status;
297 #pragma GCC diagnostic pop
302 TLOG(TLVL_DEBUG + 32) <<
"Sending initial table.";
303 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
304 auto nextSendTime = startTime;
306 while (!stop_requested_ && !pause_requested_)
311 startTime = artdaq::MonitoredQuantity::getCurrentTime();
313 if (startTime >= nextSendTime)
315 auto table = policy_->GetCurrentTable();
319 TLOG(TLVL_WARNING) <<
"Routing Policy generated Empty table for this routing interval (" << current_table_interval_ms_ <<
" ms)! This may indicate issues with the receivers, if it persists."
320 <<
" Next seqID=" << policy_->GetNextSequenceID() <<
", Policy held tokens=" << policy_->GetHeldTokenCount();
324 send_event_table(table);
325 ++table_update_count_;
326 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
327 statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
328 TLOG(TLVL_DEBUG + 34) <<
"process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
330 bool readyToReport = statsHelperPtr_->readyToReport();
333 std::string statString = buildStatisticsString_();
334 TLOG(TLVL_INFO) << statString;
339 auto max_tokens = policy_->GetMaxNumberOfTokens();
342 auto frac = policy_->GetTokensUsedSinceLastUpdate() /
static_cast<double>(max_tokens);
343 policy_->ResetTokensUsedSinceLastUpdate();
344 if (frac > table_update_high_fraction_) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
345 if (frac < table_update_low_fraction_) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
346 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
347 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
349 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
350 TLOG(TLVL_DEBUG + 32) <<
"current_table_interval_ms is now " << current_table_interval_ms_;
351 statsHelperPtr_->addSample(CURRENT_TABLE_INTERVAL_STAT_KEY, current_table_interval_ms_ / 1000.0);
355 usleep(current_table_interval_ms_ * 10);
360 TLOG(TLVL_DEBUG + 32) <<
"stop_requested_ is " << stop_requested_ <<
", pause_requested_ is " << pause_requested_ <<
", exiting process_event_table loop";
362 metricMan->do_stop();
367 std::lock_guard<std::mutex> lk(fd_mutex_);
368 for (
auto& dest : connected_fds_)
370 for (
auto& connected_fd : dest.second)
373 TLOG(TLVL_DEBUG + 32) <<
"Sending table information for " << header.
nEntries <<
" events to destination " << dest.first;
374 TRACE(16,
"headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((
unsigned long*)&header)[0], ((
unsigned long*)&header)[1], ((
unsigned long*)&packet[0])[0], ((
unsigned long*)&packet[0])[1]);
375 auto sts = write(connected_fd, &header,
sizeof(header));
376 if (sts !=
sizeof(header))
378 TLOG(TLVL_ERROR) <<
"Error sending routing header to fd " << connected_fd <<
", rank " << dest.first;
385 TLOG(TLVL_ERROR) <<
"Error sending routing table. sts=" << sts <<
"/" << packet.size() *
sizeof(
detail::RoutingPacketEntry) <<
", fd=" << connected_fd <<
", rank=" << dest.first;
394 std::string resultString;
397 auto tmpString = app_name +
" run number = " + std::to_string(run_id_.run()) +
", table updates sent = " + std::to_string(table_update_count_) +
", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
401 std::string artdaq::RoutingManagerCore::buildStatisticsString_()
const
403 std::ostringstream oss;
404 oss << app_name <<
" statistics:" << std::endl;
406 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
407 if (mqPtr !=
nullptr)
409 artdaq::MonitoredQuantityStats stats;
410 mqPtr->getStats(stats);
411 oss <<
" Table Update statistics: "
412 << stats.recentSampleCount <<
" table updates sent at "
413 << stats.recentSampleRate <<
" table updates/sec, , monitor window = "
414 << stats.recentDuration <<
" sec" << std::endl;
415 oss <<
" Average times per table update: ";
416 if (stats.recentSampleRate > 0.0)
418 oss <<
" elapsed time = "
419 << (1.0 / stats.recentSampleRate) <<
" sec";
423 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
424 if (mqPtr !=
nullptr)
426 artdaq::MonitoredQuantityStats stats;
427 mqPtr->getStats(stats);
428 oss <<
" Received Token statistics: "
429 << stats.recentSampleCount <<
" tokens received at "
430 << stats.recentSampleRate <<
" tokens/sec, , monitor window = "
431 << stats.recentDuration <<
" sec" << std::endl;
432 oss <<
" Average times per token: ";
433 if (stats.recentSampleRate > 0.0)
435 oss <<
" elapsed time = "
436 << (1.0 / stats.recentSampleRate) <<
" sec";
438 oss <<
", input token wait time = "
439 << mqPtr->getRecentValueSum() <<
" sec" << std::endl;
445 void artdaq::RoutingManagerCore::sendMetrics_()
449 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
450 if (mqPtr !=
nullptr)
452 artdaq::MonitoredQuantityStats stats;
453 mqPtr->getStats(stats);
454 metricMan->sendMetric(
"Table Update Count", stats.fullSampleCount,
"updates", 1, MetricMode::LastPoint);
455 metricMan->sendMetric(
"Table Update Rate", stats.recentSampleRate,
"updates/sec", 1, MetricMode::Average);
458 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
459 if (mqPtr !=
nullptr)
461 artdaq::MonitoredQuantityStats stats;
462 mqPtr->getStats(stats);
463 metricMan->sendMetric(
"Receiver Token Count", stats.fullSampleCount,
"updates", 1, MetricMode::LastPoint);
464 metricMan->sendMetric(
"Receiver Token Rate", stats.recentSampleRate,
"updates/sec", 1, MetricMode::Average);
465 metricMan->sendMetric(
"Total Receiver Token Wait Time", mqPtr->getRecentValueSum(),
"seconds", 3, MetricMode::Average);
468 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(CURRENT_TABLE_INTERVAL_STAT_KEY);
469 if (mqPtr.get() !=
nullptr)
471 artdaq::MonitoredQuantityStats stats;
472 mqPtr->getStats(stats);
473 metricMan->sendMetric(
"Table Update Interval", stats.recentValueAverage,
"s", 3, MetricMode::Average);
478 void artdaq::RoutingManagerCore::listen_()
482 epoll_fd_ = epoll_create1(0);
485 while (shutdown_requested_ ==
false)
487 TLOG(TLVL_DEBUG + 33) <<
"listen_: Listening/accepting new connections on port " << table_listen_port_;
490 TLOG(TLVL_DEBUG + 32) <<
"listen_: Opening listener";
495 TLOG(TLVL_DEBUG + 32) <<
"listen_: Error creating listen_fd!";
503 FD_SET(listen_fd, &rfds);
505 res = select(listen_fd + 1, &rfds, static_cast<fd_set*>(
nullptr), static_cast<fd_set*>(
nullptr), &tv);
510 socklen_t arglen =
sizeof(un);
512 TLOG(TLVL_DEBUG + 32) <<
"listen_: Calling accept";
513 fd = accept(listen_fd, reinterpret_cast<sockaddr*>(&un), &arglen);
514 TLOG(TLVL_DEBUG + 32) <<
"listen_: Done with accept";
516 TLOG(TLVL_DEBUG + 32) <<
"listen_: Reading connect message";
517 socklen_t lenlen =
sizeof(tv);
519 setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, lenlen);
520 detail::RoutingRequest rch;
521 uint64_t mark_us = TimeUtils::gettimeofday_us();
522 sts = read(fd, &rch,
sizeof(rch));
523 uint64_t delta_us = TimeUtils::gettimeofday_us() - mark_us;
524 TLOG(TLVL_DEBUG + 32) <<
"listen_: Read of connect message took " << delta_us <<
" microseconds.";
525 if (sts !=
sizeof(rch))
527 TLOG(TLVL_DEBUG + 32) <<
"listen_: Wrong message header length received!";
533 if (rch.header != ROUTING_MAGIC || !(rch.mode == detail::RoutingRequest::RequestMode::Connect))
535 TLOG(TLVL_DEBUG + 32) <<
"listen_: Wrong magic bytes in header! rch.header: " << std::hex << rch.header;
541 std::lock_guard<std::mutex> lk(fd_mutex_);
542 connected_fds_[rch.rank].insert(fd);
543 struct epoll_event ev;
546 epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ev);
547 TLOG(TLVL_INFO) <<
"listen_: New fd is " << fd <<
" for table receiver rank " << rch.rank;
551 TLOG(TLVL_DEBUG + 34) <<
"listen_: No connections in timeout interval!";
555 TLOG(TLVL_INFO) <<
"listen_: Shutting down connection listener";
560 std::lock_guard<std::mutex> lk(fd_mutex_);
561 for (
auto& fd_set : connected_fds_)
563 for (
auto& fd : fd_set.second)
565 epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd,
nullptr);
569 connected_fds_.clear();
573 void artdaq::RoutingManagerCore::receive_()
577 epoll_fd_ = epoll_create1(0);
579 std::vector<epoll_event> received_events(10);
584 std::lock_guard<std::mutex> lk(fd_mutex_);
585 nfds = epoll_wait(epoll_fd_, &received_events[0], received_events.size(), 1);
588 TLOG(TLVL_ERROR) <<
"Error status received from epoll_wait, exiting with code " << EXIT_FAILURE <<
", errno=" << errno <<
" (" << strerror(errno) <<
")";
589 perror(
"epoll_wait");
595 TLOG(TLVL_DEBUG + 35) <<
"Received " << nfds <<
" events on table sockets";
597 for (
auto n = 0; n < nfds; ++n)
603 if ((received_events[n].events & EPOLLIN) != 0)
605 detail::RoutingRequest buff;
606 auto stss = read(received_events[n].data.fd, &buff,
sizeof(detail::RoutingRequest) - sts);
610 TLOG(TLVL_INFO) <<
"Received 0-size request from " << find_fd_(received_events[n].data.fd);
613 else if (stss < 0 && errno == EAGAIN)
615 TLOG(TLVL_DEBUG + 32) <<
"No more requests from this rank. Continuing poll loop.";
620 TLOG(TLVL_ERROR) <<
"Error reading from request socket: sts=" << sts <<
", errno=" << errno <<
" (" << strerror(errno) <<
")";
621 close(received_events[n].data.fd);
622 epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd,
nullptr);
625 else if (sts ==
sizeof(detail::RoutingRequest) && buff.header != ROUTING_MAGIC)
627 TLOG(TLVL_ERROR) <<
"Received invalid request from " << find_fd_(received_events[n].data.fd) <<
" sts=" << sts <<
", header=" << std::hex << buff.header;
630 else if (sts ==
sizeof(detail::RoutingRequest))
635 detail::RoutingPacketEntry reply;
639 case detail::RoutingRequest::RequestMode::Disconnect:
640 connected_fds_[buff.rank].erase(received_events[n].data.fd);
641 close(received_events[n].data.fd);
642 epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd,
nullptr);
645 case detail::RoutingRequest::RequestMode::Request:
646 reply = policy_->GetRouteForSequenceID(buff.sequence_id, buff.rank);
647 if (reply.sequence_id == buff.sequence_id)
649 TLOG(TLVL_DEBUG + 33) <<
"Reply to request from " << buff.rank <<
" with route to " << reply.destination_rank <<
" for sequence ID " << buff.sequence_id;
650 detail::RoutingPacketHeader hdr(1);
651 write(received_events[n].data.fd, &hdr,
sizeof(hdr));
652 write(received_events[n].data.fd, &reply,
sizeof(detail::RoutingPacketEntry));
656 TLOG(TLVL_DEBUG + 33) <<
"Unable to route request, replying with empty RoutingPacket";
657 detail::RoutingPacketHeader hdr(0);
658 write(received_events[n].data.fd, &hdr,
sizeof(hdr));
669 TLOG(TLVL_DEBUG + 32) <<
"Received event mask " << received_events[n].events <<
" from table socket rank " << find_fd_(received_events[n].data.fd);
676 int artdaq::RoutingManagerCore::find_fd_(
int fd)
const
678 for (
auto& rank : connected_fds_)
680 if (rank.second.count(fd) != 0)
This class manages MonitoredQuantity instances for the *Core classes.
bool shutdown(uint64_t)
Shuts down the RoutingManagerCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingManagerCore.
A row of the Routing Table.
static std::string RequestModeToString(RequestMode m)
Convert a RequestMode enumeration value to string.
static const std::string TABLE_UPDATES_STAT_KEY
Key for the Table Update count MonitoredQuantity.
bool pause(uint64_t, uint64_t)
Pauses the RoutingManagerCore.
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer size.
Multiple sources sending to a single destination. RoutingManager pushes table updates to all senders...
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network transmission.
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
Multiple sources sending to a single destination. Table updates are triggered by senders requesting r...
bool stop(uint64_t, uint64_t)
Stops the RoutingManagerCore.
void process_event_table()
Main loop of the RoutingManagerCore. Determines when to send the next table update, asks the RoutingManagerPolicy for the table to send, and sends it.
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingManagerCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
std::string report(std::string const &) const
Send a report on the current status of the RoutingManagerCore.
bool resume(uint64_t, uint64_t)
Resumes the RoutingManagerCore.
void send_event_table(detail::RoutingPacket packet)
Sends a detail::RoutingPacket to the table receivers.
RoutingManagerCore()
RoutingManagerCore Constructor.
static const std::string CURRENT_TABLE_INTERVAL_STAT_KEY
Key for the Current Table Interval MonitoredQuantity.
std::shared_ptr< RoutingManagerPolicy > makeRoutingManagerPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingManagerPolicy plugin.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingManagerCore.