10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib_except/exception.h"
13 #include "artdaq/DAQdata/Globals.hh"
14 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
18 #include "artdaq/Application/RoutingMasterCore.hh"
21 #include "artdaq/RoutingPolicies/makeRoutingMasterPolicy.hh"
29 : shutdown_requested_(false)
30 , stop_requested_(true)
31 , pause_requested_(false)
36 TLOG(TLVL_DEBUG) <<
"Constructor";
43 TLOG(TLVL_DEBUG) <<
"Destructor";
44 artdaq::StatisticsCollection::getInstance().requestStop();
45 token_receiver_->stopTokenReception(
true);
50 TLOG(TLVL_DEBUG) <<
"initialize method called with "
51 <<
"ParameterSet = \"" << pset.to_string()
55 fhicl::ParameterSet daq_pset;
58 daq_pset = pset.get<fhicl::ParameterSet>(
"daq");
63 <<
"Unable to find the DAQ parameters in the initialization "
64 <<
"ParameterSet: \"" + pset.to_string() +
"\".";
68 if (daq_pset.has_key(
"rank"))
70 if (my_rank >= 0 && daq_pset.get<
int>(
"rank") != my_rank)
72 TLOG(TLVL_WARNING) <<
"Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
74 my_rank = daq_pset.get<
int>(
"rank");
78 TLOG(TLVL_ERROR) <<
"Routing Master rank not specified at startup or in configuration! Aborting";
84 policy_pset_ = daq_pset.get<fhicl::ParameterSet>(
"policy");
89 <<
"Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() +
"\".";
95 token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>(
"token_receiver");
100 <<
"Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() +
"\".";
105 fhicl::ParameterSet metric_pset;
108 metric_pset = daq_pset.get<fhicl::ParameterSet>(
"metrics");
113 if (metric_pset.is_empty())
115 TLOG(TLVL_INFO) <<
"No metric plugins appear to be defined";
119 metricMan->initialize(metric_pset, app_name);
123 ExceptionHandler(ExceptionHandlerRethrow::no,
124 "Error loading metrics in RoutingMasterCore::initialize()");
128 auto policy_plugin_spec = policy_pset_.get<std::string>(
"policy",
"");
129 if (policy_plugin_spec.length() == 0)
132 <<
"No fragment generator (parameter name = \"policy\") was "
133 <<
"specified in the policy ParameterSet. The "
134 <<
"DAQ initialization PSet was \"" << daq_pset.to_string() <<
"\".";
143 std::stringstream exception_string;
144 exception_string <<
"Exception thrown during initialization of policy of type \""
145 << policy_plugin_spec <<
"\"";
147 ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
149 TLOG(TLVL_DEBUG) <<
"FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
154 rt_priority_ = daq_pset.get<
int>(
"rt_priority", 0);
155 sender_ranks_ = daq_pset.get<std::vector<int>>(
"sender_ranks");
157 receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
159 auto mode = daq_pset.get<
bool>(
"senders_send_by_send_count",
false);
161 max_table_update_interval_ms_ = daq_pset.get<
size_t>(
"table_update_interval_ms", 1000);
162 current_table_interval_ms_ = max_table_update_interval_ms_;
163 max_ack_cycle_count_ = daq_pset.get<
size_t>(
"table_ack_retry_count", 5);
164 send_tables_port_ = daq_pset.get<
int>(
"table_update_port", 35556);
165 receive_acks_port_ = daq_pset.get<
int>(
"table_acknowledge_port", 35557);
166 send_tables_address_ = daq_pset.get<std::string>(
"table_update_address",
"227.128.12.28");
167 multicast_out_hostname_ = daq_pset.get<std::string>(
"routing_master_hostname",
"localhost");
170 statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
173 token_receiver_.reset(
new TokenReceiver(token_receiver_pset_, policy_, routing_mode_, sender_ranks_.size(), max_table_update_interval_ms_));
174 token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
175 token_receiver_->startTokenReception();
176 token_receiver_->pauseTokenReception();
178 shutdown_requested_.store(
false);
185 stop_requested_.store(
false);
186 pause_requested_.store(
false);
188 statsHelperPtr_->resetStatistics();
190 metricMan->do_start();
191 table_update_count_ = 0;
192 token_receiver_->setRunNumber(run_id_.run());
193 token_receiver_->resumeTokenReception();
195 TLOG(TLVL_INFO) <<
"Started run " << run_id_.run();
201 TLOG(TLVL_INFO) <<
"Stopping run " << run_id_.run()
202 <<
" after " << table_update_count_ <<
" table updates."
203 <<
" and " << token_receiver_->getReceivedTokenCount() <<
" received tokens.";
204 stop_requested_.store(
true);
205 token_receiver_->pauseTokenReception();
206 run_id_ = art::RunID::flushRun();
212 TLOG(TLVL_INFO) <<
"Pausing run " << run_id_.run()
213 <<
" after " << table_update_count_ <<
" table updates."
214 <<
" and " << token_receiver_->getReceivedTokenCount() <<
" received tokens.";
215 pause_requested_.store(
true);
221 TLOG(TLVL_DEBUG) <<
"Resuming run " << run_id_.run();
222 pause_requested_.store(
false);
223 metricMan->do_start();
229 shutdown_requested_.store(
true);
230 token_receiver_->stopTokenReception();
232 metricMan->shutdown();
238 TLOG(TLVL_INFO) <<
"soft_initialize method called with "
239 <<
"ParameterSet = \"" << pset.to_string()
241 return initialize(pset, e, f);
246 TLOG(TLVL_INFO) <<
"reinitialize method called with "
247 <<
"ParameterSet = \"" << pset.to_string()
249 return initialize(pset, e, f);
254 if (rt_priority_ > 0)
256 #pragma GCC diagnostic push
257 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
258 sched_param s_param = {};
259 s_param.sched_priority = rt_priority_;
260 if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
261 TLOG(TLVL_WARNING) <<
"setting realtime priority failed";
262 #pragma GCC diagnostic pop
268 if (rt_priority_ > 0)
270 #pragma GCC diagnostic push
271 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
272 sched_param s_param = {};
273 s_param.sched_priority = rt_priority_;
274 int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
278 <<
"Failed to set realtime priority to " << rt_priority_
279 <<
", return code = " << status;
281 #pragma GCC diagnostic pop
286 TLOG(TLVL_DEBUG) <<
"Sending initial table.";
287 auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
288 auto nextSendTime = startTime;
290 while (!stop_requested_ && !pause_requested_)
292 startTime = artdaq::MonitoredQuantity::getCurrentTime();
294 if (startTime >= nextSendTime)
296 auto table = policy_->GetCurrentTable();
297 if (table.size() > 0)
299 send_event_table(table);
300 ++table_update_count_;
301 delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
302 statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
303 TLOG(16) <<
"process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
305 bool readyToReport = statsHelperPtr_->readyToReport();
308 std::string statString = buildStatisticsString_();
309 TLOG(TLVL_INFO) << statString;
315 TLOG(TLVL_TRACE) <<
"No tokens received in this update interval (" << current_table_interval_ms_ <<
" ms)! This most likely means that the receivers are not keeping up!";
317 auto max_tokens = policy_->GetMaxNumberOfTokens();
320 auto frac = table.size() /
static_cast<double>(max_tokens);
321 if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
322 if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
323 if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
324 if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
326 nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
327 TLOG(TLVL_TRACE) <<
"current_table_interval_ms is now " << current_table_interval_ms_;
331 usleep(current_table_interval_ms_ * 10);
336 metricMan->do_stop();
342 if (table_socket_ == -1)
344 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
345 if (table_socket_ < 0)
347 TLOG(TLVL_ERROR) <<
"I failed to create the socket for sending Data Requests! Errno: " << errno;
350 auto sts =
ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
353 TLOG(TLVL_ERROR) <<
"Unable to resolve table_update_address";
358 if (multicast_out_hostname_ !=
"localhost")
360 TLOG(TLVL_DEBUG) <<
"Making sure that multicast sending uses the correct interface for hostname " << multicast_out_hostname_;
362 sts =
ResolveHost(multicast_out_hostname_.c_str(), addr);
365 throw art::Exception(art::errors::Configuration) <<
"RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;
369 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
371 throw art::Exception(art::errors::Configuration) <<
"RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
375 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes,
sizeof(yes)) < 0)
377 TLOG(TLVL_ERROR) <<
"Unable to enable multicast loopback on table socket";
380 if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr,
sizeof(addr)) == -1)
382 TLOG(TLVL_ERROR) <<
"Cannot set outgoing interface. Errno: " << errno;
386 if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (
void*)&yes,
sizeof(
int)) == -1)
388 TLOG(TLVL_ERROR) <<
"Cannot set request socket to broadcast. Errno: " << errno;
394 if (ack_socket_ == -1)
396 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
399 throw art::Exception(art::errors::Configuration) <<
"RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
403 struct sockaddr_in si_me_request;
406 if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
408 throw art::Exception(art::errors::Configuration) <<
"RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
415 socklen_t arglen =
sizeof(len);
416 sts = getsockopt(ack_socket_, SOL_SOCKET, SO_RCVBUF, &len, &arglen);
417 TLOG(TLVL_INFO) <<
"ACK RCVBUF initial: " << len <<
" sts/errno=" << sts <<
"/" << errno <<
" arglen=" << arglen;
419 memset(&si_me_request, 0,
sizeof(si_me_request));
420 si_me_request.sin_family = AF_INET;
421 si_me_request.sin_port = htons(receive_acks_port_);
422 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
423 if (bind(ack_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request),
sizeof(si_me_request)) == -1)
425 throw art::Exception(art::errors::Configuration) <<
"RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
428 TLOG(TLVL_DEBUG) <<
"Listening for acks on 0.0.0.0 port " << receive_acks_port_;
431 auto acks = std::unordered_map<int, bool>();
432 for (
auto& r : sender_ranks_)
437 auto start_time = std::chrono::steady_clock::now();
438 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {
return !p.second; }) > 0 && !stop_requested_)
445 for (
auto ackIter = acks.begin(); ackIter != acks.end(); ++ackIter)
447 TLOG(27) <<
"Table update already acknowledged? rank " << ackIter->first <<
" is " << ackIter->second
448 <<
" (size of 'already_acknowledged_ranks bitset is " << (8 *
sizeof(header.already_acknowledged_ranks)) <<
")";
449 if (ackIter->first < static_cast<int>(8 *
sizeof(header.already_acknowledged_ranks)))
451 if (ackIter->second) { header.already_acknowledged_ranks.set(ackIter->first); }
455 assert(packetSize +
sizeof(header) < MAX_ROUTING_TABLE_SIZE);
456 std::vector<uint8_t> buffer(packetSize +
sizeof(header));
460 TLOG(TLVL_DEBUG) <<
"Sending table information for " << header.nEntries <<
" events to multicast group " << send_tables_address_ <<
", port " << send_tables_port_ <<
", outgoing interface " << multicast_out_hostname_;
461 TRACE(16,
"headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((
unsigned long*)&header)[0], ((
unsigned long*)&header)[1], ((
unsigned long*)&packet[0])[0], ((
unsigned long*)&packet[0])[1]);
462 auto sts = sendto(table_socket_, &buffer[0], buffer.size(), 0,
reinterpret_cast<struct sockaddr*
>(&send_tables_addr_),
sizeof(send_tables_addr_));
463 if (sts != static_cast<ssize_t>(buffer.size()))
465 TLOG(TLVL_ERROR) <<
"Error sending routing table. sts=" << sts;
470 auto first = packet[0].sequence_id;
471 auto last = packet.rbegin()->sequence_id;
472 TLOG(TLVL_DEBUG) <<
"Sent " << sts <<
" bytes. Expecting acks to have first= " << first <<
", and last= " << last;
474 auto startTime = std::chrono::steady_clock::now();
475 while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {
return !p.second; }) > 0)
477 auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
478 if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
480 if (++counter > max_ack_cycle_count_ && table_update_count_ > 0)
482 TLOG(TLVL_WARNING) <<
"Did not receive acks from all senders after resending table " << counter
483 <<
" times during the table_update_interval. Check the status of the senders!";
487 TLOG(TLVL_WARNING) <<
"Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms <<
" ms). Resending table update";
490 if (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {
return !p.second; }) <= 3)
492 auto ackIter = acks.begin();
493 while (ackIter != acks.end())
495 if (!ackIter->second)
497 TLOG(TLVL_TRACE) <<
"Did not receive ack from rank " << ackIter->first;
505 TLOG(20) <<
"send_event_table: Polling Request socket for new requests";
512 if (errno == EWOULDBLOCK || errno == EAGAIN)
514 TLOG(20) <<
"send_event_table: No more ack datagrams on ack socket.";
519 TLOG(TLVL_ERROR) <<
"An unexpected error occurred during ack packet receive";
525 TLOG(TLVL_DEBUG) <<
"Ack packet from rank " << buffer.
rank <<
" has first= " << buffer.
first_sequence_id
529 TLOG(TLVL_DEBUG) <<
"Received table update acknowledgement from sender with rank " << buffer.
rank <<
".";
530 acks[buffer.
rank] =
true;
531 TLOG(TLVL_DEBUG) <<
"There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {
return !p.second; })
532 <<
" acks outstanding";
536 if (!acks.count(buffer.
rank))
538 TLOG(TLVL_ERROR) <<
"Received acknowledgement from invalid rank " << buffer.
rank <<
"!"
539 <<
" Cross-talk between RoutingMasters means there's a configuration error!";
543 TLOG(TLVL_WARNING) <<
"Received acknowledgement from rank " << buffer.
rank
544 <<
" that had incorrect sequence ID information. Discarding."
545 <<
" Expected first/last=" << first <<
"/" << last
551 usleep(table_ack_wait_time_ms * 1000 / 10);
556 artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
557 metricMan->sendMetric(
"Avg Table Acknowledge Time", delta.count(),
"seconds", 3, MetricMode::Average);
563 std::string resultString;
566 auto tmpString = app_name +
" run number = " + std::to_string(run_id_.run()) +
", table updates sent = " + std::to_string(table_update_count_) +
", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
570 std::string artdaq::RoutingMasterCore::buildStatisticsString_()
const
572 std::ostringstream oss;
573 oss << app_name <<
" statistics:" << std::endl;
575 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
576 if (mqPtr.get() !=
nullptr)
578 artdaq::MonitoredQuantityStats stats;
579 mqPtr->getStats(stats);
580 oss <<
" Table Update statistics: "
581 << stats.recentSampleCount <<
" table updates sent at "
582 << stats.recentSampleRate <<
" table updates/sec, , monitor window = "
583 << stats.recentDuration <<
" sec" << std::endl;
584 oss <<
" Average times per table update: ";
585 if (stats.recentSampleRate > 0.0)
587 oss <<
" elapsed time = "
588 << (1.0 / stats.recentSampleRate) <<
" sec";
590 oss <<
", avg table acknowledgement wait time = "
591 << (mqPtr->getRecentValueSum() / sender_ranks_.size()) <<
" sec" << std::endl;
594 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
595 if (mqPtr.get() !=
nullptr)
597 artdaq::MonitoredQuantityStats stats;
598 mqPtr->getStats(stats);
599 oss <<
" Received Token statistics: "
600 << stats.recentSampleCount <<
" tokens received at "
601 << stats.recentSampleRate <<
" tokens/sec, , monitor window = "
602 << stats.recentDuration <<
" sec" << std::endl;
603 oss <<
" Average times per token: ";
604 if (stats.recentSampleRate > 0.0)
606 oss <<
" elapsed time = "
607 << (1.0 / stats.recentSampleRate) <<
" sec";
609 oss <<
", input token wait time = "
610 << mqPtr->getRecentValueSum() <<
" sec" << std::endl;
616 void artdaq::RoutingMasterCore::sendMetrics_()
620 auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
621 if (mqPtr.get() !=
nullptr)
623 artdaq::MonitoredQuantityStats stats;
624 mqPtr->getStats(stats);
625 metricMan->sendMetric(
"Table Update Count", static_cast<unsigned long>(stats.fullSampleCount),
"updates", 1, MetricMode::LastPoint);
626 metricMan->sendMetric(
"Table Update Rate", stats.recentSampleRate,
"updates/sec", 1, MetricMode::Average);
627 metricMan->sendMetric(
"Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()),
"seconds", 3, MetricMode::Average);
630 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
631 if (mqPtr.get() !=
nullptr)
633 artdaq::MonitoredQuantityStats stats;
634 mqPtr->getStats(stats);
635 metricMan->sendMetric(
"Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount),
"updates", 1, MetricMode::LastPoint);
636 metricMan->sendMetric(
"Receiver Token Rate", stats.recentSampleRate,
"updates/sec", 1, MetricMode::Average);
637 metricMan->sendMetric(
"Total Receiver Token Wait Time", mqPtr->getRecentValueSum(),
"seconds", 3, MetricMode::Average);
This class manages MonitoredQuantity instances for the *Core classes.
bool resume(uint64_t, uint64_t)
Resumes the RoutingMasterCore.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingMasterCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
A row of the Routing Table.
Receives event builder "free buffer" tokens and adds them to a specified RoutingPolicy.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingMasterCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingMasterCore.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in the received RoutingPacket.
RoutingMasterCore()
RoutingMasterCore Constructor.
Events should be routed by sequence ID (BR -> EB)
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
std::shared_ptr< RoutingMasterPolicy > makeRoutingMasterPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingMasterPolicy plugin.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
bool pause(uint64_t, uint64_t)
Pauses the RoutingMasterCore.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool stop(uint64_t, uint64_t)
Stops the RoutingMasterCore.
int rank
The rank from which the RoutingAckPacket came.
std::string report(std::string const &) const
Send a report on the current status of the RoutingMasterCore.
static const std::string TABLE_UPDATES_STAT_KEY
Key for the Table Update count MonitoredQuantity.
bool shutdown(uint64_t)
Shuts Down the RoutingMasterCore.
void process_event_table()
Main loop of the RoutingMasterCore. Determines when to send the next table update, asks the RoutingMasterPolicy for the table to send, and sends it.
void send_event_table(detail::RoutingPacket table)
Sends a detail::RoutingPacket to the table receivers.
Events should be routed by send count (EB -> Agg)