artdaq  v3_09_01
RoutingManagerCore.cc
1 #include <arpa/inet.h>
2 #include <netdb.h>
3 #include <pthread.h>
4 #include <sched.h>
5 #include <sys/epoll.h>
6 #include <sys/time.h>
7 #include <sys/un.h>
8 #include <algorithm>
9 #include <memory>
10 
11 #include "canvas/Utilities/Exception.h"
12 #include "cetlib_except/exception.h"
13 
14 #include "artdaq/DAQdata/Globals.hh" // include these 2 first to get tracemf.h -
15 #define TRACE_NAME (app_name + "_RoutingManagerCore").c_str() // before trace.h
16 #include "artdaq-core/Data/Fragment.hh"
17 #include "artdaq-core/Utilities/ExceptionHandler.hh"
18 
19 #include "artdaq/Application/RoutingManagerCore.hh"
22 #include "artdaq/RoutingPolicies/makeRoutingManagerPolicy.hh"
23 
24 const std::string artdaq::RoutingManagerCore::
25  TABLE_UPDATES_STAT_KEY("RoutingManagerCoreTableUpdates");
26 const std::string artdaq::RoutingManagerCore::
27  TOKENS_RECEIVED_STAT_KEY("RoutingManagerCoreTokensReceived");
28 
30  : shutdown_requested_(false)
31  , stop_requested_(true)
32  , pause_requested_(false)
33  , statsHelperPtr_(new artdaq::StatisticsHelper())
34 {
35  TLOG(TLVL_DEBUG) << "Constructor";
36  statsHelperPtr_->addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
37  statsHelperPtr_->addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
38 }
39 
41 {
42  TLOG(TLVL_DEBUG) << "Destructor";
43  artdaq::StatisticsCollection::getInstance().requestStop();
44  token_receiver_->stopTokenReception(true);
45 }
46 
47 bool artdaq::RoutingManagerCore::initialize(fhicl::ParameterSet const& pset, uint64_t /*unused*/, uint64_t /*unused*/)
48 {
49  TLOG(TLVL_DEBUG) << "initialize method called with "
50  << "ParameterSet = \"" << pset.to_string()
51  << "\".";
52 
53  // pull out the relevant parts of the ParameterSet
54  fhicl::ParameterSet daq_pset;
55  try
56  {
57  daq_pset = pset.get<fhicl::ParameterSet>("daq");
58  }
59  catch (...)
60  {
61  TLOG(TLVL_ERROR)
62  << "Unable to find the DAQ parameters in the initialization "
63  << "ParameterSet: \"" + pset.to_string() + "\".";
64  return false;
65  }
66 
67  if (daq_pset.has_key("rank"))
68  {
69  if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank)
70  {
71  TLOG(TLVL_WARNING) << "Routing Manager rank specified at startup is different than rank specified at configure! Using rank received at configure!";
72  }
73  my_rank = daq_pset.get<int>("rank");
74  }
75  if (my_rank == -1)
76  {
77  TLOG(TLVL_ERROR) << "Routing Manager rank not specified at startup or in configuration! Aborting";
78  exit(1);
79  }
80 
81  try
82  {
83  policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
84  }
85  catch (...)
86  {
87  TLOG(TLVL_ERROR)
88  << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
89  return false;
90  }
91 
92  try
93  {
94  token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>("token_receiver");
95  }
96  catch (...)
97  {
98  TLOG(TLVL_ERROR)
99  << "Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
100  return false;
101  }
102 
103  // pull out the Metric part of the ParameterSet
104  fhicl::ParameterSet metric_pset;
105  try
106  {
107  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
108  }
109  catch (...)
110  {} // OK if there's no metrics table defined in the FHiCL
111 
112  if (metric_pset.is_empty())
113  {
114  TLOG(TLVL_INFO) << "No metric plugins appear to be defined";
115  }
116  try
117  {
118  metricMan->initialize(metric_pset, app_name);
119  }
120  catch (...)
121  {
122  ExceptionHandler(ExceptionHandlerRethrow::no,
123  "Error loading metrics in RoutingManagerCore::initialize()");
124  }
125 
126  // create the requested RoutingPolicy
127  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
128  if (policy_plugin_spec.length() == 0)
129  {
130  TLOG(TLVL_ERROR)
131  << "No fragment generator (parameter name = \"policy\") was "
132  << "specified in the policy ParameterSet. The "
133  << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\".";
134  return false;
135  }
136  try
137  {
138  policy_ = artdaq::makeRoutingManagerPolicy(policy_plugin_spec, policy_pset_);
139  }
140  catch (...)
141  {
142  std::stringstream exception_string;
143  exception_string << "Exception thrown during initialization of policy of type \""
144  << policy_plugin_spec << "\"";
145 
146  ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
147 
148  TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
149 
150  return false;
151  }
152 
153  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
154  sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
155 
156  receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
157 
158  auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
160  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
161  current_table_interval_ms_ = max_table_update_interval_ms_;
162  max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
163  send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
164  receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
165  send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
166  multicast_out_hostname_ = daq_pset.get<std::string>("routing_manager_hostname", "localhost");
167 
168  // fetch the monitoring parameters and create the MonitoredQuantity instances
169  statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
170 
171  // create the requested TokenReceiver
172  token_receiver_ = std::make_unique<TokenReceiver>(token_receiver_pset_, policy_, routing_mode_, sender_ranks_.size(), max_table_update_interval_ms_);
173  token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
174  token_receiver_->startTokenReception();
175  token_receiver_->pauseTokenReception();
176 
177  shutdown_requested_.store(false);
178  return true;
179 }
180 
181 bool artdaq::RoutingManagerCore::start(art::RunID id, uint64_t /*unused*/, uint64_t /*unused*/)
182 {
183  run_id_ = id;
184  for (auto& rank : sender_ranks_)
185  {
186  if (active_ranks_.count(rank) == 0u)
187  {
188  active_ranks_.insert(rank);
189  }
190  }
191  stop_requested_.store(false);
192  pause_requested_.store(false);
193 
194  statsHelperPtr_->resetStatistics();
195 
196  metricMan->do_start();
197  table_update_count_ = 0;
198  token_receiver_->setRunNumber(run_id_.run());
199  token_receiver_->resumeTokenReception();
200 
201  TLOG(TLVL_INFO) << "Started run " << run_id_.run();
202  return true;
203 }
204 
205 bool artdaq::RoutingManagerCore::stop(uint64_t /*unused*/, uint64_t /*unused*/)
206 {
207  TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
208  << " after " << table_update_count_ << " table updates."
209  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
210  stop_requested_.store(true);
211  token_receiver_->pauseTokenReception();
212  run_id_ = art::RunID::flushRun();
213  return true;
214 }
215 
216 bool artdaq::RoutingManagerCore::pause(uint64_t /*unused*/, uint64_t /*unused*/)
217 {
218  TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
219  << " after " << table_update_count_ << " table updates."
220  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
221  pause_requested_.store(true);
222  return true;
223 }
224 
225 bool artdaq::RoutingManagerCore::resume(uint64_t /*unused*/, uint64_t /*unused*/)
226 {
227  TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run();
228  pause_requested_.store(false);
229  metricMan->do_start();
230  return true;
231 }
232 
233 bool artdaq::RoutingManagerCore::shutdown(uint64_t /*unused*/)
234 {
235  shutdown_requested_.store(true);
236  token_receiver_->stopTokenReception();
237  policy_.reset();
238  metricMan->shutdown();
239  return true;
240 }
241 
242 bool artdaq::RoutingManagerCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
243 {
244  TLOG(TLVL_INFO) << "soft_initialize method called with "
245  << "ParameterSet = \"" << pset.to_string()
246  << "\".";
247  return initialize(pset, e, f);
248 }
249 
250 bool artdaq::RoutingManagerCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
251 {
252  TLOG(TLVL_INFO) << "reinitialize method called with "
253  << "ParameterSet = \"" << pset.to_string()
254  << "\".";
255  return initialize(pset, e, f);
256 }
257 
259 {
260  if (rt_priority_ > 0)
261  {
262 #pragma GCC diagnostic push
263 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
264  sched_param s_param = {};
265  s_param.sched_priority = rt_priority_;
266  if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param) != 0)
267  {
268  TLOG(TLVL_WARNING) << "setting realtime priority failed";
269  }
270 #pragma GCC diagnostic pop
271  }
272 
273  // try-catch block here?
274 
275  // how to turn RT PRI off?
276  if (rt_priority_ > 0)
277  {
278 #pragma GCC diagnostic push
279 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
280  sched_param s_param = {};
281  s_param.sched_priority = rt_priority_;
282  int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
283  if (status != 0)
284  {
285  TLOG(TLVL_ERROR)
286  << "Failed to set realtime priority to " << rt_priority_
287  << ", return code = " << status;
288  }
289 #pragma GCC diagnostic pop
290  }
291 
292  //MPI_Barrier(local_group_comm_);
293 
294  TLOG(TLVL_DEBUG) << "Sending initial table.";
295  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
296  auto nextSendTime = startTime;
297  double delta_time;
298  while (!stop_requested_ && !pause_requested_)
299  {
300  startTime = artdaq::MonitoredQuantity::getCurrentTime();
301 
302  if (startTime >= nextSendTime)
303  {
304  auto table = policy_->GetCurrentTable();
305  if (!table.empty())
306  {
307  send_event_table(table);
308  ++table_update_count_;
309  delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
310  statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
311  TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
312 
313  bool readyToReport = statsHelperPtr_->readyToReport();
314  if (readyToReport)
315  {
316  std::string statString = buildStatisticsString_();
317  TLOG(TLVL_INFO) << statString;
318  sendMetrics_();
319  }
320  }
321  else
322  {
323  TLOG(TLVL_TRACE) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! This most likely means that the receivers are not keeping up!";
324  }
325  auto max_tokens = policy_->GetMaxNumberOfTokens();
326  if (max_tokens > 0)
327  {
328  auto frac = table.size() / static_cast<double>(max_tokens);
329  if (frac > 0.75)
330  {
331  current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
332  }
333  if (frac < 0.5)
334  {
335  current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
336  }
337  if (current_table_interval_ms_ > max_table_update_interval_ms_)
338  {
339  current_table_interval_ms_ = max_table_update_interval_ms_;
340  }
341  if (current_table_interval_ms_ < 1)
342  {
343  current_table_interval_ms_ = 1;
344  }
345  }
346  nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
347  TLOG(TLVL_TRACE) << "current_table_interval_ms is now " << current_table_interval_ms_;
348  }
349  else
350  {
351  usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
352  }
353  }
354 
355  if (stop_requested_ && ack_socket_ != -1) {
356  TLOG(TLVL_INFO) << "Shutting down Routing Manager: Draining ack socket BEGIN";
357  auto ready = true;
358  while (ready)
359  {
361  if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, nullptr, nullptr) < 0)
362  {
363  if (errno == EWOULDBLOCK || errno == EAGAIN)
364  {
365  TLOG(20) << "No more ack datagrams on ack socket.";
366  ready = false;
367  }
368  else
369  {
370  TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive";
371  exit(2);
372  }
373  }
374  }
375  TLOG(TLVL_INFO) << "Shutting down Routing Manager: Draining ack socket END";
376  }
377 
378  TLOG(TLVL_DEBUG) << "stop_requested_ is " << stop_requested_ << ", pause_requested_ is " << pause_requested_ << ", exiting process_event_table loop";
379  policy_->Reset();
380  metricMan->do_stop();
381 }
382 
384 {
385  // Reconnect table socket, if necessary
386  if (table_socket_ == -1)
387  {
388  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
389  if (table_socket_ < 0)
390  {
391  TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! Errno: " << errno;
392  exit(1);
393  }
394  auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
395  if (sts == -1)
396  {
397  TLOG(TLVL_ERROR) << "Unable to resolve table_update_address";
398  exit(1);
399  }
400 
401  auto yes = 1;
402  if (multicast_out_hostname_ != "localhost")
403  {
404  TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << multicast_out_hostname_;
405  struct in_addr addr;
406  sts = GetInterfaceForNetwork(multicast_out_hostname_.c_str(), addr);
407  if (sts == -1)
408  {
409  throw art::Exception(art::errors::Configuration) << "RoutingManagerCore: Unable to determine the multicast interface address from the routing_manager_address parameter value of " << multicast_out_hostname_ << std::endl; // NOLINT(cert-err60-cpp)
410  exit(1);
411  }
412  char addr_str[INET_ADDRSTRLEN];
413  inet_ntop(AF_INET, &(addr), addr_str, INET_ADDRSTRLEN);
414  TLOG(TLVL_INFO) << "Successfully determined the multicast interface address for " << multicast_out_hostname_ << ": " << addr_str << " (RoutingManager sending table updates to BoardReaders)";
415 
416  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
417  {
418  TLOG(TLVL_ERROR) << "RoutingManagerCore: Unable to enable port reuse on table update socket";
419  throw art::Exception(art::errors::Configuration) << "RoutingManagerCore: Unable to enable port reuse on table update socket" << std::endl; // NOLINT(cert-err60-cpp)
420  exit(1);
421  }
422 
423  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
424  {
425  TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket";
426  exit(1);
427  }
428  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
429  {
430  TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno;
431  exit(1);
432  }
433  }
434  if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, &yes, sizeof(yes)) == -1)
435  {
436  TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno;
437  exit(1);
438  }
439  }
440 
441  // Reconnect ack socket, if necessary
442  if (ack_socket_ == -1)
443  {
444  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
445  if (ack_socket_ < 0)
446  {
447  throw art::Exception(art::errors::Configuration) << "RoutingManagerCore: Error creating socket for receiving table update acks!" << std::endl; // NOLINT(cert-err60-cpp)
448  exit(1);
449  }
450 
451  struct sockaddr_in si_me_request;
452 
453  auto yes = 1;
454  if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
455  {
456  TLOG(TLVL_ERROR) << "RoutingManagerCore: Unable to enable port reuse on ack socket. errno=" << errno;
457  throw art::Exception(art::errors::Configuration) << "RoutingManagerCore: Unable to enable port reuse on ack socket" << std::endl; // NOLINT(cert-err60-cpp)
458  exit(1);
459  }
460 
461  // 10-Apr-2019, KAB: debugging information about the size of the receive buffer
462  int sts;
463  int len = 0;
464  socklen_t arglen = sizeof(len);
465  sts = getsockopt(ack_socket_, SOL_SOCKET, SO_RCVBUF, &len, &arglen);
466  TLOG(TLVL_INFO) << "ACK RCVBUF initial: " << len << " sts/errno=" << sts << "/" << errno << " arglen=" << arglen;
467 
468  memset(&si_me_request, 0, sizeof(si_me_request));
469  si_me_request.sin_family = AF_INET;
470  si_me_request.sin_port = htons(receive_acks_port_);
471  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
472  if (bind(ack_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1) // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
473  {
474  TLOG(TLVL_ERROR) << "RoutingManagerCore: Cannot bind request socket to port " << receive_acks_port_ << ", errno=" << errno;
475  throw art::Exception(art::errors::Configuration) << "RoutingManagerCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl; // NOLINT(cert-err60-cpp)
476  exit(1);
477  }
478  TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_;
479  }
480 
481  auto acks = std::unordered_map<int, bool>();
482  for (auto& r : active_ranks_)
483  {
484  acks[r] = false;
485  }
486  auto counter = 0U;
487  auto start_time = std::chrono::steady_clock::now();
488  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; }) > 0 && !stop_requested_)
489  {
490  // Send table update
491  auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
492  auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
493 
494  // 10-Apr-2019, KAB: added information on which senders have already acknowledged this update
495  for (auto& ack : acks)
496  {
497  TLOG(27) << "Table update already acknowledged? rank " << ack.first << " is " << ack.second
498  << " (size of 'already_acknowledged_ranks bitset is " << (8 * sizeof(header.already_acknowledged_ranks)) << ")";
499  if (ack.first < static_cast<int>(8 * sizeof(header.already_acknowledged_ranks)))
500  {
501  if (ack.second) { header.already_acknowledged_ranks.set(ack.first); }
502  }
503  }
504 
505  assert(packetSize + sizeof(header) < MAX_ROUTING_TABLE_SIZE);
506  std::vector<uint8_t> buffer(packetSize + sizeof(header));
507  memcpy(&buffer[0], &header, sizeof(detail::RoutingPacketHeader));
508  memcpy(&buffer[sizeof(detail::RoutingPacketHeader)], &packet[0], packetSize);
509 
510  TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << ", outgoing interface " << multicast_out_hostname_;
511  TRACE(16, "headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((unsigned long*)&header)[0], ((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0], ((unsigned long*)&packet[0])[1]); // NOLINT
512  auto sts = sendto(table_socket_, &buffer[0], buffer.size(), 0, reinterpret_cast<struct sockaddr*>(&send_tables_addr_), sizeof(send_tables_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
513  if (sts != static_cast<ssize_t>(buffer.size()))
514  {
515  TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts;
516  }
517 
518  // Collect acks
519 
520  auto first = packet[0].sequence_id;
521  auto last = packet.rbegin()->sequence_id;
522  TLOG(TLVL_DEBUG) << "Sent " << sts << " bytes. Expecting acks to have first= " << first << ", and last= " << last;
523 
524  auto startTime = std::chrono::steady_clock::now();
525  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; }) > 0)
526  {
527  auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
528  if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
529  {
530  if (++counter > max_ack_cycle_count_ && table_update_count_ > 0)
531  {
532  TLOG(TLVL_WARNING) << "Did not receive acks from all senders after resending table " << counter
533  << " times during the table_update_interval. Check the status of the senders!";
534  }
535  else
536  {
537  TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update";
538  }
539 
540  if (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; }) <= 3)
541  {
542  auto ackIter = acks.begin();
543  while (ackIter != acks.end())
544  {
545  if (!ackIter->second)
546  {
547  TLOG(TLVL_TRACE) << "Did not receive ack from rank " << ackIter->first;
548  }
549  ++ackIter;
550  }
551  }
552  break;
553  }
554 
555  TLOG(20) << "send_event_table: Polling Request socket for new requests";
556  auto ready = true;
557  while (ready)
558  {
560  if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, nullptr, nullptr) < 0)
561  {
562  if (errno == EWOULDBLOCK || errno == EAGAIN)
563  {
564  TLOG(20) << "send_event_table: No more ack datagrams on ack socket.";
565  ready = false;
566  }
567  else
568  {
569  TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive";
570  exit(2);
571  }
572  }
573  else
574  {
575  TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id
576  << " and last= " << buffer.last_sequence_id << ", packet_size=" << sizeof(detail::RoutingAckPacket);
577  if ((acks.count(buffer.rank) != 0u) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
578  {
579  TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << ".";
580  acks[buffer.rank] = true;
581  TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; })
582  << " acks outstanding";
583  }
584  else if ((acks.count(buffer.rank) != 0u) && detail::RoutingAckPacket::isEndOfDataRoutingAckPacket(buffer))
585  {
586  TLOG(TLVL_INFO) << "Received table update acknowledgement indicating end-of-data from rank " << buffer.rank << ".";
587  acks[buffer.rank] = true;
588  active_ranks_.erase(buffer.rank);
589  }
590  else
591  {
592  if (acks.count(buffer.rank) == 0u)
593  {
594  TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
595  << " Cross-talk between RoutingManagers means there's a configuration error!";
596  }
597  else
598  {
599  TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
600  << " that had incorrect sequence ID information. Discarding."
601  << " Expected first/last=" << first << "/" << last
602  << " recvd=" << buffer.first_sequence_id << "/" << buffer.last_sequence_id;
603  }
604  }
605  }
606  }
607  usleep(table_ack_wait_time_ms * 1000 / 10);
608  }
609  }
610 
611  if (metricMan)
612  {
613  artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
614  metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
615  }
616 }
617 
618 std::string artdaq::RoutingManagerCore::report(std::string const& /*unused*/) const
619 {
620  std::string resultString;
621 
622  // if we haven't been able to come up with any report so far, say so
623  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run()) + ", table updates sent = " + std::to_string(table_update_count_) + ", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
624  return tmpString;
625 }
626 
627 std::string artdaq::RoutingManagerCore::buildStatisticsString_() const
628 {
629  std::ostringstream oss;
630  oss << app_name << " statistics:" << std::endl;
631 
632  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
633  if (mqPtr != nullptr)
634  {
635  artdaq::MonitoredQuantityStats stats;
636  mqPtr->getStats(stats);
637  oss << " Table Update statistics: "
638  << stats.recentSampleCount << " table updates sent at "
639  << stats.recentSampleRate << " table updates/sec, , monitor window = "
640  << stats.recentDuration << " sec" << std::endl;
641  oss << " Average times per table update: ";
642  if (stats.recentSampleRate > 0.0)
643  {
644  oss << " elapsed time = "
645  << (1.0 / stats.recentSampleRate) << " sec";
646  }
647  oss << ", avg table acknowledgement wait time = "
648  << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
649  }
650 
651  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
652  if (mqPtr != nullptr)
653  {
654  artdaq::MonitoredQuantityStats stats;
655  mqPtr->getStats(stats);
656  oss << " Received Token statistics: "
657  << stats.recentSampleCount << " tokens received at "
658  << stats.recentSampleRate << " tokens/sec, , monitor window = "
659  << stats.recentDuration << " sec" << std::endl;
660  oss << " Average times per token: ";
661  if (stats.recentSampleRate > 0.0)
662  {
663  oss << " elapsed time = "
664  << (1.0 / stats.recentSampleRate) << " sec";
665  }
666  oss << ", input token wait time = "
667  << mqPtr->getRecentValueSum() << " sec" << std::endl;
668  }
669 
670  return oss.str();
671 }
672 
673 void artdaq::RoutingManagerCore::sendMetrics_()
674 {
675  if (metricMan)
676  {
677  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
678  if (mqPtr != nullptr)
679  {
680  artdaq::MonitoredQuantityStats stats;
681  mqPtr->getStats(stats);
682  metricMan->sendMetric("Table Update Count", stats.fullSampleCount, "updates", 1, MetricMode::LastPoint);
683  metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
684  metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
685  }
686 
687  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
688  if (mqPtr != nullptr)
689  {
690  artdaq::MonitoredQuantityStats stats;
691  mqPtr->getStats(stats);
692  metricMan->sendMetric("Receiver Token Count", stats.fullSampleCount, "updates", 1, MetricMode::LastPoint);
693  metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
694  metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
695  }
696  }
697 }
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t e, uint64_t f)
Reinitializes the RoutingManagerCore.
This class manages MonitoredQuantity instances for the *Core classes.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
bool shutdown(uint64_t)
Shuts Down the RoutingManagerCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingManagerCore.
A row of the Routing Table.
Events should be routed by sequence ID (BR -> EB)
std::bitset< 1024 > already_acknowledged_ranks
Bitset of ranks which have already sent valid acknowledgements and therefore do not need to send agai...
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const std::string TABLE_UPDATES_STAT_KEY
Key for Table Update count MonitoredQuantity.
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:223
bool pause(uint64_t, uint64_t)
Pauses the RoutingManagerCore.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Events should be routed by send count (EB -> Agg)
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t e, uint64_t f)
Soft-Initializes the RoutingManagerCore.
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
bool stop(uint64_t, uint64_t)
Stops the RoutingManagerCore.
void process_event_table()
Main loop of the RoutingManagerCore. Determines when to send the next table update, asks the RoutingManagerPolicy for the table to send, and sends it.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
std::string report(std::string const &) const
Send a report on the current status of the RoutingManagerCore.
bool resume(uint64_t, uint64_t)
Resumes the RoutingManagerCore.
void send_event_table(detail::RoutingPacket packet)
Sends a detail::RoutingPacket to the table receivers.
RoutingManagerCore()
RoutingManagerCore Constructor.
std::shared_ptr< RoutingManagerPolicy > makeRoutingManagerPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingManagerPolicy plugin.