artdaq  v3_07_01
RoutingMasterCore.cc
1 #include <arpa/inet.h>
2 #include <netdb.h>
3 #include <pthread.h>
4 #include <sched.h>
5 #include <sys/epoll.h>
6 #include <sys/time.h>
7 #include <sys/un.h>
8 #include <algorithm>
9 
10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib_except/exception.h"
12 
13 #include "artdaq/DAQdata/Globals.hh" // include these 2 first to get tracemf.h -
14 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
17 
18 #include "artdaq/Application/RoutingMasterCore.hh"
21 #include "artdaq/RoutingPolicies/makeRoutingMasterPolicy.hh"
22 
23 const std::string artdaq::RoutingMasterCore::
24  TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
25 const std::string artdaq::RoutingMasterCore::
26  TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
27 
28 artdaq::RoutingMasterCore::RoutingMasterCore()
29  : shutdown_requested_(false)
30  , stop_requested_(true)
31  , pause_requested_(false)
32  , statsHelperPtr_(new artdaq::StatisticsHelper())
33  , table_socket_(-1)
34  , ack_socket_(-1)
35 {
36  TLOG(TLVL_DEBUG) << "Constructor";
37  statsHelperPtr_->addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
38  statsHelperPtr_->addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
39 }
40 
41 artdaq::RoutingMasterCore::~RoutingMasterCore()
42 {
43  TLOG(TLVL_DEBUG) << "Destructor";
44  artdaq::StatisticsCollection::getInstance().requestStop();
45  token_receiver_->stopTokenReception(true);
46 }
47 
48 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
49 {
50  TLOG(TLVL_DEBUG) << "initialize method called with "
51  << "ParameterSet = \"" << pset.to_string()
52  << "\".";
53 
54  // pull out the relevant parts of the ParameterSet
55  fhicl::ParameterSet daq_pset;
56  try
57  {
58  daq_pset = pset.get<fhicl::ParameterSet>("daq");
59  }
60  catch (...)
61  {
62  TLOG(TLVL_ERROR)
63  << "Unable to find the DAQ parameters in the initialization "
64  << "ParameterSet: \"" + pset.to_string() + "\".";
65  return false;
66  }
67 
68  if (daq_pset.has_key("rank"))
69  {
70  if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank)
71  {
72  TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
73  }
74  my_rank = daq_pset.get<int>("rank");
75  }
76  if (my_rank == -1)
77  {
78  TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting";
79  exit(1);
80  }
81 
82  try
83  {
84  policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
85  }
86  catch (...)
87  {
88  TLOG(TLVL_ERROR)
89  << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
90  return false;
91  }
92 
93  try
94  {
95  token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>("token_receiver");
96  }
97  catch (...)
98  {
99  TLOG(TLVL_ERROR)
100  << "Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
101  return false;
102  }
103 
104  // pull out the Metric part of the ParameterSet
105  fhicl::ParameterSet metric_pset;
106  try
107  {
108  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
109  }
110  catch (...)
111  {} // OK if there's no metrics table defined in the FHiCL
112 
113  if (metric_pset.is_empty())
114  {
115  TLOG(TLVL_INFO) << "No metric plugins appear to be defined";
116  }
117  try
118  {
119  metricMan->initialize(metric_pset, app_name);
120  }
121  catch (...)
122  {
123  ExceptionHandler(ExceptionHandlerRethrow::no,
124  "Error loading metrics in RoutingMasterCore::initialize()");
125  }
126 
127  // create the requested RoutingPolicy
128  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
129  if (policy_plugin_spec.length() == 0)
130  {
131  TLOG(TLVL_ERROR)
132  << "No policy plugin (parameter name = \"policy\") was "
133  << "specified in the policy ParameterSet. The "
134  << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\".";
135  return false;
136  }
137  try
138  {
139  policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
140  }
141  catch (...)
142  {
143  std::stringstream exception_string;
144  exception_string << "Exception thrown during initialization of policy of type \""
145  << policy_plugin_spec << "\"";
146 
147  ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
148 
149  TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
150 
151  return false;
152  }
153 
154  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
155  sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
156 
157  receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
158 
159  auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
160  routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
161  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
162  current_table_interval_ms_ = max_table_update_interval_ms_;
163  max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
164  send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
165  receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
166  send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
167  multicast_out_hostname_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
168 
169  // fetch the monitoring parameters and create the MonitoredQuantity instances
170  statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
171 
172  // create the requested TokenReceiver
173  token_receiver_.reset(new TokenReceiver(token_receiver_pset_, policy_, routing_mode_, sender_ranks_.size(), max_table_update_interval_ms_));
174  token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
175  token_receiver_->startTokenReception();
176  token_receiver_->pauseTokenReception();
177 
178  shutdown_requested_.store(false);
179  return true;
180 }
181 
182 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
183 {
184  run_id_ = id;
185  stop_requested_.store(false);
186  pause_requested_.store(false);
187 
188  statsHelperPtr_->resetStatistics();
189 
190  metricMan->do_start();
191  table_update_count_ = 0;
192  token_receiver_->setRunNumber(run_id_.run());
193  token_receiver_->resumeTokenReception();
194 
195  TLOG(TLVL_INFO) << "Started run " << run_id_.run();
196  return true;
197 }
198 
199 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
200 {
201  TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
202  << " after " << table_update_count_ << " table updates."
203  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
204  stop_requested_.store(true);
205  token_receiver_->pauseTokenReception();
206  run_id_ = art::RunID::flushRun();
207  return true;
208 }
209 
210 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
211 {
212  TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
213  << " after " << table_update_count_ << " table updates."
214  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
215  pause_requested_.store(true);
216  return true;
217 }
218 
219 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
220 {
221  TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run();
222  pause_requested_.store(false);
223  metricMan->do_start();
224  return true;
225 }
226 
227 bool artdaq::RoutingMasterCore::shutdown(uint64_t)
228 {
229  shutdown_requested_.store(true);
230  token_receiver_->stopTokenReception();
231  policy_.reset();
232  metricMan->shutdown();
233  return true;
234 }
235 
236 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
237 {
238  TLOG(TLVL_INFO) << "soft_initialize method called with "
239  << "ParameterSet = \"" << pset.to_string()
240  << "\".";
241  return initialize(pset, e, f);
242 }
243 
244 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
245 {
246  TLOG(TLVL_INFO) << "reinitialize method called with "
247  << "ParameterSet = \"" << pset.to_string()
248  << "\".";
249  return initialize(pset, e, f);
250 }
251 
252 void artdaq::RoutingMasterCore::process_event_table()
253 {
254  if (rt_priority_ > 0)
255  {
256 #pragma GCC diagnostic push
257 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
258  sched_param s_param = {};
259  s_param.sched_priority = rt_priority_;
260  if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
261  TLOG(TLVL_WARNING) << "setting realtime priority failed";
262 #pragma GCC diagnostic pop
263  }
264 
265  // try-catch block here?
266 
267  // how to turn RT PRI off?
268  if (rt_priority_ > 0)
269  {
270 #pragma GCC diagnostic push
271 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
272  sched_param s_param = {};
273  s_param.sched_priority = rt_priority_;
274  int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
275  if (status != 0)
276  {
277  TLOG(TLVL_ERROR)
278  << "Failed to set realtime priority to " << rt_priority_
279  << ", return code = " << status;
280  }
281 #pragma GCC diagnostic pop
282  }
283 
284  //MPI_Barrier(local_group_comm_);
285 
286  TLOG(TLVL_DEBUG) << "Sending initial table.";
287  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
288  auto nextSendTime = startTime;
289  double delta_time;
290  while (!stop_requested_ && !pause_requested_)
291  {
292  startTime = artdaq::MonitoredQuantity::getCurrentTime();
293 
294  if (startTime >= nextSendTime)
295  {
296  auto table = policy_->GetCurrentTable();
297  if (table.size() > 0)
298  {
299  send_event_table(table);
300  ++table_update_count_;
301  delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
302  statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
303  TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
304 
305  bool readyToReport = statsHelperPtr_->readyToReport();
306  if (readyToReport)
307  {
308  std::string statString = buildStatisticsString_();
309  TLOG(TLVL_INFO) << statString;
310  sendMetrics_();
311  }
312  }
313  else
314  {
315  TLOG(TLVL_TRACE) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! This most likely means that the receivers are not keeping up!";
316  }
317  auto max_tokens = policy_->GetMaxNumberOfTokens();
318  if (max_tokens > 0)
319  {
320  auto frac = table.size() / static_cast<double>(max_tokens);
321  if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
322  if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
323  if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
324  if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
325  }
326  nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
327  TLOG(TLVL_TRACE) << "current_table_interval_ms is now " << current_table_interval_ms_;
328  }
329  else
330  {
331  usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
332  }
333  }
334 
335  TLOG(TLVL_DEBUG) << "stop_requested_ is " << stop_requested_ << ", pause_requested_ is " << pause_requested_ << ", exiting process_event_table loop";
336  policy_->Reset();
337  metricMan->do_stop();
338 }
339 
340 void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
341 {
342  // Reconnect table socket, if necessary
343  if (table_socket_ == -1)
344  {
345  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
346  if (table_socket_ < 0)
347  {
348  TLOG(TLVL_ERROR) << "Failed to create the socket for sending table updates! Errno: " << errno;
349  exit(1);
350  }
351  auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
352  if (sts == -1)
353  {
354  TLOG(TLVL_ERROR) << "Unable to resolve table_update_address";
355  exit(1);
356  }
357 
358  auto yes = 1;
359  if (multicast_out_hostname_ != "localhost")
360  {
361  TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << multicast_out_hostname_;
362  struct in_addr addr;
363  sts = GetInterfaceForNetwork(multicast_out_hostname_.c_str(), addr);
364  if (sts == -1)
365  {
366  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to determine the multicast interface address from the routing_master_address parameter value of " << multicast_out_hostname_ << std::endl;
367  exit(1);
368  }
369  char addr_str[INET_ADDRSTRLEN];
370  inet_ntop(AF_INET, &(addr), addr_str, INET_ADDRSTRLEN);
371  TLOG(TLVL_INFO) << "Successfully determined the multicast interface address for " << multicast_out_hostname_ << ": " << addr_str << " (RoutingMaster sending table updates to BoardReaders)";
372 
373  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
374  {
375  TLOG(TLVL_ERROR) << "RoutingMasterCore: Unable to enable port reuse on table update socket";
376  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
377  exit(1);
378  }
379 
380  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
381  {
382  TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket";
383  exit(1);
384  }
385  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
386  {
387  TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno;
388  exit(1);
389  }
390  }
391  if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
392  {
393  TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno;
394  exit(1);
395  }
396  }
397 
398  // Reconnect ack socket, if necessary
399  if (ack_socket_ == -1)
400  {
401  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
402  if (ack_socket_ < 0)
403  {
404  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
405  exit(1);
406  }
407 
408  struct sockaddr_in si_me_request;
409 
410  auto yes = 1;
411  if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
412  {
413  TLOG(TLVL_ERROR) << "RoutingMasterCore: Unable to enable port reuse on ack socket. errno=" << errno;
414  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
415  exit(1);
416  }
417 
418  // 10-Apr-2019, KAB: debugging information about the size of the receive buffer
419  int sts;
420  int len = 0;
421  socklen_t arglen = sizeof(len);
422  sts = getsockopt(ack_socket_, SOL_SOCKET, SO_RCVBUF, &len, &arglen);
423  TLOG(TLVL_INFO) << "ACK RCVBUF initial: " << len << " sts/errno=" << sts << "/" << errno << " arglen=" << arglen;
424 
425  memset(&si_me_request, 0, sizeof(si_me_request));
426  si_me_request.sin_family = AF_INET;
427  si_me_request.sin_port = htons(receive_acks_port_);
428  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
429  if (bind(ack_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1)
430  {
431  TLOG(TLVL_ERROR) << "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << ", errno=" << errno;
432  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
433  exit(1);
434  }
435  TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_;
436  }
437 
438  auto acks = std::unordered_map<int, bool>();
439  for (auto& r : sender_ranks_)
440  {
441  acks[r] = false;
442  }
443  auto counter = 0U;
444  auto start_time = std::chrono::steady_clock::now();
445  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; }) > 0 && !stop_requested_)
446  {
447  // Send table update
448  auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
449  auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
450 
451  // 10-Apr-2019, KAB: added information on which senders have already acknowledged this update
452  for (auto ackIter = acks.begin(); ackIter != acks.end(); ++ackIter)
453  {
454  TLOG(27) << "Table update already acknowledged? rank " << ackIter->first << " is " << ackIter->second
455  << " (size of 'already_acknowledged_ranks' bitset is " << (8 * sizeof(header.already_acknowledged_ranks)) << ")";
456  if (ackIter->first < static_cast<int>(8 * sizeof(header.already_acknowledged_ranks)))
457  {
458  if (ackIter->second) { header.already_acknowledged_ranks.set(ackIter->first); }
459  }
460  }
461 
462  assert(packetSize + sizeof(header) < MAX_ROUTING_TABLE_SIZE);
463  std::vector<uint8_t> buffer(packetSize + sizeof(header));
464  memcpy(&buffer[0], &header, sizeof(detail::RoutingPacketHeader));
465  memcpy(&buffer[sizeof(detail::RoutingPacketHeader)], &packet[0], packetSize);
466 
467  TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << ", outgoing interface " << multicast_out_hostname_;
468  TRACE(16, "headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((unsigned long*)&header)[0], ((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0], ((unsigned long*)&packet[0])[1]);
469  auto sts = sendto(table_socket_, &buffer[0], buffer.size(), 0, reinterpret_cast<struct sockaddr*>(&send_tables_addr_), sizeof(send_tables_addr_));
470  if (sts != static_cast<ssize_t>(buffer.size()))
471  {
472  TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts;
473  }
474 
475  // Collect acks
476 
477  auto first = packet[0].sequence_id;
478  auto last = packet.rbegin()->sequence_id;
479  TLOG(TLVL_DEBUG) << "Sent " << sts << " bytes. Expecting acks to have first= " << first << ", and last= " << last;
480 
481  auto startTime = std::chrono::steady_clock::now();
482  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; }) > 0)
483  {
484  auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
485  if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
486  {
487  if (++counter > max_ack_cycle_count_ && table_update_count_ > 0)
488  {
489  TLOG(TLVL_WARNING) << "Did not receive acks from all senders after resending table " << counter
490  << " times during the table_update_interval. Check the status of the senders!";
491  }
492  else
493  {
494  TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update";
495  }
496 
497  if (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; }) <= 3)
498  {
499  auto ackIter = acks.begin();
500  while (ackIter != acks.end())
501  {
502  if (!ackIter->second)
503  {
504  TLOG(TLVL_TRACE) << "Did not receive ack from rank " << ackIter->first;
505  }
506  ++ackIter;
507  }
508  }
509  break;
510  }
511 
512  TLOG(20) << "send_event_table: Polling Request socket for new requests";
513  auto ready = true;
514  while (ready)
515  {
516  detail::RoutingAckPacket buffer;
517  if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
518  {
519  if (errno == EWOULDBLOCK || errno == EAGAIN)
520  {
521  TLOG(20) << "send_event_table: No more ack datagrams on ack socket.";
522  ready = false;
523  }
524  else
525  {
526  TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive";
527  exit(2);
528  }
529  }
530  else
531  {
532  TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id
533  << " and last= " << buffer.last_sequence_id << ", packet_size=" << sizeof(detail::RoutingAckPacket);
534  if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
535  {
536  TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << ".";
537  acks[buffer.rank] = true;
538  TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) { return !p.second; })
539  << " acks outstanding";
540  }
541  else
542  {
543  if (!acks.count(buffer.rank))
544  {
545  TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
546  << " Cross-talk between RoutingMasters means there's a configuration error!";
547  }
548  else
549  {
550  TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
551  << " that had incorrect sequence ID information. Discarding."
552  << " Expected first/last=" << first << "/" << last
553  << " recvd=" << buffer.first_sequence_id << "/" << buffer.last_sequence_id;
554  }
555  }
556  }
557  }
558  usleep(table_ack_wait_time_ms * 1000 / 10);
559  }
560  }
561  if (metricMan)
562  {
563  artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
564  metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
565  }
566 }
567 
568 std::string artdaq::RoutingMasterCore::report(std::string const&) const
569 {
570  std::string resultString;
571 
572  // if we haven't been able to come up with any report so far, say so
573  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run()) + ", table updates sent = " + std::to_string(table_update_count_) + ", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
574  return tmpString;
575 }
576 
577 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
578 {
579  std::ostringstream oss;
580  oss << app_name << " statistics:" << std::endl;
581 
582  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
583  if (mqPtr.get() != nullptr)
584  {
585  artdaq::MonitoredQuantityStats stats;
586  mqPtr->getStats(stats);
587  oss << " Table Update statistics: "
588  << stats.recentSampleCount << " table updates sent at "
589  << stats.recentSampleRate << " table updates/sec, monitor window = "
590  << stats.recentDuration << " sec" << std::endl;
591  oss << " Average times per table update: ";
592  if (stats.recentSampleRate > 0.0)
593  {
594  oss << " elapsed time = "
595  << (1.0 / stats.recentSampleRate) << " sec";
596  }
597  oss << ", avg table acknowledgement wait time = "
598  << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
599  }
600 
601  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
602  if (mqPtr.get() != nullptr)
603  {
604  artdaq::MonitoredQuantityStats stats;
605  mqPtr->getStats(stats);
606  oss << " Received Token statistics: "
607  << stats.recentSampleCount << " tokens received at "
608  << stats.recentSampleRate << " tokens/sec, monitor window = "
609  << stats.recentDuration << " sec" << std::endl;
610  oss << " Average times per token: ";
611  if (stats.recentSampleRate > 0.0)
612  {
613  oss << " elapsed time = "
614  << (1.0 / stats.recentSampleRate) << " sec";
615  }
616  oss << ", input token wait time = "
617  << mqPtr->getRecentValueSum() << " sec" << std::endl;
618  }
619 
620  return oss.str();
621 }
622 
623 void artdaq::RoutingMasterCore::sendMetrics_()
624 {
625  if (metricMan)
626  {
627  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
628  if (mqPtr.get() != nullptr)
629  {
630  artdaq::MonitoredQuantityStats stats;
631  mqPtr->getStats(stats);
632  metricMan->sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
633  metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
634  metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
635  }
636 
637  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
638  if (mqPtr.get() != nullptr)
639  {
640  artdaq::MonitoredQuantityStats stats;
641  mqPtr->getStats(stats);
642  metricMan->sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
643  metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
644  metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
645  }
646  }
647 }
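
For reference, the FHiCL block below sketches the parameters that initialize() reads from the "daq" table above. Parameter names and default values are taken directly from the code; the policy plugin name and the contents of the nested policy, token_receiver, and metrics tables are placeholders that depend on which plugins are in use.

daq: {
  rank: 0                             # optional here if the rank was given at startup
  sender_ranks: [1, 2]                # ranks of the table receivers (required)
  rt_priority: 0                      # >0 requests SCHED_RR at this priority
  senders_send_by_send_count: false   # false: route by sequence ID, true: route by send count
  table_update_interval_ms: 1000
  table_ack_retry_count: 5
  table_update_port: 35556
  table_acknowledge_port: 35557
  table_update_address: "227.128.12.28"
  routing_master_hostname: "localhost"

  policy: {
    policy: "RoundRobin"              # placeholder plugin name; loaded via makeRoutingMasterPolicy
    # ... plugin-specific parameters ...
  }
  token_receiver: {
    # ... TokenReceiver parameters ...
  }
  metrics: {
    # ... optional metric plugin configuration ...
  }
}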
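
In send_event_table() the table update is multicast to table_update_address:table_update_port and then retransmitted until every rank listed in sender_ranks has acknowledged it; an acknowledgement is a detail::RoutingAckPacket carrying the sender's rank plus the first and last sequence IDs of the table it received, sent as a UDP datagram to the RoutingMaster's table_acknowledge_port. The sender-side sketch below illustrates that reply under the assumption of a plain struct with exactly those three fields; the authoritative definition of RoutingAckPacket (field order, types, padding) lives in the artdaq detail headers, so this stand-in is illustrative only.

#include <arpa/inet.h>
#include <cstdint>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

// Illustrative stand-in for artdaq::detail::RoutingAckPacket (assumed layout).
struct AckPacketSketch
{
	int rank;                    // rank of the table receiver sending the ack
	uint64_t first_sequence_id;  // first sequence ID in the received table
	uint64_t last_sequence_id;   // last sequence ID in the received table
};

// Send one acknowledgement back to the RoutingMaster. master_ip and ack_port are
// assumed to match the routing_master_hostname / table_acknowledge_port configuration.
inline bool send_table_ack(const char* master_ip, int ack_port,
                           int my_rank, uint64_t first, uint64_t last)
{
	int s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	if (s < 0) return false;

	sockaddr_in dest{};
	dest.sin_family = AF_INET;
	dest.sin_port = htons(ack_port);
	if (inet_pton(AF_INET, master_ip, &dest.sin_addr) != 1)
	{
		close(s);
		return false;
	}

	AckPacketSketch ack{};
	ack.rank = my_rank;
	ack.first_sequence_id = first;
	ack.last_sequence_id = last;

	// One datagram per acknowledgement, mirroring the recvfrom() of
	// sizeof(detail::RoutingAckPacket) in send_event_table() above.
	auto sts = sendto(s, &ack, sizeof(ack), 0,
	                  reinterpret_cast<sockaddr*>(&dest), sizeof(dest));
	close(s);
	return sts == static_cast<ssize_t>(sizeof(ack));
}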