artdaq  v3_00_01
RoutingMasterCore.cc
1 #include <sys/un.h>
2 #include <sys/time.h>
3 #include <sys/epoll.h>
4 #include <arpa/inet.h>
5 #include <netdb.h>
6 #include <pthread.h>
7 #include <sched.h>
8 #include <algorithm>
9 
10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib/exception.h"
12 
13 #define TRACE_NAME "RoutingMasterCore" // include these 2 first -
14 #include "artdaq/DAQdata/Globals.hh" // to get tracemf.h before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
17 
18 #include "artdaq/Application/RoutingMasterCore.hh"
19 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
22 
23 const std::string artdaq::RoutingMasterCore::
24 TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
25 const std::string artdaq::RoutingMasterCore::
26 TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
27 
28 artdaq::RoutingMasterCore::RoutingMasterCore(int rank, std::string name) :
29  name_(name)
30  , received_token_counter_()
31  , shutdown_requested_(false)
32  , stop_requested_(false)
33  , pause_requested_(false)
34  , token_socket_(-1)
35  , table_socket_(-1)
36  , ack_socket_(-1)
37 {
38  TLOG_DEBUG(name_) << "Constructor" << TLOG_ENDL;
41  metricMan = &metricMan_;
42  my_rank = rank;
43 }
44 
46 {
47  TLOG_DEBUG(name_) << "Destructor" << TLOG_ENDL;
48  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
49 }
50 
51 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
52 {
53  TLOG_DEBUG(name_) << "initialize method called with "
54  << "ParameterSet = \"" << pset.to_string()
55  << "\"." << TLOG_ENDL;
56 
57  // pull out the relevant parts of the ParameterSet
58  fhicl::ParameterSet daq_pset;
59  try
60  {
61  daq_pset = pset.get<fhicl::ParameterSet>("daq");
62  }
63  catch (...)
64  {
65  TLOG_ERROR(name_)
66  << "Unable to find the DAQ parameters in the initialization "
67  << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
68  return false;
69  }
70  try
71  {
72  policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
73  }
74  catch (...)
75  {
76  TLOG_ERROR(name_)
77  << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
78  return false;
79  }
80 
81  // pull out the Metric part of the ParameterSet
82  fhicl::ParameterSet metric_pset;
83  try
84  {
85  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
86  }
87  catch (...) {} // OK if there's no metrics table defined in the FHiCL
88 
89  if (metric_pset.is_empty())
90  {
91  TLOG_INFO(name_) << "No metric plugins appear to be defined" << TLOG_ENDL;
92  }
93  try
94  {
95  metricMan_.initialize(metric_pset, name_);
96  }
97  catch (...)
98  {
99  ExceptionHandler(ExceptionHandlerRethrow::no,
100  "Error loading metrics in RoutingMasterCore::initialize()");
101  }
102 
103  // create the requested CommandableFragmentGenerator
104  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
105  if (policy_plugin_spec.length() == 0)
106  {
107  TLOG_ERROR(name_)
108  << "No fragment generator (parameter name = \"policy\") was "
109  << "specified in the policy ParameterSet. The "
110  << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." << TLOG_ENDL;
111  return false;
112  }
113  try
114  {
115  policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
116  }
117  catch (...)
118  {
119  std::stringstream exception_string;
120  exception_string << "Exception thrown during initialization of policy of type \""
121  << policy_plugin_spec << "\"";
122 
123  ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
124 
125  TLOG_DEBUG(name_) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() << TLOG_ENDL;
126 
127  return false;
128  }
129 
130  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
131  sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
132  num_receivers_ = policy_->GetReceiverCount();
133 
134  receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
135  receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
136 
137  auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
139  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
140  current_table_interval_ms_ = max_table_update_interval_ms_;
141  max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
142  receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
143  send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
144  receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
145  send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
146  receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
147 
148  // fetch the monitoring parameters and create the MonitoredQuantity instances
149  statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
150 
151  shutdown_requested_.store(false);
152  start_recieve_token_thread_();
153  return true;
154 }
155 
156 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
157 {
158  stop_requested_.store(false);
159  pause_requested_.store(false);
160 
161  statsHelper_.resetStatistics();
162  policy_->Reset();
163 
164  metricMan_.do_start();
165  run_id_ = id;
166  table_update_count_ = 0;
167  received_token_count_ = 0;
168 
169  TLOG_DEBUG(name_) << "Started run " << std::to_string(run_id_.run()) << TLOG_ENDL;
170  return true;
171 }
172 
173 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
174 {
175  TLOG_DEBUG(name_) << "Stopping run " << std::to_string(run_id_.run())
176  << " after " << std::to_string(table_update_count_) << " table updates."
177  << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
178  stop_requested_.store(true);
179  return true;
180 }
181 
182 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
183 {
184  TLOG_DEBUG(name_) << "Pausing run " << std::to_string(run_id_.run())
185  << " after " << table_update_count_ << " table updates."
186  << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
187  pause_requested_.store(true);
188  return true;
189 }
190 
191 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
192 {
193  TLOG_DEBUG(name_) << "Resuming run " << run_id_.run() << TLOG_ENDL;
194  policy_->Reset();
195  pause_requested_.store(false);
196  metricMan_.do_start();
197  return true;
198 }
199 
201 {
202  shutdown_requested_.store(true);
203  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
204  policy_.reset(nullptr);
205  metricMan_.shutdown();
206  return true;
207 }
208 
209 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
210 {
211  TLOG_DEBUG(name_) << "soft_initialize method called with "
212  << "ParameterSet = \"" << pset.to_string()
213  << "\"." << TLOG_ENDL;
214  return initialize(pset, e, f);
215 }
216 
217 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
218 {
219  TLOG_DEBUG(name_) << "reinitialize method called with "
220  << "ParameterSet = \"" << pset.to_string()
221  << "\"." << TLOG_ENDL;
222  return initialize(pset, e, f);
223 }
224 
226 {
227  if (rt_priority_ > 0)
228  {
229 #pragma GCC diagnostic push
230 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
231  sched_param s_param = {};
232  s_param.sched_priority = rt_priority_;
233  if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
234  TLOG_WARNING(name_) << "setting realtime priority failed" << TLOG_ENDL;
235 #pragma GCC diagnostic pop
236  }
237 
238  // try-catch block here?
239 
240  // how to turn RT PRI off?
241  if (rt_priority_ > 0)
242  {
243 #pragma GCC diagnostic push
244 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
245  sched_param s_param = {};
246  s_param.sched_priority = rt_priority_;
247  int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
248  if (status != 0)
249  {
250  TLOG_ERROR(name_)
251  << "Failed to set realtime priority to " << std::to_string(rt_priority_)
252  << ", return code = " << status << TLOG_ENDL;
253  }
254 #pragma GCC diagnostic pop
255  }
256 
257  //MPI_Barrier(local_group_comm_);
258 
259  TLOG_DEBUG(name_) << "Sending initial table." << TLOG_ENDL;
260  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
261  auto nextSendTime = startTime;
262  double delta_time;
263  while (true)
264  {
265  if (stop_requested_ || pause_requested_) { break; }
266  startTime = artdaq::MonitoredQuantity::getCurrentTime();
267 
268  if (startTime >= nextSendTime)
269  {
270  auto table = policy_->GetCurrentTable();
271  if (table.size() > 0)
272  {
273  send_event_table(table);
274  ++table_update_count_;
275  delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
276  statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
277  TLOG_ARB(16, name_) << "process_fragments TABLE_UPDATES_STAT_KEY=" << std::to_string(delta_time) << TLOG_ENDL;
278  }
279  else
280  {
281  TLOG_WARNING(name_) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" << TLOG_ENDL;
282  }
283  auto max_tokens = policy_->GetMaxNumberOfTokens();
284  if (max_tokens > 0)
285  {
286  auto frac = table.size() / static_cast<double>(max_tokens);
287  if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
288  if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
289  if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
290  if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
291  }
292  nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
293  TLOG_DEBUG(name_) << "current_table_interval_ms is now " << current_table_interval_ms_ << TLOG_ENDL;
294  }
295  else
296  {
297  usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
298  }
299  }
300 
301  metricMan_.do_stop();
302 
303  return table_update_count_;
304 }
305 
307 {
308  // Reconnect table socket, if necessary
309  if (table_socket_ == -1)
310  {
311  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
312  if (table_socket_ < 0)
313  {
314  TLOG_ERROR(name_) << "I failed to create the socket for sending Data Requests! Errno: " << std::to_string(errno) << TLOG_ENDL;
315  exit(1);
316  }
317  auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
318  if (sts == -1)
319  {
320  TLOG_ERROR(name_) << "Unable to resolve table_update_address" << TLOG_ENDL;
321  exit(1);
322  }
323 
324  auto yes = 1;
325  if (receive_address_ != "localhost")
326  {
327  TLOG_DEBUG(name_) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ << TLOG_ENDL;
328  struct in_addr addr;
329  sts = ResolveHost(receive_address_.c_str(), addr);
330  if (sts == -1)
331  {
332  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
333  }
334 
335  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
336  {
337  throw art::Exception(art::errors::Configuration) <<
338  "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
339  exit(1);
340  }
341 
342  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
343  {
344  TLOG_ERROR("RequestSender") << "Unable to enable multicast loopback on table socket" << TLOG_ENDL;
345  exit(1);
346  }
347  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
348  {
349  TLOG_ERROR(name_) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) << TLOG_ENDL;
350  exit(1);
351  }
352  }
353  if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
354  {
355  TLOG_ERROR(name_) << "Cannot set request socket to broadcast. Errno: " << std::to_string(errno) << TLOG_ENDL;
356  exit(1);
357  }
358  }
359 
360  // Reconnect ack socket, if necessary
361  if (ack_socket_ == -1)
362  {
363  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
364  if (ack_socket_ < 0)
365  {
366  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
367  exit(1);
368  }
369 
370  struct sockaddr_in si_me_request;
371 
372  auto yes = 1;
373  if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
374  {
375  throw art::Exception(art::errors::Configuration) <<
376  "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
377  exit(1);
378  }
379  memset(&si_me_request, 0, sizeof(si_me_request));
380  si_me_request.sin_family = AF_INET;
381  si_me_request.sin_port = htons(receive_acks_port_);
382  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
383  if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
384  {
385  throw art::Exception(art::errors::Configuration) <<
386  "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
387  exit(1);
388  }
389  TLOG_DEBUG(name_) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ << TLOG_ENDL;
390  }
391 
392  auto acks = std::unordered_map<int, bool>();
393  for (auto& r : sender_ranks_)
394  {
395  acks[r] = false;
396  }
397  auto counter = 0U;
398  auto start_time = std::chrono::steady_clock::now();
399  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
400  {
401  // Send table update
402  auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
403  auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
404 
405  TLOG_DEBUG(name_) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << TLOG_ENDL;
406  if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
407  {
408  TLOG_ERROR(name_) << "Error sending request message header" << TLOG_ENDL;
409  }
410  if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
411  {
412  TLOG_ERROR(name_) << "Error sending request message data" << TLOG_ENDL;
413  }
414 
415  // Collect acks
416 
417  auto first = packet[0].sequence_id;
418  auto last = packet.rbegin()->sequence_id;
419  TLOG_DEBUG(name_) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) << TLOG_ENDL;
420 
421 
422  auto startTime = std::chrono::steady_clock::now();
423  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
424  {
425  auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
426  if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
427  {
428  if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
429  {
430  TLOG_ERROR(name_) << "Did not receive acks from all senders after resending table " << std::to_string(counter)
431  << " times during the table_update_interval. Check the status of the senders!" << TLOG_ENDL;
432  break;
433  }
434  TLOG_WARNING(name_) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" << TLOG_ENDL;
435  break;
436  }
437 
438  TLOG_ARB(20, name_) << "send_event_table: Polling Request socket for new requests" << TLOG_ENDL;
439  auto ready = true;
440  while (ready)
441  {
443  if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
444  {
445  if (errno == EWOULDBLOCK || errno == EAGAIN)
446  {
447  TLOG_ARB(20, name_) << "send_event_table: No more ack datagrams on ack socket." << TLOG_ENDL;
448  ready = false;
449  }
450  else
451  {
452  TLOG_ERROR(name_) << "An unexpected error occurred during ack packet receive" << TLOG_ENDL;
453  exit(2);
454  }
455  }
456  else
457  {
458  TLOG_DEBUG(name_) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id)
459  << " and last= " << std::to_string(buffer.last_sequence_id) << TLOG_ENDL;
460  if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
461  {
462  TLOG_DEBUG(name_) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." << TLOG_ENDL;
463  acks[buffer.rank] = true;
464  TLOG_DEBUG(name_) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
465  << " acks outstanding" << TLOG_ENDL;
466  }
467  else
468  {
469  if (!acks.count(buffer.rank))
470  {
471  TLOG_ERROR(name_) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
472  << " Cross-talk between RoutingMasters means there's a configuration error!" << TLOG_ENDL;
473  }
474  else
475  {
476  TLOG_WARNING(name_) << "Received acknowledgement from rank " << buffer.rank
477  << " that had incorrect sequence ID information. Discarding." << TLOG_ENDL;
478  }
479  }
480  }
481  }
482  usleep(table_ack_wait_time_ms * 1000 / 10);
483  }
484  }
485  if (metricMan)
486  {
487  artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
488  metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
489  }
490 }
491 
492 void artdaq::RoutingMasterCore::receive_tokens_()
493 {
494  while (!shutdown_requested_)
495  {
496  TLOG_DEBUG(name_) << "Receive Token loop start" << TLOG_ENDL;
497  if (token_socket_ == -1)
498  {
499  TLOG_DEBUG(name_) << "Opening token listener socket" << TLOG_ENDL;
500  token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
501 
502  if (token_epoll_fd_ != -1) close(token_epoll_fd_);
503  struct epoll_event ev;
504  token_epoll_fd_ = epoll_create1(0);
505  ev.events = EPOLLIN | EPOLLPRI;
506  ev.data.fd = token_socket_;
507  if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
508  {
509  TLOG_ERROR(name_) << "Could not register listen socket to epoll fd" << TLOG_ENDL;
510  exit(3);
511  }
512  }
513  if (token_socket_ == -1 || token_epoll_fd_ == -1)
514  {
515  TLOG_DEBUG(name_) << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
516  return;
517  }
518 
519  auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
520  if (nfds == -1)
521  {
522  perror("epoll_wait");
523  exit(EXIT_FAILURE);
524  }
525 
526  TLOG_DEBUG(name_) << "Received " << std::to_string(nfds) << " events" << TLOG_ENDL;
527  for (auto n = 0; n < nfds; ++n)
528  {
529  if (receive_token_events_[n].data.fd == token_socket_)
530  {
531  TLOG_DEBUG(name_) << "Accepting new connection on token_socket" << TLOG_ENDL;
532  sockaddr_in addr;
533  socklen_t arglen = sizeof(addr);
534  auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
535 
536  if (conn_sock == -1)
537  {
538  perror("accept");
539  exit(EXIT_FAILURE);
540  }
541 
542  receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
543  struct epoll_event ev;
544  ev.events = EPOLLIN | EPOLLET;
545  ev.data.fd = conn_sock;
546  if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
547  {
548  perror("epoll_ctl: conn_sock");
549  exit(EXIT_FAILURE);
550  }
551  }
552  else
553  {
554  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
555  detail::RoutingToken buff;
556  auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
557  if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
558  {
559  TLOG_ERROR(name_) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << TLOG_ENDL;
560  }
561  else
562  {
563  TLOG_DEBUG(name_) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." << TLOG_ENDL;
564  received_token_count_ += buff.new_slots_free;
566  {
567  policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
568  }
569  else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
570  {
571  if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
572  received_token_counter_[buff.rank] += buff.new_slots_free;
573  TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." << TLOG_ENDL;
574  while (received_token_counter_[buff.rank] >= sender_ranks_.size())
575  {
576  TLOG_DEBUG(name_) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
577  << "... Sending token to policy" << TLOG_ENDL;
578  policy_->AddReceiverToken(buff.rank, 1);
579  received_token_counter_[buff.rank] -= sender_ranks_.size();
580  }
581  }
582  }
583  auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
584  statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
585 
586  }
587  }
588  }
589 }
590 
591 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
592 {
593  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
594  TLOG_INFO(name_) << "Starting Token Reception Thread" << TLOG_ENDL;
595  ev_token_receive_thread_ = boost::thread(&RoutingMasterCore::receive_tokens_, this);
596 }
597 
598 std::string artdaq::RoutingMasterCore::report(std::string const&) const
599 {
600  std::string resultString;
601 
602  // if we haven't been able to come up with any report so far, say so
603  auto tmpString = name_ + " run number = " + std::to_string(run_id_.run())
604  + ", table updates sent = " + std::to_string(table_update_count_)
605  + ", Receiver tokens received = " + std::to_string(received_token_count_);
606  return tmpString;
607 }
608 
609 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
610 {
611  std::ostringstream oss;
612  oss << name_ << " statistics:" << std::endl;
613 
614  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
615  if (mqPtr.get() != nullptr)
616  {
617  artdaq::MonitoredQuantityStats stats;
618  mqPtr->getStats(stats);
619  oss << " Table Update statistics: "
620  << stats.recentSampleCount << " table updates sent at "
621  << stats.recentSampleRate << " table updates/sec, , monitor window = "
622  << stats.recentDuration << " sec" << std::endl;
623  oss << " Average times per table update: ";
624  if (stats.recentSampleRate > 0.0)
625  {
626  oss << " elapsed time = "
627  << (1.0 / stats.recentSampleRate) << " sec";
628  }
629  oss << ", avg table acknowledgement wait time = "
630  << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
631  }
632 
633  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
634  if (mqPtr.get() != nullptr)
635  {
636  artdaq::MonitoredQuantityStats stats;
637  mqPtr->getStats(stats);
638  oss << " Received Token statistics: "
639  << stats.recentSampleCount << " tokens received at "
640  << stats.recentSampleRate << " tokens/sec, , monitor window = "
641  << stats.recentDuration << " sec" << std::endl;
642  oss << " Average times per token: ";
643  if (stats.recentSampleRate > 0.0)
644  {
645  oss << " elapsed time = "
646  << (1.0 / stats.recentSampleRate) << " sec";
647  }
648  oss << ", input token wait time = "
649  << mqPtr->getRecentValueSum() << " sec" << std::endl;
650  }
651 
652  return oss.str();
653 }
654 
655 void artdaq::RoutingMasterCore::sendMetrics_()
656 {
657  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
658  if (mqPtr.get() != nullptr)
659  {
660  artdaq::MonitoredQuantityStats stats;
661  mqPtr->getStats(stats);
662  metricMan_.sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
663  metricMan_.sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
664  metricMan_.sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
665  }
666 
667  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
668  if (mqPtr.get() != nullptr)
669  {
670  artdaq::MonitoredQuantityStats stats;
671  mqPtr->getStats(stats);
672  metricMan_.sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
673  metricMan_.sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
674  metricMan_.sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
675  }
676 }
bool resume(uint64_t, uint64_t)
Resumes the RoutingMasterCore.
size_t process_event_table()
Main loop of the RoutingMasterCore. Determines when to send the next table update, asks the RoutingMasterPolicy for the table to send, and sends it.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:27
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingMasterCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
A row of the Routing Table.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingMasterCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingMasterCore.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
RoutingMasterCore(int rank, std::string name)
RoutingMasterCore Constructor.
Events should be routed by sequence ID (BR -> EB)
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
The RoutingToken contains the magic bytes, the rank of the token sender, and the number of slots free...
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
bool pause(uint64_t, uint64_t)
Pauses the RoutingMasterCore.
The header of the Routing Table, containing the magic bytes and the number of entries.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool stop(uint64_t, uint64_t)
Stops the RoutingMasterCore.
int rank
The rank from which the RoutingAckPacket came.
std::unique_ptr< RoutingMasterPolicy > makeRoutingMasterPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingMasterPolicy plugin.
std::string report(std::string const &) const
Send a report on the current status of the RoutingMasterCore.
static const std::string TABLE_UPDATES_STAT_KEY
Key for Table Update count MonitoredQuantity.
bool shutdown(uint64_t)
Shuts Down the RoutingMasterCore.
void send_event_table(detail::RoutingPacket table)
Sends a detail::RoutingPacket to the table receivers.
Events should be routed by send count (EB -> Agg)