artdaq  v3_00_03
RoutingMasterCore.cc
#include <sys/un.h>
#include <sys/time.h>
#include <sys/epoll.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <pthread.h>
#include <sched.h>
#include <algorithm>

#include "canvas/Utilities/Exception.h"
#include "cetlib/exception.h"

#define TRACE_NAME "RoutingMasterCore" // include these 2 first -
#include "artdaq/DAQdata/Globals.hh" // to get tracemf.h before trace.h
#include "artdaq-core/Data/Fragment.hh"
#include "artdaq-core/Utilities/ExceptionHandler.hh"

#include "artdaq/Application/RoutingMasterCore.hh"
#include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"

const std::string artdaq::RoutingMasterCore::
  TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
const std::string artdaq::RoutingMasterCore::
  TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");

artdaq::RoutingMasterCore::RoutingMasterCore()
  : received_token_counter_()
  , shutdown_requested_(false)
  , stop_requested_(false)
  , pause_requested_(false)
  , token_socket_(-1)
  , table_socket_(-1)
  , ack_socket_(-1)
{
  TLOG_DEBUG(app_name) << "Constructor" << TLOG_ENDL;
  statsHelper_.addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
  statsHelper_.addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
  metricMan = &metricMan_;
}

artdaq::RoutingMasterCore::~RoutingMasterCore()
{
  TLOG_DEBUG(app_name) << "Destructor" << TLOG_ENDL;
  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
}

bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
{
  TLOG_DEBUG(app_name) << "initialize method called with "
    << "ParameterSet = \"" << pset.to_string()
    << "\"." << TLOG_ENDL;

  // pull out the relevant parts of the ParameterSet
  fhicl::ParameterSet daq_pset;
  try
  {
    daq_pset = pset.get<fhicl::ParameterSet>("daq");
  }
  catch (...)
  {
    TLOG_ERROR(app_name)
      << "Unable to find the DAQ parameters in the initialization "
      << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
    return false;
  }
  try
  {
    policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
  }
  catch (...)
  {
    TLOG_ERROR(app_name)
      << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
    return false;
  }

  // pull out the Metric part of the ParameterSet
  fhicl::ParameterSet metric_pset;
  try
  {
    metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
  }
  catch (...) {} // OK if there's no metrics table defined in the FHiCL

  if (metric_pset.is_empty())
  {
    TLOG_INFO(app_name) << "No metric plugins appear to be defined" << TLOG_ENDL;
  }
  try
  {
    metricMan_.initialize(metric_pset, app_name);
  }
  catch (...)
  {
    ExceptionHandler(ExceptionHandlerRethrow::no,
                     "Error loading metrics in RoutingMasterCore::initialize()");
  }

  // create the requested RoutingMasterPolicy plugin
  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
  if (policy_plugin_spec.length() == 0)
  {
    TLOG_ERROR(app_name)
      << "No policy plugin (parameter name = \"policy\") was "
      << "specified in the policy ParameterSet. The "
      << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." << TLOG_ENDL;
    return false;
  }
  try
  {
    policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
  }
  catch (...)
  {
    std::stringstream exception_string;
    exception_string << "Exception thrown during initialization of policy of type \""
      << policy_plugin_spec << "\"";

    ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());

    TLOG_DEBUG(app_name) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() << TLOG_ENDL;

    return false;
  }

  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
  sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
  num_receivers_ = policy_->GetReceiverCount();

  receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
  receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);

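  // Routing configuration: routing mode, table update cadence, ack retry budget,
  // and the ports/addresses used for token reception, table multicast, and acknowledgements.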
  auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
  routing_mode_ = mode ? detail::RoutingMasterMode::RouteBySendCount : detail::RoutingMasterMode::RouteBySequenceID;
  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
  current_table_interval_ms_ = max_table_update_interval_ms_;
  max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
  receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
  send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
  receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
  send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
  receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");

  // fetch the monitoring parameters and create the MonitoredQuantity instances
  statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);

  shutdown_requested_.store(false);
  start_recieve_token_thread_();
  return true;
}

bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
{
  stop_requested_.store(false);
  pause_requested_.store(false);

  statsHelper_.resetStatistics();
  policy_->Reset();

  metricMan_.do_start();
  run_id_ = id;
  table_update_count_ = 0;
  received_token_count_ = 0;

  TLOG_DEBUG(app_name) << "Started run " << std::to_string(run_id_.run()) << TLOG_ENDL;
  return true;
}

bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
{
  TLOG_DEBUG(app_name) << "Stopping run " << std::to_string(run_id_.run())
    << " after " << std::to_string(table_update_count_) << " table updates"
    << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
  stop_requested_.store(true);
  return true;
}

bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
{
  TLOG_DEBUG(app_name) << "Pausing run " << std::to_string(run_id_.run())
    << " after " << table_update_count_ << " table updates"
    << " and " << received_token_count_ << " received tokens." << TLOG_ENDL;
  pause_requested_.store(true);
  return true;
}

bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
{
  TLOG_DEBUG(app_name) << "Resuming run " << run_id_.run() << TLOG_ENDL;
  policy_->Reset();
  pause_requested_.store(false);
  metricMan_.do_start();
  return true;
}

bool artdaq::RoutingMasterCore::shutdown(uint64_t)
{
  shutdown_requested_.store(true);
  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
  policy_.reset(nullptr);
  metricMan_.shutdown();
  return true;
}

bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
{
  TLOG_DEBUG(app_name) << "soft_initialize method called with "
    << "ParameterSet = \"" << pset.to_string()
    << "\"." << TLOG_ENDL;
  return initialize(pset, e, f);
}

bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
{
  TLOG_DEBUG(app_name) << "reinitialize method called with "
    << "ParameterSet = \"" << pset.to_string()
    << "\"." << TLOG_ENDL;
  return initialize(pset, e, f);
}

size_t artdaq::RoutingMasterCore::process_event_table()
{
  if (rt_priority_ > 0)
  {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
    sched_param s_param = {};
    s_param.sched_priority = rt_priority_;
    if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
      TLOG_WARNING(app_name) << "setting realtime priority failed" << TLOG_ENDL;
#pragma GCC diagnostic pop
  }

  // try-catch block here?

  // how to turn RT PRI off?
  if (rt_priority_ > 0)
  {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
    sched_param s_param = {};
    s_param.sched_priority = rt_priority_;
    int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
    if (status != 0)
    {
      TLOG_ERROR(app_name)
        << "Failed to set realtime priority to " << std::to_string(rt_priority_)
        << ", return code = " << status << TLOG_ENDL;
    }
#pragma GCC diagnostic pop
  }

  //MPI_Barrier(local_group_comm_);

  TLOG_DEBUG(app_name) << "Sending initial table." << TLOG_ENDL;
  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
  auto nextSendTime = startTime;
  double delta_time;
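  // Main loop: whenever the next send time arrives, fetch the current table from the
  // policy and multicast it to the senders; otherwise sleep briefly. Exits on stop or pause.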
  while (true)
  {
    if (stop_requested_ || pause_requested_) { break; }
    startTime = artdaq::MonitoredQuantity::getCurrentTime();

    if (startTime >= nextSendTime)
    {
      auto table = policy_->GetCurrentTable();
      if (table.size() > 0)
      {
        send_event_table(table);
        ++table_update_count_;
        delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
        statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
        TLOG_ARB(16, app_name) << "process_fragments TABLE_UPDATES_STAT_KEY=" << std::to_string(delta_time) << TLOG_ENDL;
      }
      else
      {
        TLOG_WARNING(app_name) << "No tokens received in this update interval! This is most likely a Very Bad Thing!" << TLOG_ENDL;
      }
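      // Adapt the update cadence to token flow: shorten the interval when the last table
      // used more than 75% of the available tokens, lengthen it when it used less than half,
      // and clamp the result to [1 ms, max_table_update_interval_ms_].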
      auto max_tokens = policy_->GetMaxNumberOfTokens();
      if (max_tokens > 0)
      {
        auto frac = table.size() / static_cast<double>(max_tokens);
        if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
        if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
        if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
        if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
      }
      nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
      TLOG_DEBUG(app_name) << "current_table_interval_ms is now " << current_table_interval_ms_ << TLOG_ENDL;
    }
    else
    {
      usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
    }
  }

  metricMan_.do_stop();

  return table_update_count_;
}

void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
{
  // Reconnect table socket, if necessary
  if (table_socket_ == -1)
  {
    table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
    if (table_socket_ < 0)
    {
      TLOG_ERROR(app_name) << "Failed to create the socket for sending table updates! Errno: " << std::to_string(errno) << TLOG_ENDL;
      exit(1);
    }
    auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
    if (sts == -1)
    {
      TLOG_ERROR(app_name) << "Unable to resolve table_update_address" << TLOG_ENDL;
      exit(1);
    }

    auto yes = 1;
    if (receive_address_ != "localhost")
    {
      TLOG_DEBUG(app_name) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ << TLOG_ENDL;
      struct in_addr addr;
      sts = ResolveHost(receive_address_.c_str(), addr);
      if (sts == -1)
      {
        throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_hostname" << std::endl;
      }

      if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
      {
        throw art::Exception(art::errors::Configuration) <<
          "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
        exit(1);
      }

      if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
      {
        TLOG_ERROR(app_name) << "Unable to enable multicast loopback on table socket" << TLOG_ENDL;
        exit(1);
      }
      if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
      {
        TLOG_ERROR(app_name) << "Cannot set outgoing interface. Errno: " << std::to_string(errno) << TLOG_ENDL;
        exit(1);
      }
    }
    if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
    {
      TLOG_ERROR(app_name) << "Cannot set table update socket to broadcast. Errno: " << std::to_string(errno) << TLOG_ENDL;
      exit(1);
    }
  }

  // Reconnect ack socket, if necessary
  if (ack_socket_ == -1)
  {
    ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
    if (ack_socket_ < 0)
    {
      throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
      exit(1);
    }

    struct sockaddr_in si_me_request;

    auto yes = 1;
    if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
    {
      throw art::Exception(art::errors::Configuration) <<
        "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
      exit(1);
    }
    memset(&si_me_request, 0, sizeof(si_me_request));
    si_me_request.sin_family = AF_INET;
    si_me_request.sin_port = htons(receive_acks_port_);
    si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
    if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
    {
      throw art::Exception(art::errors::Configuration) <<
        "RoutingMasterCore: Cannot bind ack socket to port " << receive_acks_port_ << std::endl;
      exit(1);
    }
    TLOG_DEBUG(app_name) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ << TLOG_ENDL;
  }

  auto acks = std::unordered_map<int, bool>();
  for (auto& r : sender_ranks_)
  {
    acks[r] = false;
  }
  auto counter = 0U;
  auto start_time = std::chrono::steady_clock::now();
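  // Keep re-sending the table until every configured sender has acknowledged this update
  // (or the ack retry budget for this interval is exhausted).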
  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
  {
    // Send table update
    auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
    auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();

    TLOG_DEBUG(app_name) << "Sending table information for " << std::to_string(header.nEntries) << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ << TLOG_ENDL;
    if (sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
    {
      TLOG_ERROR(app_name) << "Error sending table update header" << TLOG_ENDL;
    }
    if (sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_)) < 0)
    {
      TLOG_ERROR(app_name) << "Error sending table update data" << TLOG_ENDL;
    }

    // Collect acks

    auto first = packet[0].sequence_id;
    auto last = packet.rbegin()->sequence_id;
    TLOG_DEBUG(app_name) << "Expecting acks to have first= " << std::to_string(first) << ", and last= " << std::to_string(last) << TLOG_ENDL;


    auto startTime = std::chrono::steady_clock::now();
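    // Poll the ack socket until the per-cycle ack timeout expires; on timeout,
    // fall back to the outer loop, which resends the table.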
    while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
    {
      auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
      if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
      {
        if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
        {
          TLOG_ERROR(app_name) << "Did not receive acks from all senders after resending table " << std::to_string(counter)
            << " times during the table_update_interval. Check the status of the senders!" << TLOG_ENDL;
          break;
        }
        TLOG_WARNING(app_name) << "Did not receive acks from all senders within the table_ack_wait_time. Resending table update" << TLOG_ENDL;
        break;
      }

      TLOG_ARB(20, app_name) << "send_event_table: Polling Request socket for new requests" << TLOG_ENDL;
      auto ready = true;
      while (ready)
      {
        detail::RoutingAckPacket buffer;
        if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
        {
          if (errno == EWOULDBLOCK || errno == EAGAIN)
          {
            TLOG_ARB(20, app_name) << "send_event_table: No more ack datagrams on ack socket." << TLOG_ENDL;
            ready = false;
          }
          else
          {
            TLOG_ERROR(app_name) << "An unexpected error occurred during ack packet receive" << TLOG_ENDL;
            exit(2);
          }
        }
        else
        {
          TLOG_DEBUG(app_name) << "Ack packet from rank " << buffer.rank << " has first= " << std::to_string(buffer.first_sequence_id)
            << " and last= " << std::to_string(buffer.last_sequence_id) << TLOG_ENDL;
          if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
          {
            TLOG_DEBUG(app_name) << "Received table update acknowledgement from sender with rank " << std::to_string(buffer.rank) << "." << TLOG_ENDL;
            acks[buffer.rank] = true;
            TLOG_DEBUG(app_name) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
              << " acks outstanding" << TLOG_ENDL;
          }
          else
          {
            if (!acks.count(buffer.rank))
            {
              TLOG_ERROR(app_name) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
                << " Cross-talk between RoutingMasters means there's a configuration error!" << TLOG_ENDL;
            }
            else
            {
              TLOG_WARNING(app_name) << "Received acknowledgement from rank " << buffer.rank
                << " that had incorrect sequence ID information. Discarding." << TLOG_ENDL;
            }
          }
        }
      }
      usleep(table_ack_wait_time_ms * 1000 / 10);
    }
  }
  if (metricMan)
  {
    artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
    metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
  }
}

void artdaq::RoutingMasterCore::receive_tokens_()
{
  while (!shutdown_requested_)
  {
    TLOG_DEBUG(app_name) << "Receive Token loop start" << TLOG_ENDL;
    if (token_socket_ == -1)
    {
      TLOG_DEBUG(app_name) << "Opening token listener socket" << TLOG_ENDL;
      token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));

      if (token_epoll_fd_ != -1) close(token_epoll_fd_);
      struct epoll_event ev;
      token_epoll_fd_ = epoll_create1(0);
      ev.events = EPOLLIN | EPOLLPRI;
      ev.data.fd = token_socket_;
      if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
      {
        TLOG_ERROR(app_name) << "Could not register listen socket to epoll fd" << TLOG_ENDL;
        exit(3);
      }
    }
    if (token_socket_ == -1 || token_epoll_fd_ == -1)
    {
      TLOG_DEBUG(app_name) << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
      return;
    }

    auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
    if (nfds == -1)
    {
      perror("epoll_wait");
      exit(EXIT_FAILURE);
    }

    TLOG_DEBUG(app_name) << "Received " << std::to_string(nfds) << " events" << TLOG_ENDL;
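    // A ready listen socket means a new token connection to accept; any other ready
    // file descriptor has a RoutingToken payload to read.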
    for (auto n = 0; n < nfds; ++n)
    {
      if (receive_token_events_[n].data.fd == token_socket_)
      {
        TLOG_DEBUG(app_name) << "Accepting new connection on token_socket" << TLOG_ENDL;
        sockaddr_in addr;
        socklen_t arglen = sizeof(addr);
        auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);

        if (conn_sock == -1)
        {
          perror("accept");
          exit(EXIT_FAILURE);
        }

        receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
        struct epoll_event ev;
        ev.events = EPOLLIN | EPOLLET;
        ev.data.fd = conn_sock;
        if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
        {
          perror("epoll_ctl: conn_sock");
          exit(EXIT_FAILURE);
        }
      }
      else
      {
        auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
        detail::RoutingToken buff;
        auto sts = read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken));
        if (sts != sizeof(detail::RoutingToken) || buff.header != TOKEN_MAGIC)
        {
          TLOG_ERROR(app_name) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << TLOG_ENDL;
        }
        else
        {
          TLOG_DEBUG(app_name) << "Received token from " << std::to_string(buff.rank) << " indicating " << buff.new_slots_free << " slots are free." << TLOG_ENDL;
          received_token_count_ += buff.new_slots_free;
          if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
          {
            policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
          }
          else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
          {
            if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
            received_token_counter_[buff.rank] += buff.new_slots_free;
            TLOG_DEBUG(app_name) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." << TLOG_ENDL;
            while (received_token_counter_[buff.rank] >= sender_ranks_.size())
            {
              TLOG_DEBUG(app_name) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
                << "... Sending token to policy" << TLOG_ENDL;
              policy_->AddReceiverToken(buff.rank, 1);
              received_token_counter_[buff.rank] -= sender_ranks_.size();
            }
          }
        }
        auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
        statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);

      }
    }
  }
}

void artdaq::RoutingMasterCore::start_recieve_token_thread_()
{
  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
  TLOG_INFO(app_name) << "Starting Token Reception Thread" << TLOG_ENDL;
  ev_token_receive_thread_ = boost::thread(&RoutingMasterCore::receive_tokens_, this);
}

std::string artdaq::RoutingMasterCore::report(std::string const&) const
{
  std::string resultString;

  // if we haven't been able to come up with any report so far, say so
  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
    + ", table updates sent = " + std::to_string(table_update_count_)
    + ", Receiver tokens received = " + std::to_string(received_token_count_);
  return tmpString;
}

std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
{
  std::ostringstream oss;
  oss << app_name << " statistics:" << std::endl;

  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
  if (mqPtr.get() != nullptr)
  {
    artdaq::MonitoredQuantityStats stats;
    mqPtr->getStats(stats);
    oss << " Table Update statistics: "
      << stats.recentSampleCount << " table updates sent at "
      << stats.recentSampleRate << " table updates/sec, monitor window = "
      << stats.recentDuration << " sec" << std::endl;
    oss << " Average times per table update: ";
    if (stats.recentSampleRate > 0.0)
    {
      oss << " elapsed time = "
        << (1.0 / stats.recentSampleRate) << " sec";
    }
    oss << ", avg table acknowledgement wait time = "
      << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
  }

  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
  if (mqPtr.get() != nullptr)
  {
    artdaq::MonitoredQuantityStats stats;
    mqPtr->getStats(stats);
    oss << " Received Token statistics: "
      << stats.recentSampleCount << " tokens received at "
      << stats.recentSampleRate << " tokens/sec, monitor window = "
      << stats.recentDuration << " sec" << std::endl;
    oss << " Average times per token: ";
    if (stats.recentSampleRate > 0.0)
    {
      oss << " elapsed time = "
        << (1.0 / stats.recentSampleRate) << " sec";
    }
    oss << ", input token wait time = "
      << mqPtr->getRecentValueSum() << " sec" << std::endl;
  }

  return oss.str();
}

void artdaq::RoutingMasterCore::sendMetrics_()
{
  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
  if (mqPtr.get() != nullptr)
  {
    artdaq::MonitoredQuantityStats stats;
    mqPtr->getStats(stats);
    metricMan_.sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
    metricMan_.sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
    metricMan_.sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
  }

  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
  if (mqPtr.get() != nullptr)
  {
    artdaq::MonitoredQuantityStats stats;
    mqPtr->getStats(stats);
    metricMan_.sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::Accumulate);
    metricMan_.sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
    metricMan_.sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
  }
}