artdaq  v3_03_00
RoutingMasterCore.cc
1 #include <sys/un.h>
2 #include <sys/time.h>
3 #include <sys/epoll.h>
4 #include <arpa/inet.h>
5 #include <netdb.h>
6 #include <pthread.h>
7 #include <sched.h>
8 #include <algorithm>
9 
10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib_except/exception.h"
12 
13 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // include these 2 first -
14 #include "artdaq/DAQdata/Globals.hh" // to get tracemf.h before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
17 
18 #include "artdaq/Application/RoutingMasterCore.hh"
19 #include "artdaq/Application/Routing/makeRoutingMasterPolicy.hh"
22 
// Statistics key used for the MonitoredQuantity that tracks table-update send times.
const std::string artdaq::RoutingMasterCore::
TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
// Statistics key used for the MonitoredQuantity that tracks routing-token reception.
const std::string artdaq::RoutingMasterCore::
TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
27 
// NOTE(review): the constructor signature line is missing from this view
// (extraction artifact); this is the member-initializer list and body of the
// RoutingMasterCore constructor. All sockets start invalid (-1) and all
// control flags start false.
 : received_token_counter_()
 , shutdown_requested_(false)
 , stop_requested_(false)
 , pause_requested_(false)
 , token_socket_(-1)
 , table_socket_(-1)
 , ack_socket_(-1)
{
	TLOG(TLVL_DEBUG) << "Constructor" ;
}
41 
// NOTE(review): the destructor signature line is missing from this view
// (extraction artifact). The body joins the token-reception thread so the
// object is not destroyed while that thread still runs.
{
	TLOG(TLVL_DEBUG) << "Destructor" ;
	if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
}
47 
/**
 * \brief Process an initialize request.
 *
 * Parses the "daq" table of the given ParameterSet, determines the rank,
 * loads the routing policy plugin, configures metrics, reads socket/timing
 * parameters, and starts the token-reception thread.
 * \param pset ParameterSet from the initialize transition
 * \return false when required configuration is missing or the policy plugin
 *         cannot be loaded; true on success. Calls exit(1) if no rank is
 *         available from either the command line or the configuration.
 */
bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
{
	TLOG(TLVL_DEBUG) << "initialize method called with "
		<< "ParameterSet = \"" << pset.to_string()
		<< "\"." ;

	// pull out the relevant parts of the ParameterSet
	fhicl::ParameterSet daq_pset;
	try
	{
		daq_pset = pset.get<fhicl::ParameterSet>("daq");
	}
	catch (...)
	{
		TLOG(TLVL_ERROR)
			<< "Unable to find the DAQ parameters in the initialization "
			<< "ParameterSet: \"" + pset.to_string() + "\"." ;
		return false;
	}

	// The rank may come from the command line (my_rank) or from the
	// configuration; the configured value wins if they disagree.
	if (daq_pset.has_key("rank"))
	{
		if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank) {
			TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
		}
		my_rank = daq_pset.get<int>("rank");
	}
	if (my_rank == -1)
	{
		TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting";
		exit(1);
	}

	try
	{
		policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
	}
	catch (...)
	{
		TLOG(TLVL_ERROR)
			<< "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." ;
		return false;
	}

	// pull out the Metric part of the ParameterSet
	fhicl::ParameterSet metric_pset;
	try
	{
		metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
	}
	catch (...) {} // OK if there's no metrics table defined in the FHiCL

	if (metric_pset.is_empty())
	{
		TLOG(TLVL_INFO) << "No metric plugins appear to be defined" ;
	}
	try
	{
		metricMan->initialize(metric_pset, app_name);
	}
	catch (...)
	{
		// Metric failures are non-fatal; log and continue.
		ExceptionHandler(ExceptionHandlerRethrow::no,
			"Error loading metrics in RoutingMasterCore::initialize()");
	}

	// create the requested CommandableFragmentGenerator
	// NOTE(review): the comment above looks stale -- what is actually created
	// below is the routing policy plugin, not a fragment generator.
	auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
	if (policy_plugin_spec.length() == 0)
	{
		TLOG(TLVL_ERROR)
			<< "No fragment generator (parameter name = \"policy\") was "
			<< "specified in the policy ParameterSet. The "
			<< "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." ;
		return false;
	}
	try
	{
		policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
	}
	catch (...)
	{
		std::stringstream exception_string;
		exception_string << "Exception thrown during initialization of policy of type \""
			<< policy_plugin_spec << "\"";

		ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());

		TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() ;

		return false;
	}

	rt_priority_ = daq_pset.get<int>("rt_priority", 0);
	sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
	num_receivers_ = policy_->GetReceiverCount();

	// Pre-size the epoll event buffers: one slot per sender for acks, and one
	// per receiver plus the listen socket for tokens.
	receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
	receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);

	// NOTE(review): a line is missing from this view immediately after the
	// next statement (extraction artifact) -- presumably routing_mode_ is set
	// from 'mode' there. TODO confirm against the repository copy.
	auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
	max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
	current_table_interval_ms_ = max_table_update_interval_ms_;
	max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
	receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
	send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
	receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
	send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
	receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");

	// fetch the monitoring parameters and create the MonitoredQuantity instances
	statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);

	shutdown_requested_.store(false);
	start_recieve_token_thread_();
	return true;
}
166 
167 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
168 {
169  stop_requested_.store(false);
170  pause_requested_.store(false);
171 
172  statsHelper_.resetStatistics();
173  policy_->Reset();
174 
175  metricMan->do_start();
176  run_id_ = id;
177  table_update_count_ = 0;
178  received_token_count_ = 0;
179 
180  TLOG(TLVL_INFO) << "Started run " << run_id_.run() ;
181  return true;
182 }
183 
184 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
185 {
186  TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
187  << " after " << table_update_count_ << " table updates."
188  << " and " << received_token_count_ << " received tokens." ;
189  stop_requested_.store(true);
190  return true;
191 }
192 
193 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
194 {
195  TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
196  << " after " << table_update_count_ << " table updates."
197  << " and " << received_token_count_ << " received tokens." ;
198  pause_requested_.store(true);
199  return true;
200 }
201 
202 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
203 {
204  TLOG(TLVL_INFO) << "Resuming run " << run_id_.run() ;
205  policy_->Reset();
206  pause_requested_.store(false);
207  metricMan->do_start();
208  return true;
209 }
210 
// NOTE(review): the function signature line is missing from this view
// (extraction artifact); per the class documentation this is
// bool artdaq::RoutingMasterCore::shutdown(uint64_t).
{
	// Tell the token-reception thread to exit and wait for it to finish
	// before tearing down the policy it uses.
	shutdown_requested_.store(true);
	if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
	policy_.reset(nullptr);
	metricMan->shutdown();
	return true;
}
219 
220 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
221 {
222  TLOG(TLVL_INFO) << "soft_initialize method called with "
223  << "ParameterSet = \"" << pset.to_string()
224  << "\"." ;
225  return initialize(pset, e, f);
226 }
227 
228 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
229 {
230  TLOG(TLVL_INFO) << "reinitialize method called with "
231  << "ParameterSet = \"" << pset.to_string()
232  << "\"." ;
233  return initialize(pset, e, f);
234 }
235 
// NOTE(review): the function signature line is missing from this view
// (extraction artifact); per the class documentation this is
// void artdaq::RoutingMasterCore::process_event_table() -- the main loop of
// the RoutingMasterCore: it decides when to send the next table update, asks
// the policy for the table, and sends it.
{
	// Request real-time (SCHED_RR) scheduling, if configured.
	if (rt_priority_ > 0)
	{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
		sched_param s_param = {};
		s_param.sched_priority = rt_priority_;
		if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
			TLOG(TLVL_WARNING) << "setting realtime priority failed" ;
#pragma GCC diagnostic pop
	}

	// try-catch block here?

	// how to turn RT PRI off?
	// NOTE(review): this second block repeats the pthread_setschedparam call
	// above (this time with error logging) -- the duplication looks
	// unintentional; verify against the repository copy.
	if (rt_priority_ > 0)
	{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
		sched_param s_param = {};
		s_param.sched_priority = rt_priority_;
		int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
		if (status != 0)
		{
			TLOG(TLVL_ERROR)
				<< "Failed to set realtime priority to " << rt_priority_
				<< ", return code = " << status ;
		}
#pragma GCC diagnostic pop
	}

	//MPI_Barrier(local_group_comm_);

	TLOG(TLVL_DEBUG) << "Sending initial table." ;
	auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
	auto nextSendTime = startTime;
	double delta_time;
	// Loop until a stop or pause transition is requested.
	while (!stop_requested_ && !pause_requested_)
	{
		startTime = artdaq::MonitoredQuantity::getCurrentTime();

		if (startTime >= nextSendTime)
		{
			auto table = policy_->GetCurrentTable();
			if (table.size() > 0)
			{
				send_event_table(table);
				++table_update_count_;
				delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
				statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
				TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time ;
			}
			else
			{
				TLOG(TLVL_DEBUG) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! This most likely means that the receivers are not keeping up!" ;
			}
			// Adaptively tune the update interval: shrink it by 10% when the
			// table is mostly full (>75% of max tokens), grow it by 10% when
			// mostly empty (<50%), clamped to [1 ms, max_table_update_interval_ms_].
			auto max_tokens = policy_->GetMaxNumberOfTokens();
			if (max_tokens > 0)
			{
				auto frac = table.size() / static_cast<double>(max_tokens);
				if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
				if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
				if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
				if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
			}
			nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
			TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_ ;
		}
		else
		{
			usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
		}
	}

	metricMan->do_stop();
}
313 
// NOTE(review): the function signature line is missing from this view
// (extraction artifact); per the class documentation this is
// void artdaq::RoutingMasterCore::send_event_table(detail::RoutingPacket packet)
// (the body refers to a parameter named 'packet'). It multicasts a routing
// table header+payload and then collects acknowledgements from all sender
// ranks, resending the table on timeout.
{
	// Reconnect table socket, if necessary
	if (table_socket_ == -1)
	{
		table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
		if (table_socket_ < 0)
		{
			TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! Errno: " << errno ;
			exit(1);
		}
		auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
		if (sts == -1)
		{
			TLOG(TLVL_ERROR) << "Unable to resolve table_update_address" ;
			exit(1);
		}

		auto yes = 1;
		if (receive_address_ != "localhost")
		{
			// Pin the multicast output interface to the configured hostname so
			// table updates leave via the correct NIC on multi-homed machines.
			TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ ;
			struct in_addr addr;
			sts = ResolveHost(receive_address_.c_str(), addr);
			if (sts == -1)
			{
				throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
			}

			if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
			{
				throw art::Exception(art::errors::Configuration) <<
					"RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
				exit(1); // NOTE(review): unreachable -- the throw above leaves this scope
			}

			if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
			{
				TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket" ;
				exit(1);
			}
			if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
			{
				TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno ;
				exit(1);
			}
		}
		if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
		{
			TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno ;
			exit(1);
		}
	}

	// Reconnect ack socket, if necessary
	if (ack_socket_ == -1)
	{
		ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
		if (ack_socket_ < 0)
		{
			throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
			exit(1); // NOTE(review): unreachable -- the throw above leaves this scope
		}

		struct sockaddr_in si_me_request;

		auto yes = 1;
		if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		{
			throw art::Exception(art::errors::Configuration) <<
				"RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
			exit(1); // NOTE(review): unreachable -- the throw above leaves this scope
		}
		// Bind to INADDR_ANY on the configured ack port.
		memset(&si_me_request, 0, sizeof(si_me_request));
		si_me_request.sin_family = AF_INET;
		si_me_request.sin_port = htons(receive_acks_port_);
		si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
		if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
		{
			throw art::Exception(art::errors::Configuration) <<
				"RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
			exit(1); // NOTE(review): unreachable -- the throw above leaves this scope
		}
		TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ ;
	}

	// Track which sender ranks still owe an acknowledgement for this table.
	auto acks = std::unordered_map<int, bool>();
	for (auto& r : sender_ranks_)
	{
		acks[r] = false;
	}
	auto counter = 0U;
	auto start_time = std::chrono::steady_clock::now();
	// Outer loop: (re)send the table until every sender has acked or stop is requested.
	while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0 && !stop_requested_)
	{
		// Send table update
		auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
		auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();

		TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ ;
		TRACE(16,"headerData:0x%016lx%016lx packetData:0x%016lx%016lx"
			,((unsigned long*)&header)[0],((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0],((unsigned long*)&packet[0])[1] );
		auto hdrsts = sendto(table_socket_, &header, sizeof(detail::RoutingPacketHeader), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_));
		if (hdrsts != sizeof(detail::RoutingPacketHeader))
		{
			TLOG(TLVL_ERROR) << "Error sending routing message header. hdrsts=" << hdrsts;
		}
		auto pktsts = sendto(table_socket_, &packet[0], packetSize, 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_));
		if (pktsts != (ssize_t)packetSize)
		{
			TLOG(TLVL_ERROR) << "Error sending routing message data. hdrsts="<<hdrsts<<" pktsts="<<pktsts;
		}

		// Collect acks

		// Acks must echo the first and last sequence IDs of this table to count.
		auto first = packet[0].sequence_id;
		auto last = packet.rbegin()->sequence_id;
		TLOG(TLVL_DEBUG) << "Sent " << hdrsts <<"+"<< pktsts << ". Expecting acks to have first= " << first << ", and last= " << last ;


		auto startTime = std::chrono::steady_clock::now();
		// Inner loop: poll the ack socket until all acks arrive or the
		// per-cycle timeout expires (which triggers a resend).
		while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
		{
			auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
			if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
			{
				if (counter > max_ack_cycle_count_ && table_update_count_ > 0)
				{
					TLOG(TLVL_ERROR) << "Did not receive acks from all senders after resending table " << counter
						<< " times during the table_update_interval. Check the status of the senders!" ;
					break;
				}
				TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update" ;
				break;
			}

			TLOG(20) << "send_event_table: Polling Request socket for new requests" ;
			auto ready = true;
			while (ready)
			{
				// NOTE(review): the declaration of 'buffer' (presumably
				// detail::RoutingAckPacket buffer;) is missing from this view
				// at this point (extraction artifact). TODO confirm.
				if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
				{
					if (errno == EWOULDBLOCK || errno == EAGAIN)
					{
						TLOG(20) << "send_event_table: No more ack datagrams on ack socket." ;
						ready = false;
					}
					else
					{
						TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive" ;
						exit(2);
					}
				}
				else
				{
					TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id
						<< " and last= " << buffer.last_sequence_id ;
					if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
					{
						TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << "." ;
						acks[buffer.rank] = true;
						TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
							<< " acks outstanding" ;
					}
					else
					{
						if (!acks.count(buffer.rank))
						{
							TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
								<< " Cross-talk between RoutingMasters means there's a configuration error!" ;
						}
						else
						{
							TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
								<< " that had incorrect sequence ID information. Discarding." ;
						}
					}
				}
			}
			usleep(table_ack_wait_time_ms * 1000 / 10);
		}
	}
	if (metricMan)
	{
		artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
		metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
	}
}
503 
/**
 * \brief Token-reception thread body: accepts TCP connections from receivers,
 * reads RoutingToken messages from them, and forwards token counts to the
 * routing policy. Runs until shutdown_requested_ is set.
 */
void artdaq::RoutingMasterCore::receive_tokens_()
{
	while (!shutdown_requested_)
	{
		TLOG(TLVL_DEBUG) << "Receive Token loop start" ;
		// Lazily create the listen socket and the epoll instance that
		// multiplexes it with the per-receiver connections.
		if (token_socket_ == -1)
		{
			TLOG(TLVL_DEBUG) << "Opening token listener socket" ;
			token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
			fcntl(token_socket_, F_SETFL, O_NONBLOCK); // set O_NONBLOCK

			if (token_epoll_fd_ != -1) close(token_epoll_fd_);
			struct epoll_event ev;
			token_epoll_fd_ = epoll_create1(0);
			ev.events = EPOLLIN | EPOLLPRI;
			ev.data.fd = token_socket_;
			if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
			{
				TLOG(TLVL_ERROR) << "Could not register listen socket to epoll fd" ;
				exit(3);
			}
		}
		if (token_socket_ == -1 || token_epoll_fd_ == -1)
		{
			TLOG(TLVL_DEBUG) << "One of the listen sockets was not opened successfully." ;
			return;
		}

		// Wait up to one table interval for activity on any registered fd.
		auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
		if (nfds == -1)
		{
			perror("epoll_wait");
			exit(EXIT_FAILURE);
		}

		TLOG(TLVL_DEBUG) << "Received " << nfds << " events" ;
		for (auto n = 0; n < nfds; ++n)
		{
			if (receive_token_events_[n].data.fd == token_socket_)
			{
				// Event on the listen socket: accept a new receiver connection
				// and register it with epoll (edge-triggered, non-blocking).
				TLOG(TLVL_DEBUG) << "Accepting new connection on token_socket" ;
				sockaddr_in addr;
				socklen_t arglen = sizeof(addr);
				auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
				fcntl(conn_sock, F_SETFL, O_NONBLOCK); // set O_NONBLOCK

				if (conn_sock == -1)
				{
					perror("accept");
					exit(EXIT_FAILURE);
				}

				receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
				TLOG(TLVL_DEBUG) << "New fd is " << conn_sock << " for receiver at " << receive_token_addrs_[conn_sock];
				struct epoll_event ev;
				ev.events = EPOLLIN | EPOLLET;
				ev.data.fd = conn_sock;
				if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
				{
					perror("epoll_ctl: conn_sock");
					exit(EXIT_FAILURE);
				}
			}
			else
			{
				/*if (receive_token_events_[n].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP))
				{
				TLOG(TLVL_DEBUG) << "Closing connection on fd " << receive_token_events_[n].data.fd << " (" << receive_token_addrs_[receive_token_events_[n].data.fd] << ")";
				receive_token_addrs_.erase(receive_token_events_[n].data.fd);
				close(receive_token_events_[n].data.fd);
				epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL);
				continue;
				}*/

				// Data on a receiver connection: drain RoutingToken structs
				// (edge-triggered socket, so read until EAGAIN).
				auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
				bool reading = true;
				int sts = 0;
				while(reading)
				{
					detail::RoutingToken buff;
					// NOTE(review): 'sts' accumulates across partial reads but the
					// destination is always &buff (not &buff + sts), so a split
					// token would be reassembled incorrectly -- verify against the
					// repository copy.
					sts += read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken) - sts);
					if (sts == 0)
					{
						TLOG(TLVL_INFO) << "Received 0-size token from " << receive_token_addrs_[receive_token_events_[n].data.fd];
						reading = false;
					}
					else if(sts < 0 && errno == EAGAIN)
					{
						TLOG(TLVL_DEBUG) << "No more tokens from this rank. Continuing poll loop.";
						reading = false;
					}
					else if(sts < 0)
					{
						TLOG(TLVL_ERROR) << "Error reading from token socket: sts=" << sts << ", errno=" << errno;
						receive_token_addrs_.erase(receive_token_events_[n].data.fd);
						close(receive_token_events_[n].data.fd);
						epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL);
						reading = false;
					}
					else if (sts == sizeof(detail::RoutingToken) && buff.header != TOKEN_MAGIC)
					{
						TLOG(TLVL_ERROR) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << " sts=" << sts;
						reading = false;
					}
					else if(sts == sizeof(detail::RoutingToken))
					{
						sts = 0;
						TLOG(TLVL_DEBUG) << "Received token from " << buff.rank << " indicating " << buff.new_slots_free << " slots are free." ;
						received_token_count_ += buff.new_slots_free;
						// NOTE(review): a line is missing from this view here
						// (extraction artifact) -- presumably
						// "if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)"
						// to pair with the else-if on RouteBySendCount below.
						// TODO confirm against the repository copy.
						{
							policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
						}
						else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
						{
							// In send-count mode, each receiver slot requires one
							// token per sender; only forward a token to the policy
							// once a full set has accumulated for that rank.
							if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
							received_token_counter_[buff.rank] += buff.new_slots_free;
							TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." ;
							while (received_token_counter_[buff.rank] >= sender_ranks_.size())
							{
								TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
									<< "... Sending token to policy" ;
								policy_->AddReceiverToken(buff.rank, 1);
								received_token_counter_[buff.rank] -= sender_ranks_.size();
							}
						}
					}
				}
				// Record how long this batch of tokens took to process, and
				// emit the periodic statistics report when due.
				auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
				statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
				bool readyToReport = statsHelper_.readyToReport(delta_time);
				if (readyToReport)
				{
					std::string statString = buildStatisticsString_();
					TLOG(TLVL_INFO) << statString;
					sendMetrics_();
				}
			}
		}
	}
}
645 
646 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
647 {
648  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
649  boost::thread::attributes attrs;
650  attrs.set_stack_size(4096 * 2000); // 8000 KB
651 
652  TLOG(TLVL_INFO) << "Starting Token Reception Thread" ;
653  try {
654  ev_token_receive_thread_ = boost::thread(attrs, boost::bind(&RoutingMasterCore::receive_tokens_, this));
655  }
656  catch(boost::exception const& e)
657  {
658  std::cerr << "Exception encountered starting Token Reception thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
659  exit(3);
660  }
661  TLOG(TLVL_INFO) << "Started Token Reception Thread";
662 }
663 
664 std::string artdaq::RoutingMasterCore::report(std::string const&) const
665 {
666  std::string resultString;
667 
668  // if we haven't been able to come up with any report so far, say so
669  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
670  + ", table updates sent = " + std::to_string(table_update_count_)
671  + ", Receiver tokens received = " + std::to_string(received_token_count_);
672  return tmpString;
673 }
674 
675 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
676 {
677  std::ostringstream oss;
678  oss << app_name << " statistics:" << std::endl;
679 
680  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
681  if (mqPtr.get() != nullptr)
682  {
683  artdaq::MonitoredQuantityStats stats;
684  mqPtr->getStats(stats);
685  oss << " Table Update statistics: "
686  << stats.recentSampleCount << " table updates sent at "
687  << stats.recentSampleRate << " table updates/sec, , monitor window = "
688  << stats.recentDuration << " sec" << std::endl;
689  oss << " Average times per table update: ";
690  if (stats.recentSampleRate > 0.0)
691  {
692  oss << " elapsed time = "
693  << (1.0 / stats.recentSampleRate) << " sec";
694  }
695  oss << ", avg table acknowledgement wait time = "
696  << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
697  }
698 
699  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
700  if (mqPtr.get() != nullptr)
701  {
702  artdaq::MonitoredQuantityStats stats;
703  mqPtr->getStats(stats);
704  oss << " Received Token statistics: "
705  << stats.recentSampleCount << " tokens received at "
706  << stats.recentSampleRate << " tokens/sec, , monitor window = "
707  << stats.recentDuration << " sec" << std::endl;
708  oss << " Average times per token: ";
709  if (stats.recentSampleRate > 0.0)
710  {
711  oss << " elapsed time = "
712  << (1.0 / stats.recentSampleRate) << " sec";
713  }
714  oss << ", input token wait time = "
715  << mqPtr->getRecentValueSum() << " sec" << std::endl;
716  }
717 
718  return oss.str();
719 }
720 
721 void artdaq::RoutingMasterCore::sendMetrics_()
722 {
723  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
724  if (mqPtr.get() != nullptr)
725  {
726  artdaq::MonitoredQuantityStats stats;
727  mqPtr->getStats(stats);
728  metricMan->sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
729  metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
730  metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
731  }
732 
733  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
734  if (mqPtr.get() != nullptr)
735  {
736  artdaq::MonitoredQuantityStats stats;
737  mqPtr->getStats(stats);
738  metricMan->sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
739  metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
740  metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
741  }
742 }
bool resume(uint64_t, uint64_t)
Resumes the RoutingMasterCore.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingMasterCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
A row of the Routing Table.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingMasterCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingMasterCore.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
RoutingMasterCore()
RoutingMasterCore Constructor.
Events should be routed by sequence ID (BR -> EB)
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
The RoutingToken contains the magic bytes, the rank of the token sender, and the number of slots free...
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
bool pause(uint64_t, uint64_t)
Pauses the RoutingMasterCore.
The header of the Routing Table, containing the magic bytes and the number of entries.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool stop(uint64_t, uint64_t)
Stops the RoutingMasterCore.
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
std::unique_ptr< RoutingMasterPolicy > makeRoutingMasterPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingMasterPolicy plugin.
std::string report(std::string const &) const
Send a report on the current status of the RoutingMasterCore.
static const std::string TABLE_UPDATES_STAT_KEY
Key for Table Update count MonitoredQuantity.
bool shutdown(uint64_t)
Shuts Down the RoutingMasterCore.
void process_event_table()
Main loop of the RoutingMasterCore. Determines when to send the next table update, asks the RoutingMasterPolicy for the table to send, and sends it.
void send_event_table(detail::RoutingPacket table)
Sends a detail::RoutingPacket to the table receivers.
Events should be routed by send count (EB -> Agg)