artdaq  v3_05_00
RoutingMasterCore.cc
1 #include <sys/un.h>
2 #include <sys/time.h>
3 #include <sys/epoll.h>
4 #include <arpa/inet.h>
5 #include <netdb.h>
6 #include <pthread.h>
7 #include <sched.h>
8 #include <algorithm>
9 
10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib_except/exception.h"
12 
13 #define TRACE_NAME (app_name + "_RoutingMasterCore").c_str() // include these 2 first -
14 #include "artdaq/DAQdata/Globals.hh" // to get tracemf.h before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
17 
18 #include "artdaq/Application/RoutingMasterCore.hh"
19 #include "artdaq/RoutingPolicies/makeRoutingMasterPolicy.hh"
22 
// Names of the two MonitoredQuantity statistics collected by RoutingMasterCore:
// one sample per table update sent, one per token-processing pass.
const std::string artdaq::RoutingMasterCore::
TABLE_UPDATES_STAT_KEY("RoutingMasterCoreTableUpdates");
const std::string artdaq::RoutingMasterCore::
TOKENS_RECEIVED_STAT_KEY("RoutingMasterCoreTokensReceived");
27 
29  : received_token_counter_()
30  , shutdown_requested_(false)
31  , stop_requested_(true)
32  , pause_requested_(false)
33  , token_epoll_fd_(-1)
34  , token_socket_(-1)
35  , table_socket_(-1)
36  , ack_socket_(-1)
37 {
38  TLOG(TLVL_DEBUG) << "Constructor" ;
41 }
42 
// NOTE(review): the destructor's signature line (internal line 43,
// presumably artdaq::RoutingMasterCore::~RoutingMasterCore()) is missing
// from this listing; the body below is intact.
44 {
45  TLOG(TLVL_DEBUG) << "Destructor" ;
// Stop the statistics-collection singleton and wait for the token-reception
// thread to exit before member state is destroyed.
46  artdaq::StatisticsCollection::getInstance().requestStop();
47  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
48 }
49 
// Processes the "initialize" transition.
// Parses the FHiCL configuration (daq table, policy table, optional metrics
// table), determines this process's rank, loads the routing-policy plugin,
// reads the socket/timing parameters, creates the statistics collectors, and
// starts the token-reception thread.
// Returns false on any missing required parameter or policy-load failure;
// calls exit(1) if no rank is available at all.
50 bool artdaq::RoutingMasterCore::initialize(fhicl::ParameterSet const& pset, uint64_t, uint64_t)
51 {
52  TLOG(TLVL_DEBUG) << "initialize method called with "
53  << "ParameterSet = \"" << pset.to_string()
54  << "\"." ;
55 
56  // pull out the relevant parts of the ParameterSet
57  fhicl::ParameterSet daq_pset;
58  try
59  {
60  daq_pset = pset.get<fhicl::ParameterSet>("daq");
61  }
62  catch (...)
63  {
64  TLOG(TLVL_ERROR)
65  << "Unable to find the DAQ parameters in the initialization "
66  << "ParameterSet: \"" + pset.to_string() + "\"." ;
67  return false;
68  }
69 
// A "rank" key in the daq table overrides (with a warning) any rank already
// set at startup; having no rank from either source is fatal.
70  if (daq_pset.has_key("rank"))
71  {
72  if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank) {
73  TLOG(TLVL_WARNING) << "Routing Master rank specified at startup is different than rank specified at configure! Using rank received at configure!";
74  }
75  my_rank = daq_pset.get<int>("rank");
76  }
77  if (my_rank == -1)
78  {
79  TLOG(TLVL_ERROR) << "Routing Master rank not specified at startup or in configuration! Aborting";
80  exit(1);
81  }
82 
83  try
84  {
85  policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
86  }
87  catch (...)
88  {
89  TLOG(TLVL_ERROR)
90  << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\"." ;
91  return false;
92  }
93 
94  // pull out the Metric part of the ParameterSet
95  fhicl::ParameterSet metric_pset;
96  try
97  {
98  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
99  }
100  catch (...) {} // OK if there's no metrics table defined in the FHiCL
101 
102  if (metric_pset.is_empty())
103  {
104  TLOG(TLVL_INFO) << "No metric plugins appear to be defined" ;
105  }
106  try
107  {
108  metricMan->initialize(metric_pset, app_name);
109  }
110  catch (...)
111  {
// A failed metric-plugin load is logged but is not fatal to initialization.
112  ExceptionHandler(ExceptionHandlerRethrow::no,
113  "Error loading metrics in RoutingMasterCore::initialize()");
114  }
115 
116  // create the requested CommandableFragmentGenerator
117  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
118  if (policy_plugin_spec.length() == 0)
119  {
120  TLOG(TLVL_ERROR)
121  << "No fragment generator (parameter name = \"policy\") was "
122  << "specified in the policy ParameterSet. The "
123  << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\"." ;
124  return false;
125  }
126  try
127  {
128  policy_ = artdaq::makeRoutingMasterPolicy(policy_plugin_spec, policy_pset_);
129  }
130  catch (...)
131  {
132  std::stringstream exception_string;
133  exception_string << "Exception thrown during initialization of policy of type \""
134  << policy_plugin_spec << "\"";
135 
136  ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
137 
138  TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string() ;
139 
140  return false;
141  }
142 
143  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
144  sender_ranks_ = daq_pset.get<std::vector<int>>("sender_ranks");
145  num_receivers_ = policy_->GetReceiverCount();
146 
// Pre-size the epoll event buffers: one slot per sender for acks, and one
// per receiver plus the listen socket itself for tokens.
147  receive_ack_events_ = std::vector<epoll_event>(sender_ranks_.size());
148  receive_token_events_ = std::vector<epoll_event>(num_receivers_ + 1);
149 
150  auto mode = daq_pset.get<bool>("senders_send_by_send_count", false);
// NOTE(review): internal line 151 is missing from this listing; it presumably
// uses 'mode' to set routing_mode_ (RouteBySendCount vs RouteBySequenceID) —
// confirm against the original source file.
152  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
153  current_table_interval_ms_ = max_table_update_interval_ms_;
154  max_ack_cycle_count_ = daq_pset.get<size_t>("table_ack_retry_count", 5);
155  receive_token_port_ = daq_pset.get<int>("routing_token_port", 35555);
156  send_tables_port_ = daq_pset.get<int>("table_update_port", 35556);
157  receive_acks_port_ = daq_pset.get<int>("table_acknowledge_port", 35557);
158  send_tables_address_ = daq_pset.get<std::string>("table_update_address", "227.128.12.28");
159  receive_address_ = daq_pset.get<std::string>("routing_master_hostname", "localhost");
160 
161  // fetch the monitoring parameters and create the MonitoredQuantity instances
162  statsHelper_.createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
163 
// Arm the token-reception thread last, once all configuration it reads is set.
164  shutdown_requested_.store(false);
165  start_recieve_token_thread_();
166  return true;
167 }
168 
169 bool artdaq::RoutingMasterCore::start(art::RunID id, uint64_t, uint64_t)
170 {
171  run_id_ = id;
172  stop_requested_.store(false);
173  pause_requested_.store(false);
174 
175  statsHelper_.resetStatistics();
176 
177  metricMan->do_start();
178  table_update_count_ = 0;
179  received_token_count_ = 0;
180 
181  TLOG(TLVL_INFO) << "Started run " << run_id_.run() ;
182  return true;
183 }
184 
185 bool artdaq::RoutingMasterCore::stop(uint64_t, uint64_t)
186 {
187  TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
188  << " after " << table_update_count_ << " table updates."
189  << " and " << received_token_count_ << " received tokens." ;
190  stop_requested_.store(true);
191  run_id_ = art::RunID::flushRun();
192  return true;
193 }
194 
195 bool artdaq::RoutingMasterCore::pause(uint64_t, uint64_t)
196 {
197  TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
198  << " after " << table_update_count_ << " table updates."
199  << " and " << received_token_count_ << " received tokens." ;
200  pause_requested_.store(true);
201  return true;
202 }
203 
204 bool artdaq::RoutingMasterCore::resume(uint64_t, uint64_t)
205 {
206  TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run();
207  pause_requested_.store(false);
208  metricMan->do_start();
209  return true;
210 }
211 
// NOTE(review): the signature line (internal line 212, presumably
// bool artdaq::RoutingMasterCore::shutdown(uint64_t)) is missing from this
// listing; the body below is intact.
213 {
// Signal every loop to exit, reap the token-reception thread, then release
// the policy plugin and shut down the metric backends.
214  shutdown_requested_.store(true);
215  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
216  policy_.reset(nullptr);
217  metricMan->shutdown();
218  return true;
219 }
220 
221 bool artdaq::RoutingMasterCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
222 {
223  TLOG(TLVL_INFO) << "soft_initialize method called with "
224  << "ParameterSet = \"" << pset.to_string()
225  << "\"." ;
226  return initialize(pset, e, f);
227 }
228 
229 bool artdaq::RoutingMasterCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t e, uint64_t f)
230 {
231  TLOG(TLVL_INFO) << "reinitialize method called with "
232  << "ParameterSet = \"" << pset.to_string()
233  << "\"." ;
234  return initialize(pset, e, f);
235 }
236 
238 {
239  if (rt_priority_ > 0)
240  {
241 #pragma GCC diagnostic push
242 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
243  sched_param s_param = {};
244  s_param.sched_priority = rt_priority_;
245  if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param))
246  TLOG(TLVL_WARNING) << "setting realtime priority failed" ;
247 #pragma GCC diagnostic pop
248  }
249 
250  // try-catch block here?
251 
252  // how to turn RT PRI off?
253  if (rt_priority_ > 0)
254  {
255 #pragma GCC diagnostic push
256 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
257  sched_param s_param = {};
258  s_param.sched_priority = rt_priority_;
259  int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
260  if (status != 0)
261  {
262  TLOG(TLVL_ERROR)
263  << "Failed to set realtime priority to " << rt_priority_
264  << ", return code = " << status ;
265  }
266 #pragma GCC diagnostic pop
267  }
268 
269  //MPI_Barrier(local_group_comm_);
270 
271  TLOG(TLVL_DEBUG) << "Sending initial table." ;
272  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
273  auto nextSendTime = startTime;
274  double delta_time;
275  while (!stop_requested_ && !pause_requested_)
276  {
277  startTime = artdaq::MonitoredQuantity::getCurrentTime();
278 
279  if (startTime >= nextSendTime)
280  {
281  auto table = policy_->GetCurrentTable();
282  if (table.size() > 0)
283  {
284  send_event_table(table);
285  ++table_update_count_;
286  delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
287  statsHelper_.addSample(TABLE_UPDATES_STAT_KEY, delta_time);
288  TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time ;
289  }
290  else
291  {
292  TLOG(TLVL_DEBUG) << "No tokens received in this update interval (" << current_table_interval_ms_ << " ms)! This most likely means that the receivers are not keeping up!" ;
293  }
294  auto max_tokens = policy_->GetMaxNumberOfTokens();
295  if (max_tokens > 0)
296  {
297  auto frac = table.size() / static_cast<double>(max_tokens);
298  if (frac > 0.75) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
299  if (frac < 0.5) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
300  if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
301  if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
302  }
303  nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
304  TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_ ;
305  }
306  else
307  {
308  usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
309  }
310  }
311 
312  policy_->Reset();
313  metricMan->do_stop();
314 }
315 
317 {
318  // Reconnect table socket, if necessary
319  if (table_socket_ == -1)
320  {
321  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
322  if (table_socket_ < 0)
323  {
324  TLOG(TLVL_ERROR) << "I failed to create the socket for sending Data Requests! Errno: " << errno ;
325  exit(1);
326  }
327  auto sts = ResolveHost(send_tables_address_.c_str(), send_tables_port_, send_tables_addr_);
328  if (sts == -1)
329  {
330  TLOG(TLVL_ERROR) << "Unable to resolve table_update_address" ;
331  exit(1);
332  }
333 
334  auto yes = 1;
335  if (receive_address_ != "localhost")
336  {
337  TLOG(TLVL_DEBUG) << "Making sure that multicast sending uses the correct interface for hostname " << receive_address_ ;
338  struct in_addr addr;
339  sts = ResolveHost(receive_address_.c_str(), addr);
340  if (sts == -1)
341  {
342  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Unable to resolve routing_master_address" << std::endl;;
343  }
344 
345  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
346  {
347  throw art::Exception(art::errors::Configuration) <<
348  "RoutingMasterCore: Unable to enable port reuse on table update socket" << std::endl;
349  exit(1);
350  }
351 
352  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0)
353  {
354  TLOG(TLVL_ERROR) << "Unable to enable multicast loopback on table socket" ;
355  exit(1);
356  }
357  if (setsockopt(table_socket_, IPPROTO_IP, IP_MULTICAST_IF, &addr, sizeof(addr)) == -1)
358  {
359  TLOG(TLVL_ERROR) << "Cannot set outgoing interface. Errno: " << errno ;
360  exit(1);
361  }
362  }
363  if (setsockopt(table_socket_, SOL_SOCKET, SO_BROADCAST, (void*)&yes, sizeof(int)) == -1)
364  {
365  TLOG(TLVL_ERROR) << "Cannot set request socket to broadcast. Errno: " << errno ;
366  exit(1);
367  }
368  }
369 
370  // Reconnect ack socket, if necessary
371  if (ack_socket_ == -1)
372  {
373  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
374  if (ack_socket_ < 0)
375  {
376  throw art::Exception(art::errors::Configuration) << "RoutingMasterCore: Error creating socket for receiving table update acks!" << std::endl;
377  exit(1);
378  }
379 
380  struct sockaddr_in si_me_request;
381 
382  auto yes = 1;
383  if (setsockopt(ack_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
384  {
385  throw art::Exception(art::errors::Configuration) <<
386  "RoutingMasterCore: Unable to enable port reuse on ack socket" << std::endl;
387  exit(1);
388  }
389  memset(&si_me_request, 0, sizeof(si_me_request));
390  si_me_request.sin_family = AF_INET;
391  si_me_request.sin_port = htons(receive_acks_port_);
392  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
393  if (bind(ack_socket_, reinterpret_cast<struct sockaddr *>(&si_me_request), sizeof(si_me_request)) == -1)
394  {
395  throw art::Exception(art::errors::Configuration) <<
396  "RoutingMasterCore: Cannot bind request socket to port " << receive_acks_port_ << std::endl;
397  exit(1);
398  }
399  TLOG(TLVL_DEBUG) << "Listening for acks on 0.0.0.0 port " << receive_acks_port_ ;
400  }
401 
402  auto acks = std::unordered_map<int, bool>();
403  for (auto& r : sender_ranks_)
404  {
405  acks[r] = false;
406  }
407  auto counter = 0U;
408  auto start_time = std::chrono::steady_clock::now();
409  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0 && !stop_requested_)
410  {
411  // Send table update
412  auto header = detail::RoutingPacketHeader(routing_mode_, packet.size());
413  auto packetSize = sizeof(detail::RoutingPacketEntry) * packet.size();
414 
415  assert(packetSize + sizeof(header) < MAX_ROUTING_TABLE_SIZE);
416  std::vector<uint8_t> buffer(packetSize + sizeof(header));
417  memcpy(&buffer[0], &header, sizeof(detail::RoutingPacketHeader));
418  memcpy(&buffer[sizeof(detail::RoutingPacketHeader)], &packet[0], packetSize);
419 
420  TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to multicast group " << send_tables_address_ << ", port " << send_tables_port_ ;
421  TRACE(16,"headerData:0x%016lx%016lx packetData:0x%016lx%016lx"
422  ,((unsigned long*)&header)[0],((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0],((unsigned long*)&packet[0])[1] );
423  auto sts = sendto(table_socket_, &buffer[0], buffer.size(), 0, reinterpret_cast<struct sockaddr *>(&send_tables_addr_), sizeof(send_tables_addr_));
424  if (sts != static_cast<ssize_t>(buffer.size()))
425  {
426  TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts;
427  }
428 
429  // Collect acks
430 
431  auto first = packet[0].sequence_id;
432  auto last = packet.rbegin()->sequence_id;
433  TLOG(TLVL_DEBUG) << "Sent " << sts << " bytes. Expecting acks to have first= " << first << ", and last= " << last ;
434 
435 
436  auto startTime = std::chrono::steady_clock::now();
437  while (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) > 0)
438  {
439  auto table_ack_wait_time_ms = current_table_interval_ms_ / max_ack_cycle_count_;
440  if (TimeUtils::GetElapsedTimeMilliseconds(startTime) > table_ack_wait_time_ms)
441  {
442  if (++counter > max_ack_cycle_count_ && table_update_count_ > 0)
443  {
444  TLOG(TLVL_WARNING) << "Did not receive acks from all senders after resending table " << counter
445  << " times during the table_update_interval. Check the status of the senders!" ;
446  }
447  else
448  {
449  TLOG(TLVL_WARNING) << "Did not receive acks from all senders within the timeout (" << table_ack_wait_time_ms << " ms). Resending table update" ;
450  }
451 
452  if (std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; }) <= 3)
453  {
454  auto ackIter = acks.begin();
455  while (ackIter != acks.end())
456  {
457  if (! ackIter->second)
458  {
459  TLOG(TLVL_TRACE) << "Did not receive ack from rank " << ackIter->first;
460  }
461  ++ackIter;
462  }
463  }
464  break;
465  }
466 
467  TLOG(20) << "send_event_table: Polling Request socket for new requests" ;
468  auto ready = true;
469  while (ready)
470  {
472  if (recvfrom(ack_socket_, &buffer, sizeof(detail::RoutingAckPacket), MSG_DONTWAIT, NULL, NULL) < 0)
473  {
474  if (errno == EWOULDBLOCK || errno == EAGAIN)
475  {
476  TLOG(20) << "send_event_table: No more ack datagrams on ack socket." ;
477  ready = false;
478  }
479  else
480  {
481  TLOG(TLVL_ERROR) << "An unexpected error occurred during ack packet receive" ;
482  exit(2);
483  }
484  }
485  else
486  {
487  TLOG(TLVL_DEBUG) << "Ack packet from rank " << buffer.rank << " has first= " << buffer.first_sequence_id
488  << " and last= " << buffer.last_sequence_id ;
489  if (acks.count(buffer.rank) && buffer.first_sequence_id == first && buffer.last_sequence_id == last)
490  {
491  TLOG(TLVL_DEBUG) << "Received table update acknowledgement from sender with rank " << buffer.rank << "." ;
492  acks[buffer.rank] = true;
493  TLOG(TLVL_DEBUG) << "There are now " << std::count_if(acks.begin(), acks.end(), [](std::pair<int, bool> p) {return !p.second; })
494  << " acks outstanding" ;
495  }
496  else
497  {
498  if (!acks.count(buffer.rank))
499  {
500  TLOG(TLVL_ERROR) << "Received acknowledgement from invalid rank " << buffer.rank << "!"
501  << " Cross-talk between RoutingMasters means there's a configuration error!" ;
502  }
503  else
504  {
505  TLOG(TLVL_WARNING) << "Received acknowledgement from rank " << buffer.rank
506  << " that had incorrect sequence ID information. Discarding."
507  << " Expected first/last=" << first <<"/"<< last
508  << " recvd=" << buffer.first_sequence_id <<"/"<< buffer.last_sequence_id;
509  }
510  }
511  }
512  }
513  usleep(table_ack_wait_time_ms * 1000 / 10);
514  }
515  }
516  if (metricMan)
517  {
518  artdaq::TimeUtils::seconds delta = std::chrono::steady_clock::now() - start_time;
519  metricMan->sendMetric("Avg Table Acknowledge Time", delta.count(), "seconds", 3, MetricMode::Average);
520  }
521 }
522 
// Thread body that accepts TCP connections from receivers and reads
// RoutingToken packets from them, forwarding the freed-slot counts to the
// routing policy. Runs until shutdown_requested_ is set.
// Lifecycle: lazily (re)creates the listening socket and its epoll instance,
// waits up to current_table_interval_ms_ for events, idles while the run is
// stopped, then services each ready fd (new connection or token data).
523 void artdaq::RoutingMasterCore::receive_tokens_()
524 {
525  while (!shutdown_requested_)
526  {
527  TLOG(TLVL_DEBUG) << "Receive Token loop start" ;
// Lazily create the non-blocking listen socket and register it with a fresh
// epoll instance (any previous epoll fd is closed first).
528  if (token_socket_ == -1)
529  {
530  TLOG(TLVL_DEBUG) << "Opening token listener socket" ;
531  token_socket_ = TCP_listen_fd(receive_token_port_, 3 * sizeof(detail::RoutingToken));
532  fcntl(token_socket_, F_SETFL, O_NONBLOCK); // set O_NONBLOCK
533 
534  if (token_epoll_fd_ != -1) close(token_epoll_fd_);
535  struct epoll_event ev;
536  token_epoll_fd_ = epoll_create1(0);
537  ev.events = EPOLLIN | EPOLLPRI;
538  ev.data.fd = token_socket_;
539  if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, token_socket_, &ev) == -1)
540  {
541  TLOG(TLVL_ERROR) << "Could not register listen socket to epoll fd" ;
542  exit(3);
543  }
544  }
545  if (token_socket_ == -1 || token_epoll_fd_ == -1)
546  {
547  TLOG(TLVL_DEBUG) << "One of the listen sockets was not opened successfully." ;
548  return;
549  }
550 
// Wait for connection attempts / token data; the timeout matches the current
// table-update interval so the shutdown flag is re-checked regularly.
551  auto nfds = epoll_wait(token_epoll_fd_, &receive_token_events_[0], receive_token_events_.size(), current_table_interval_ms_);
552  if (nfds == -1)
553  {
554  perror("epoll_wait");
555  exit(EXIT_FAILURE);
556  }
557 
// While the run is stopped (but not shut down), park here instead of
// processing tokens.
558  while (stop_requested_ && !shutdown_requested_)
559  {
560  usleep(10000);
561  }
562 
563  TLOG(TLVL_DEBUG) << "Received " << nfds << " events" ;
564  for (auto n = 0; n < nfds; ++n)
565  {
// An event on the listen socket means a receiver is connecting: accept it,
// make it non-blocking, remember its address, and add it to the epoll set.
566  if (receive_token_events_[n].data.fd == token_socket_)
567  {
568  TLOG(TLVL_DEBUG) << "Accepting new connection on token_socket" ;
569  sockaddr_in addr;
570  socklen_t arglen = sizeof(addr);
571  auto conn_sock = accept(token_socket_, (struct sockaddr *)&addr, &arglen);
572  fcntl(conn_sock, F_SETFL, O_NONBLOCK); // set O_NONBLOCK
573 
574  if (conn_sock == -1)
575  {
576  perror("accept");
577  exit(EXIT_FAILURE);
578  }
579 
580  receive_token_addrs_[conn_sock] = std::string(inet_ntoa(addr.sin_addr));
581  TLOG(TLVL_DEBUG) << "New fd is " << conn_sock << " for receiver at " << receive_token_addrs_[conn_sock];
582  struct epoll_event ev;
583  ev.events = EPOLLIN | EPOLLET;
584  ev.data.fd = conn_sock;
585  if (epoll_ctl(token_epoll_fd_, EPOLL_CTL_ADD, conn_sock, &ev) == -1)
586  {
587  perror("epoll_ctl: conn_sock");
588  exit(EXIT_FAILURE);
589  }
590  }
591  else
592  {
593  /*if (receive_token_events_[n].events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP))
594  {
595  TLOG(TLVL_DEBUG) << "Closing connection on fd " << receive_token_events_[n].data.fd << " (" << receive_token_addrs_[receive_token_events_[n].data.fd] << ")";
596  receive_token_addrs_.erase(receive_token_events_[n].data.fd);
597  close(receive_token_events_[n].data.fd);
598  epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL);
599  continue;
600  }*/
601 
// Drain all complete RoutingToken packets available on this (edge-triggered)
// connection. 'sts' accumulates bytes read toward one full token.
// NOTE(review): 'sts += read(...)' mixes the byte count with read()'s -1
// error return, and 'buff' is re-declared each pass so a token split across
// reads would be partially overwritten — looks fragile; confirm upstream.
602  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
603  bool reading = true;
604  int sts = 0;
605  while(reading)
606  {
607  detail::RoutingToken buff;
608  sts += read(receive_token_events_[n].data.fd, &buff, sizeof(detail::RoutingToken) - sts);
609  if (sts == 0)
610  {
611  TLOG(TLVL_INFO) << "Received 0-size token from " << receive_token_addrs_[receive_token_events_[n].data.fd];
612  reading = false;
613  }
614  else if(sts < 0 && errno == EAGAIN)
615  {
616  TLOG(TLVL_DEBUG) << "No more tokens from this rank. Continuing poll loop.";
617  reading = false;
618  }
619  else if(sts < 0)
620  {
// Hard read error: drop this connection entirely.
621  TLOG(TLVL_ERROR) << "Error reading from token socket: sts=" << sts << ", errno=" << errno;
622  receive_token_addrs_.erase(receive_token_events_[n].data.fd);
623  close(receive_token_events_[n].data.fd);
624  epoll_ctl(token_epoll_fd_, EPOLL_CTL_DEL, receive_token_events_[n].data.fd, NULL);
625  reading = false;
626  }
627  else if (sts == sizeof(detail::RoutingToken) && buff.header != TOKEN_MAGIC)
628  {
629  TLOG(TLVL_ERROR) << "Received invalid token from " << receive_token_addrs_[receive_token_events_[n].data.fd] << " sts=" << sts;
630  reading = false;
631  }
632  else if(sts == sizeof(detail::RoutingToken))
633  {
// One complete, valid token: reset the byte counter and account for it.
634  sts = 0;
635  TLOG(TLVL_DEBUG) << "Received token from " << buff.rank << " indicating " << buff.new_slots_free << " slots are free. (run=" << buff.run_number << ")" ;
// Tokens from a different run are logged and discarded.
636  if (buff.run_number != run_id_.run())
637  {
638  TLOG(TLVL_DEBUG) << "Received token from a different run number! Current = " << run_id_.run() << ", token = " << buff.run_number << ", ignoring (n=" << buff.new_slots_free << ")";
639  }
640  else
641  {
642  received_token_count_ += buff.new_slots_free;
// NOTE(review): internal line 643 is missing from this listing; given the
// else-if below, it is presumably
// "if (routing_mode_ == detail::RoutingMasterMode::RouteBySequenceID)" —
// confirm against the original source file.
644  {
645  policy_->AddReceiverToken(buff.rank, buff.new_slots_free);
646  }
647  else if (routing_mode_ == detail::RoutingMasterMode::RouteBySendCount)
648  {
// In RouteBySendCount mode, one policy token is released only after every
// sender has contributed (i.e. per sender_ranks_.size() raw tokens).
649  if (!received_token_counter_.count(buff.rank)) received_token_counter_[buff.rank] = 0;
650  received_token_counter_[buff.rank] += buff.new_slots_free;
651  TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size() << "." ;
652  while (received_token_counter_[buff.rank] >= sender_ranks_.size())
653  {
654  TLOG(TLVL_DEBUG) << "RoutingMasterMode is RouteBySendCount. I have " << received_token_counter_[buff.rank] << " tokens for rank " << buff.rank << " and I need " << sender_ranks_.size()
655  << "... Sending token to policy" ;
656  policy_->AddReceiverToken(buff.rank, 1);
657  received_token_counter_[buff.rank] -= sender_ranks_.size();
658  }
659  }
660  }
661  }
662  }
// Record how long this token-servicing pass took, and emit the periodic
// statistics report/metrics when the helper says it is due.
663  auto delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
664  statsHelper_.addSample(TOKENS_RECEIVED_STAT_KEY, delta_time);
665  bool readyToReport = statsHelper_.readyToReport(delta_time);
666  if (readyToReport)
667  {
668  std::string statString = buildStatisticsString_();
669  TLOG(TLVL_INFO) << statString;
670  sendMetrics_();
671  }
672  }
673 }
674  }
675 }
676 
677 void artdaq::RoutingMasterCore::start_recieve_token_thread_()
678 {
679  if (ev_token_receive_thread_.joinable()) ev_token_receive_thread_.join();
680  boost::thread::attributes attrs;
681  attrs.set_stack_size(4096 * 2000); // 8000 KB
682 
683  TLOG(TLVL_INFO) << "Starting Token Reception Thread" ;
684  try {
685  ev_token_receive_thread_ = boost::thread(attrs, boost::bind(&RoutingMasterCore::receive_tokens_, this));
686  }
687  catch(boost::exception const& e)
688  {
689  std::cerr << "Exception encountered starting Token Reception thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
690  exit(3);
691  }
692  TLOG(TLVL_INFO) << "Started Token Reception Thread";
693 }
694 
695 std::string artdaq::RoutingMasterCore::report(std::string const&) const
696 {
697  std::string resultString;
698 
699  // if we haven't been able to come up with any report so far, say so
700  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run())
701  + ", table updates sent = " + std::to_string(table_update_count_)
702  + ", Receiver tokens received = " + std::to_string(received_token_count_);
703  return tmpString;
704 }
705 
706 std::string artdaq::RoutingMasterCore::buildStatisticsString_() const
707 {
708  std::ostringstream oss;
709  oss << app_name << " statistics:" << std::endl;
710 
711  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
712  if (mqPtr.get() != nullptr)
713  {
714  artdaq::MonitoredQuantityStats stats;
715  mqPtr->getStats(stats);
716  oss << " Table Update statistics: "
717  << stats.recentSampleCount << " table updates sent at "
718  << stats.recentSampleRate << " table updates/sec, , monitor window = "
719  << stats.recentDuration << " sec" << std::endl;
720  oss << " Average times per table update: ";
721  if (stats.recentSampleRate > 0.0)
722  {
723  oss << " elapsed time = "
724  << (1.0 / stats.recentSampleRate) << " sec";
725  }
726  oss << ", avg table acknowledgement wait time = "
727  << (mqPtr->getRecentValueSum() / sender_ranks_.size()) << " sec" << std::endl;
728  }
729 
730  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
731  if (mqPtr.get() != nullptr)
732  {
733  artdaq::MonitoredQuantityStats stats;
734  mqPtr->getStats(stats);
735  oss << " Received Token statistics: "
736  << stats.recentSampleCount << " tokens received at "
737  << stats.recentSampleRate << " tokens/sec, , monitor window = "
738  << stats.recentDuration << " sec" << std::endl;
739  oss << " Average times per token: ";
740  if (stats.recentSampleRate > 0.0)
741  {
742  oss << " elapsed time = "
743  << (1.0 / stats.recentSampleRate) << " sec";
744  }
745  oss << ", input token wait time = "
746  << mqPtr->getRecentValueSum() << " sec" << std::endl;
747  }
748 
749  return oss.str();
750 }
751 
752 void artdaq::RoutingMasterCore::sendMetrics_()
753 {
754  if (metricMan)
755  {
756  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
757  if (mqPtr.get() != nullptr)
758  {
759  artdaq::MonitoredQuantityStats stats;
760  mqPtr->getStats(stats);
761  metricMan->sendMetric("Table Update Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
762  metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
763  metricMan->sendMetric("Average Sender Acknowledgement Time", (mqPtr->getRecentValueSum() / sender_ranks_.size()), "seconds", 3, MetricMode::Average);
764  }
765 
766  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
767  if (mqPtr.get() != nullptr)
768  {
769  artdaq::MonitoredQuantityStats stats;
770  mqPtr->getStats(stats);
771  metricMan->sendMetric("Receiver Token Count", static_cast<unsigned long>(stats.fullSampleCount), "updates", 1, MetricMode::LastPoint);
772  metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
773  metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
774  }
775  }
776 }
bool resume(uint64_t, uint64_t)
Resumes the RoutingMasterCore.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingMasterCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
A row of the Routing Table.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingMasterCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingMasterCore.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
RoutingMasterCore()
RoutingMasterCore Constructor.
Events should be routed by sequence ID (BR -> EB)
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
The RoutingToken contains the magic bytes, the rank of the token sender, and the number of slots free...
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
bool pause(uint64_t, uint64_t)
Pauses the RoutingMasterCore.
The header of the Routing Table, containing the magic bytes and the number of entries.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool stop(uint64_t, uint64_t)
Stops the RoutingMasterCore.
int rank
The rank from which the RoutingAckPacket came.
std::unique_ptr< RoutingMasterPolicy > makeRoutingMasterPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingMasterPolicy plugin.
std::string report(std::string const &) const
Send a report on the current status of the RoutingMasterCore.
static const std::string TABLE_UPDATES_STAT_KEY
Key for Table Update count MonitoredQuantity.
bool shutdown(uint64_t)
Shuts Down the RoutingMasterCore.
void process_event_table()
Main loop of the RoutingMasterCore. Determines when to send the next table update, asks the RoutingMasterPolicy for the table to send, and sends it.
void send_event_table(detail::RoutingPacket table)
Sends a detail::RoutingPacket to the table receivers.
Events should be routed by send count (EB -> Agg)