artdaq  v3_11_01
RoutingManagerCore.cc
1 #include <arpa/inet.h>
2 #include <netdb.h>
3 #include <pthread.h>
4 #include <sched.h>
5 #include <sys/time.h>
6 #include <sys/un.h>
7 #include <algorithm>
8 #include <memory>
9 
10 #include "canvas/Utilities/Exception.h"
11 #include "cetlib_except/exception.h"
12 
13 #include "artdaq/DAQdata/Globals.hh" // include these 2 first to get tracemf.h -
14 #define TRACE_NAME (app_name + "_RoutingManagerCore").c_str() // before trace.h
15 #include "artdaq-core/Data/Fragment.hh"
16 #include "artdaq-core/Utilities/ExceptionHandler.hh"
17 
18 #include "artdaq/Application/RoutingManagerCore.hh"
21 #include "artdaq/RoutingPolicies/makeRoutingManagerPolicy.hh"
22 
23 const std::string artdaq::RoutingManagerCore::
24  TABLE_UPDATES_STAT_KEY("RoutingManagerCoreTableUpdates");
25 const std::string artdaq::RoutingManagerCore::
26  TOKENS_RECEIVED_STAT_KEY("RoutingManagerCoreTokensReceived");
27 const std::string artdaq::RoutingManagerCore::
28  CURRENT_TABLE_INTERVAL_STAT_KEY("RoutingManagerCoreCurrentTableInterval");
29 
31  : shutdown_requested_(false)
32  , stop_requested_(true)
33  , pause_requested_(false)
34  , statsHelperPtr_(new artdaq::StatisticsHelper())
35 {
36  TLOG(TLVL_DEBUG) << "Constructor";
37  statsHelperPtr_->addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
38  statsHelperPtr_->addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
39  statsHelperPtr_->addMonitoredQuantityName(CURRENT_TABLE_INTERVAL_STAT_KEY);
40 }
41 
43 {
44  TLOG(TLVL_DEBUG) << "Destructor";
45  artdaq::StatisticsCollection::getInstance().requestStop();
46  token_receiver_->stopTokenReception(true);
47 }
48 
49 bool artdaq::RoutingManagerCore::initialize(fhicl::ParameterSet const& pset, uint64_t /*unused*/, uint64_t /*unused*/)
50 {
51  TLOG(TLVL_DEBUG) << "initialize method called with "
52  << "ParameterSet = \"" << pset.to_string()
53  << "\".";
54 
55  // pull out the relevant parts of the ParameterSet
56  fhicl::ParameterSet daq_pset;
57  try
58  {
59  daq_pset = pset.get<fhicl::ParameterSet>("daq");
60  }
61  catch (...)
62  {
63  TLOG(TLVL_ERROR)
64  << "Unable to find the DAQ parameters in the initialization "
65  << "ParameterSet: \"" + pset.to_string() + "\".";
66  return false;
67  }
68 
69  if (daq_pset.has_key("rank"))
70  {
71  if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank)
72  {
73  TLOG(TLVL_WARNING) << "Routing Manager rank specified at startup is different than rank specified at configure! Using rank received at configure!";
74  }
75  my_rank = daq_pset.get<int>("rank");
76  }
77  if (my_rank == -1)
78  {
79  TLOG(TLVL_ERROR) << "Routing Manager rank not specified at startup or in configuration! Aborting";
80  exit(1);
81  }
82 
83  try
84  {
85  policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
86  }
87  catch (...)
88  {
89  TLOG(TLVL_ERROR)
90  << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
91  return false;
92  }
93 
94  try
95  {
96  token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>("token_receiver");
97  }
98  catch (...)
99  {
100  TLOG(TLVL_ERROR)
101  << "Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
102  return false;
103  }
104 
105  // pull out the Metric part of the ParameterSet
106  fhicl::ParameterSet metric_pset;
107  try
108  {
109  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
110  }
111  catch (...)
112  {} // OK if there's no metrics table defined in the FHiCL
113 
114  if (metric_pset.is_empty())
115  {
116  TLOG(TLVL_INFO) << "No metric plugins appear to be defined";
117  }
118  try
119  {
120  metricMan->initialize(metric_pset, app_name);
121  }
122  catch (...)
123  {
124  ExceptionHandler(ExceptionHandlerRethrow::no,
125  "Error loading metrics in RoutingManagerCore::initialize()");
126  }
127 
128  // create the requested RoutingPolicy
129  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
130  if (policy_plugin_spec.length() == 0)
131  {
132  TLOG(TLVL_ERROR)
133  << "No fragment generator (parameter name = \"policy\") was "
134  << "specified in the policy ParameterSet. The "
135  << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\".";
136  return false;
137  }
138  try
139  {
140  policy_ = artdaq::makeRoutingManagerPolicy(policy_plugin_spec, policy_pset_);
141  }
142  catch (...)
143  {
144  std::stringstream exception_string;
145  exception_string << "Exception thrown during initialization of policy of type \""
146  << policy_plugin_spec << "\"";
147 
148  ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
149 
150  TLOG(TLVL_DEBUG) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
151 
152  return false;
153  }
154 
155  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
156  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
157  current_table_interval_ms_ = max_table_update_interval_ms_;
158  table_update_high_fraction_ = daq_pset.get<double>("table_update_interval_high_frac", 0.75);
159  table_update_low_fraction_ = daq_pset.get<double>("table_update_interval_low_frac", 0.5);
160 
161  // fetch the monitoring parameters and create the MonitoredQuantity instances
162  statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
163 
164  // create the requested TokenReceiver
165  token_receiver_ = std::make_unique<TokenReceiver>(token_receiver_pset_, policy_, max_table_update_interval_ms_);
166  token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
167  token_receiver_->startTokenReception();
168  token_receiver_->pauseTokenReception();
169 
170  table_listen_port_ = daq_pset.get<int>("table_update_port", 35556);
171 
172  shutdown_requested_.store(true);
173  if (listen_thread_ && listen_thread_->joinable())
174  {
175  listen_thread_->join();
176  }
177  shutdown_requested_.store(false);
178  TLOG(TLVL_INFO) << "Starting Listener Thread";
179 
180  try
181  {
182  listen_thread_ = std::make_unique<boost::thread>(&RoutingManagerCore::listen_, this);
183  }
184  catch (const boost::exception& e)
185  {
186  TLOG(TLVL_ERROR) << "Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
187  std::cerr << "Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
188  exit(5);
189  }
190  return true;
191 }
192 
193 bool artdaq::RoutingManagerCore::start(art::RunID id, uint64_t /*unused*/, uint64_t /*unused*/)
194 {
195  run_id_ = id;
196  stop_requested_.store(false);
197  pause_requested_.store(false);
198 
199  statsHelperPtr_->resetStatistics();
200 
201  metricMan->do_start();
202  table_update_count_ = 0;
203  token_receiver_->setRunNumber(run_id_.run());
204  token_receiver_->resumeTokenReception();
205 
206  TLOG(TLVL_INFO) << "Started run " << run_id_.run();
207  return true;
208 }
209 
210 bool artdaq::RoutingManagerCore::stop(uint64_t /*unused*/, uint64_t /*unused*/)
211 {
212  TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
213  << " after " << table_update_count_ << " table updates."
214  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
215  stop_requested_.store(true);
216  token_receiver_->pauseTokenReception();
217  run_id_ = art::RunID::flushRun();
218  return true;
219 }
220 
221 bool artdaq::RoutingManagerCore::pause(uint64_t /*unused*/, uint64_t /*unused*/)
222 {
223  TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
224  << " after " << table_update_count_ << " table updates."
225  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
226  pause_requested_.store(true);
227  return true;
228 }
229 
230 bool artdaq::RoutingManagerCore::resume(uint64_t /*unused*/, uint64_t /*unused*/)
231 {
232  TLOG(TLVL_DEBUG) << "Resuming run " << run_id_.run();
233  pause_requested_.store(false);
234  metricMan->do_start();
235  return true;
236 }
237 
238 bool artdaq::RoutingManagerCore::shutdown(uint64_t /*unused*/)
239 {
240  shutdown_requested_.store(true);
241  if (listen_thread_ && listen_thread_->joinable())
242  {
243  listen_thread_->join();
244  }
245  token_receiver_->stopTokenReception();
246  policy_.reset();
247  metricMan->shutdown();
248  return true;
249 }
250 
251 bool artdaq::RoutingManagerCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t timeout, uint64_t timestamp)
252 {
253  TLOG(TLVL_INFO) << "soft_initialize method called with "
254  << "ParameterSet = \"" << pset.to_string()
255  << "\".";
256  return initialize(pset, timeout, timestamp);
257 }
258 
259 bool artdaq::RoutingManagerCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t timeout, uint64_t timestamp)
260 {
261  TLOG(TLVL_INFO) << "reinitialize method called with "
262  << "ParameterSet = \"" << pset.to_string()
263  << "\".";
264  return initialize(pset, timeout, timestamp);
265 }
266 
268 {
269  if (rt_priority_ > 0)
270  {
271 #pragma GCC diagnostic push
272 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
273  sched_param s_param = {};
274  s_param.sched_priority = rt_priority_;
275  if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param) != 0)
276  {
277  TLOG(TLVL_WARNING) << "setting realtime priority failed";
278  }
279 #pragma GCC diagnostic pop
280  }
281 
282  // try-catch block here?
283 
284  // how to turn RT PRI off?
285  if (rt_priority_ > 0)
286  {
287 #pragma GCC diagnostic push
288 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
289  sched_param s_param = {};
290  s_param.sched_priority = rt_priority_;
291  int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
292  if (status != 0)
293  {
294  TLOG(TLVL_ERROR)
295  << "Failed to set realtime priority to " << rt_priority_
296  << ", return code = " << status;
297  }
298 #pragma GCC diagnostic pop
299  }
300 
301  //MPI_Barrier(local_group_comm_);
302 
303  TLOG(TLVL_DEBUG) << "Sending initial table.";
304  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
305  auto nextSendTime = startTime;
306  double delta_time;
307  while (!stop_requested_ && !pause_requested_)
308  {
309  receive_();
310  if (policy_->GetRoutingMode() == detail::RoutingManagerMode::EventBuilding || policy_->GetRoutingMode() == detail::RoutingManagerMode::RequestBasedEventBuilding)
311  {
312  startTime = artdaq::MonitoredQuantity::getCurrentTime();
313 
314  if (startTime >= nextSendTime)
315  {
316  auto table = policy_->GetCurrentTable();
317 
318  if (table.empty())
319  {
320  TLOG(TLVL_WARNING) << "Routing Policy generated Empty table for this routing interval (" << current_table_interval_ms_ << " ms)! This may indicate issues with the receivers, if it persists."
321  << " Next seqID=" << policy_->GetNextSequenceID() << ", Policy held tokens=" << policy_->GetHeldTokenCount();
322  }
323  else
324  {
325  send_event_table(table);
326  ++table_update_count_;
327  delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
328  statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
329  TLOG(16) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
330 
331  bool readyToReport = statsHelperPtr_->readyToReport();
332  if (readyToReport)
333  {
334  std::string statString = buildStatisticsString_();
335  TLOG(TLVL_INFO) << statString;
336  sendMetrics_();
337  }
338  }
339 
340  auto max_tokens = policy_->GetMaxNumberOfTokens();
341  if (max_tokens > 0)
342  {
343  auto frac = policy_->GetTokensUsedSinceLastUpdate() / static_cast<double>(max_tokens);
344  policy_->ResetTokensUsedSinceLastUpdate();
345  if (frac > table_update_high_fraction_) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
346  if (frac < table_update_low_fraction_) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
347  if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
348  if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
349  }
350  nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
351  TLOG(TLVL_DEBUG) << "current_table_interval_ms is now " << current_table_interval_ms_;
352  statsHelperPtr_->addSample(CURRENT_TABLE_INTERVAL_STAT_KEY, current_table_interval_ms_ / 1000.0);
353  }
354  else
355  {
356  usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
357  }
358  }
359  }
360 
361  TLOG(TLVL_DEBUG) << "stop_requested_ is " << stop_requested_ << ", pause_requested_ is " << pause_requested_ << ", exiting process_event_table loop";
362  policy_->Reset();
363  metricMan->do_stop();
364 }
365 
367 {
368  std::lock_guard<std::mutex> lk(fd_mutex_);
369  for (auto& dest : connected_fds_)
370  {
371  for (auto& connected_fd : dest.second)
372  {
373  auto header = detail::RoutingPacketHeader(packet.size());
374  TLOG(TLVL_DEBUG) << "Sending table information for " << header.nEntries << " events to destination " << dest.first;
375  TRACE(16, "headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((unsigned long*)&header)[0], ((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0], ((unsigned long*)&packet[0])[1]); // NOLINT
376  auto sts = write(connected_fd, &header, sizeof(header));
377  if (sts != sizeof(header))
378  {
379  TLOG(TLVL_ERROR) << "Error sending routing header to fd " << connected_fd << ", rank " << dest.first;
380  }
381  else
382  {
383  sts = write(connected_fd, &packet[0], packet.size() * sizeof(detail::RoutingPacketEntry));
384  if (sts != static_cast<ssize_t>(packet.size() * sizeof(detail::RoutingPacketEntry)))
385  {
386  TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts << "/" << packet.size() * sizeof(detail::RoutingPacketEntry) << ", fd=" << connected_fd << ", rank=" << dest.first;
387  }
388  }
389  }
390  }
391 }
392 
393 std::string artdaq::RoutingManagerCore::report(std::string const& /*unused*/) const
394 {
395  std::string resultString;
396 
397  // if we haven't been able to come up with any report so far, say so
398  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run()) + ", table updates sent = " + std::to_string(table_update_count_) + ", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
399  return tmpString;
400 }
401 
402 std::string artdaq::RoutingManagerCore::buildStatisticsString_() const
403 {
404  std::ostringstream oss;
405  oss << app_name << " statistics:" << std::endl;
406 
407  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
408  if (mqPtr != nullptr)
409  {
410  artdaq::MonitoredQuantityStats stats;
411  mqPtr->getStats(stats);
412  oss << " Table Update statistics: "
413  << stats.recentSampleCount << " table updates sent at "
414  << stats.recentSampleRate << " table updates/sec, , monitor window = "
415  << stats.recentDuration << " sec" << std::endl;
416  oss << " Average times per table update: ";
417  if (stats.recentSampleRate > 0.0)
418  {
419  oss << " elapsed time = "
420  << (1.0 / stats.recentSampleRate) << " sec";
421  }
422  }
423 
424  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
425  if (mqPtr != nullptr)
426  {
427  artdaq::MonitoredQuantityStats stats;
428  mqPtr->getStats(stats);
429  oss << " Received Token statistics: "
430  << stats.recentSampleCount << " tokens received at "
431  << stats.recentSampleRate << " tokens/sec, , monitor window = "
432  << stats.recentDuration << " sec" << std::endl;
433  oss << " Average times per token: ";
434  if (stats.recentSampleRate > 0.0)
435  {
436  oss << " elapsed time = "
437  << (1.0 / stats.recentSampleRate) << " sec";
438  }
439  oss << ", input token wait time = "
440  << mqPtr->getRecentValueSum() << " sec" << std::endl;
441  }
442 
443  return oss.str();
444 }
445 
446 void artdaq::RoutingManagerCore::sendMetrics_()
447 {
448  if (metricMan)
449  {
450  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
451  if (mqPtr != nullptr)
452  {
453  artdaq::MonitoredQuantityStats stats;
454  mqPtr->getStats(stats);
455  metricMan->sendMetric("Table Update Count", stats.fullSampleCount, "updates", 1, MetricMode::LastPoint);
456  metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
457  }
458 
459  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
460  if (mqPtr != nullptr)
461  {
462  artdaq::MonitoredQuantityStats stats;
463  mqPtr->getStats(stats);
464  metricMan->sendMetric("Receiver Token Count", stats.fullSampleCount, "updates", 1, MetricMode::LastPoint);
465  metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
466  metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
467  }
468 
469  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(CURRENT_TABLE_INTERVAL_STAT_KEY);
470  if (mqPtr.get() != nullptr)
471  {
472  artdaq::MonitoredQuantityStats stats;
473  mqPtr->getStats(stats);
474  metricMan->sendMetric("Table Update Interval", stats.recentValueAverage, "s", 3, MetricMode::Average);
475  }
476  }
477 }
478 
479 void artdaq::RoutingManagerCore::listen_()
480 {
481  if (epoll_fd_ == -1)
482  {
483  epoll_fd_ = epoll_create1(0);
484  }
485  int listen_fd = -1;
486  while (shutdown_requested_ == false)
487  {
488  TLOG(TLVL_TRACE) << "listen_: Listening/accepting new connections on port " << table_listen_port_;
489  if (listen_fd == -1)
490  {
491  TLOG(TLVL_DEBUG) << "listen_: Opening listener";
492  listen_fd = TCP_listen_fd(table_listen_port_, 0);
493  }
494  if (listen_fd == -1)
495  {
496  TLOG(TLVL_DEBUG) << "listen_: Error creating listen_fd!";
497  break;
498  }
499 
500  int res;
501  timeval tv = {2, 0}; // maybe increase of some global "debugging" flag set???
502  fd_set rfds;
503  FD_ZERO(&rfds);
504  FD_SET(listen_fd, &rfds); // NOLINT
505 
506  res = select(listen_fd + 1, &rfds, static_cast<fd_set*>(nullptr), static_cast<fd_set*>(nullptr), &tv);
507  if (res > 0)
508  {
509  int sts;
510  sockaddr_un un;
511  socklen_t arglen = sizeof(un);
512  int fd;
513  TLOG(TLVL_DEBUG) << "listen_: Calling accept";
514  fd = accept(listen_fd, reinterpret_cast<sockaddr*>(&un), &arglen); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
515  TLOG(TLVL_DEBUG) << "listen_: Done with accept";
516 
517  TLOG(TLVL_DEBUG) << "listen_: Reading connect message";
518  socklen_t lenlen = sizeof(tv);
519  /*sts=*/
520  setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, lenlen); // see man 7 socket.
521  detail::RoutingRequest rch;
522  uint64_t mark_us = TimeUtils::gettimeofday_us();
523  sts = read(fd, &rch, sizeof(rch));
524  uint64_t delta_us = TimeUtils::gettimeofday_us() - mark_us;
525  TLOG(TLVL_DEBUG) << "listen_: Read of connect message took " << delta_us << " microseconds.";
526  if (sts != sizeof(rch))
527  {
528  TLOG(TLVL_DEBUG) << "listen_: Wrong message header length received!";
529  close(fd);
530  continue;
531  }
532 
533  // check for "magic" and valid source_id(aka rank)
534  if (rch.header != ROUTING_MAGIC || !(rch.mode == detail::RoutingRequest::RequestMode::Connect))
535  {
536  TLOG(TLVL_DEBUG) << "listen_: Wrong magic bytes in header! rch.header: " << std::hex << rch.header;
537  close(fd);
538  continue;
539  }
540 
541  // now add (new) connection
542  std::lock_guard<std::mutex> lk(fd_mutex_);
543  connected_fds_[rch.rank].insert(fd);
544  struct epoll_event ev;
545  ev.data.fd = fd;
546  ev.events = EPOLLIN;
547  epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ev);
548  TLOG(TLVL_INFO) << "listen_: New fd is " << fd << " for table receiver rank " << rch.rank;
549  }
550  else
551  {
552  TLOG(16) << "listen_: No connections in timeout interval!";
553  }
554  }
555 
556  TLOG(TLVL_INFO) << "listen_: Shutting down connection listener";
557  if (listen_fd != -1)
558  {
559  close(listen_fd);
560  }
561  std::lock_guard<std::mutex> lk(fd_mutex_);
562  for (auto& fd_set : connected_fds_)
563  {
564  for (auto& fd : fd_set.second)
565  {
566  epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd, nullptr);
567  close(fd);
568  }
569  }
570  connected_fds_.clear();
571 
572 } // listen_
573 
574 void artdaq::RoutingManagerCore::receive_()
575 {
576  if (epoll_fd_ == -1)
577  {
578  epoll_fd_ = epoll_create1(0);
579  }
580  std::vector<epoll_event> received_events(10);
581 
582  int nfds = 1;
583  while (nfds > 0)
584  {
585  std::lock_guard<std::mutex> lk(fd_mutex_);
586  nfds = epoll_wait(epoll_fd_, &received_events[0], received_events.size(), 1);
587  if (nfds == -1)
588  {
589  TLOG(TLVL_ERROR) << "Error status received from epoll_wait, exiting with code " << EXIT_FAILURE << ", errno=" << errno << " (" << strerror(errno) << ")";
590  perror("epoll_wait");
591  exit(EXIT_FAILURE);
592  }
593 
594  if (nfds > 0)
595  {
596  TLOG(13) << "Received " << nfds << " events on table sockets";
597  }
598  for (auto n = 0; n < nfds; ++n)
599  {
600  bool reading = true;
601  int sts = 0;
602  while (reading)
603  {
604  if ((received_events[n].events & EPOLLIN) != 0)
605  {
606  detail::RoutingRequest buff;
607  auto stss = read(received_events[n].data.fd, &buff, sizeof(detail::RoutingRequest) - sts);
608  sts += stss;
609  if (stss == 0)
610  {
611  TLOG(TLVL_INFO) << "Received 0-size request from " << find_fd_(received_events[n].data.fd);
612  reading = false;
613  }
614  else if (stss < 0 && errno == EAGAIN)
615  {
616  TLOG(TLVL_DEBUG) << "No more requests from this rank. Continuing poll loop.";
617  reading = false;
618  }
619  else if (stss < 0)
620  {
621  TLOG(TLVL_ERROR) << "Error reading from request socket: sts=" << sts << ", errno=" << errno << " (" << strerror(errno) << ")";
622  close(received_events[n].data.fd);
623  epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd, nullptr);
624  reading = false;
625  }
626  else if (sts == sizeof(detail::RoutingRequest) && buff.header != ROUTING_MAGIC)
627  {
628  TLOG(TLVL_ERROR) << "Received invalid request from " << find_fd_(received_events[n].data.fd) << " sts=" << sts << ", header=" << std::hex << buff.header;
629  reading = false;
630  }
631  else if (sts == sizeof(detail::RoutingRequest))
632  {
633  reading = false;
634  sts = 0;
635  TLOG(TLVL_TRACE) << "Received request from " << buff.rank << " mode=" << detail::RoutingRequest::RequestModeToString(buff.mode);
636  detail::RoutingPacketEntry reply;
637 
638  switch (buff.mode)
639  {
640  case detail::RoutingRequest::RequestMode::Disconnect:
641  connected_fds_[buff.rank].erase(received_events[n].data.fd);
642  close(received_events[n].data.fd);
643  epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd, nullptr);
644  break;
645 
646  case detail::RoutingRequest::RequestMode::Request:
647  reply = policy_->GetRouteForSequenceID(buff.sequence_id, buff.rank);
648  if (reply.sequence_id == buff.sequence_id)
649  {
650  TLOG(TLVL_TRACE) << "Reply to request from " << buff.rank << " with route to " << reply.destination_rank << " for sequence ID " << buff.sequence_id;
651  detail::RoutingPacketHeader hdr(1);
652  write(received_events[n].data.fd, &hdr, sizeof(hdr));
653  write(received_events[n].data.fd, &reply, sizeof(detail::RoutingPacketEntry));
654  }
655  else
656  {
657  TLOG(TLVL_TRACE) << "Unable to route request, replying with empty RoutingPacket";
658  detail::RoutingPacketHeader hdr(0);
659  write(received_events[n].data.fd, &hdr, sizeof(hdr));
660  }
661  break;
662  default:
663  TLOG(TLVL_WARNING) << "Received request from " << buff.rank << " with invalid mode " << detail::RoutingRequest::RequestModeToString(buff.mode) << " (currently only expecting Disconnect or Request)";
664  break;
665  }
666  }
667  }
668  else
669  {
670  TLOG(TLVL_DEBUG) << "Received event mask " << received_events[n].events << " from table socket rank " << find_fd_(received_events[n].data.fd);
671  }
672  }
673  }
674  }
675 }
676 
677 int artdaq::RoutingManagerCore::find_fd_(int fd) const
678 {
679  for (auto& rank : connected_fds_)
680  {
681  if (rank.second.count(fd) != 0)
682  {
683  return rank.first;
684  }
685  }
686  return -1;
687 }
This class manages MonitoredQuantity instances for the *Core classes.
bool shutdown(uint64_t)
Shuts Down the RoutingManagerCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingManagerCore.
A row of the Routing Table.
static std::string RequestModeToString(RequestMode m)
Convert a RequestMode enumeration value to string.
static const std::string TABLE_UPDATES_STAT_KEY
Key for the Table Update count MonitoredQuantity.
bool pause(uint64_t, uint64_t)
Pauses the RoutingManagerCore.
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
The header of the Routing Table, containing the magic bytes and the number of entries.
uint64_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Multiple sources sending to a single destination. RoutingManager pushes table updates to all senders...
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
Multiple sources sending to a single destination. Table updates are triggered by senders requesting r...
bool stop(uint64_t, uint64_t)
Stops the RoutingManagerCore.
void process_event_table()
Main loop of the RoutingManagerCore. Determines when to send the next table update, asks the RoutingManagerPolicy for the table to send, and sends it.
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingManagerCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
std::string report(std::string const &) const
Send a report on the current status of the RoutingManagerCore.
bool resume(uint64_t, uint64_t)
Resumes the RoutingManagerCore.
void send_event_table(detail::RoutingPacket packet)
Sends a detail::RoutingPacket to the table receivers.
RoutingManagerCore()
RoutingManagerCore Constructor.
static const std::string CURRENT_TABLE_INTERVAL_STAT_KEY
Key for the Current Table Interval MonitoredQuantity.
std::shared_ptr< RoutingManagerPolicy > makeRoutingManagerPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingManagerPolicy plugin.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingManagerCore.