artdaq  v3_12_02
RoutingManagerCore.cc
1 #include "TRACE/tracemf.h"
2 #include "artdaq/DAQdata/Globals.hh" // include these 2 first to get tracemf.h -
3 #define TRACE_NAME (app_name + "_RoutingManagerCore").c_str() // before trace.h
4 
5 #include "artdaq/Application/RoutingManagerCore.hh"
6 
7 #include "artdaq-core/Utilities/ExceptionHandler.hh"
9 #include "artdaq/RoutingPolicies/makeRoutingManagerPolicy.hh"
10 
11 #include "fhiclcpp/ParameterSet.h"
12 
13 #include <arpa/inet.h>
14 #include <netdb.h>
15 #include <pthread.h>
16 #include <sched.h>
17 #include <sys/time.h>
18 #include <sys/un.h>
19 #include <algorithm>
20 #include <memory>
21 
22 const std::string artdaq::RoutingManagerCore::
23  TABLE_UPDATES_STAT_KEY("RoutingManagerCoreTableUpdates");
24 const std::string artdaq::RoutingManagerCore::
25  TOKENS_RECEIVED_STAT_KEY("RoutingManagerCoreTokensReceived");
26 const std::string artdaq::RoutingManagerCore::
27  CURRENT_TABLE_INTERVAL_STAT_KEY("RoutingManagerCoreCurrentTableInterval");
28 
30  : shutdown_requested_(false)
31  , stop_requested_(true)
32  , pause_requested_(false)
33  , statsHelperPtr_(new artdaq::StatisticsHelper())
34 {
35  TLOG(TLVL_DEBUG + 32) << "Constructor";
36  statsHelperPtr_->addMonitoredQuantityName(TABLE_UPDATES_STAT_KEY);
37  statsHelperPtr_->addMonitoredQuantityName(TOKENS_RECEIVED_STAT_KEY);
38  statsHelperPtr_->addMonitoredQuantityName(CURRENT_TABLE_INTERVAL_STAT_KEY);
39 }
40 
42 {
43  TLOG(TLVL_DEBUG + 32) << "Destructor";
44  artdaq::StatisticsCollection::getInstance().requestStop();
45  token_receiver_->stopTokenReception(true);
46 }
47 
48 bool artdaq::RoutingManagerCore::initialize(fhicl::ParameterSet const& pset, uint64_t /*unused*/, uint64_t /*unused*/)
49 {
50  TLOG(TLVL_DEBUG + 32) << "initialize method called with "
51  << "ParameterSet = \"" << pset.to_string()
52  << "\".";
53 
54  // pull out the relevant parts of the ParameterSet
55  fhicl::ParameterSet daq_pset;
56  try
57  {
58  daq_pset = pset.get<fhicl::ParameterSet>("daq");
59  }
60  catch (...)
61  {
62  TLOG(TLVL_ERROR)
63  << "Unable to find the DAQ parameters in the initialization "
64  << "ParameterSet: \"" + pset.to_string() + "\".";
65  return false;
66  }
67 
68  if (daq_pset.has_key("rank"))
69  {
70  if (my_rank >= 0 && daq_pset.get<int>("rank") != my_rank)
71  {
72  TLOG(TLVL_WARNING) << "Routing Manager rank specified at startup is different than rank specified at configure! Using rank received at configure!";
73  }
74  my_rank = daq_pset.get<int>("rank");
75  }
76  if (my_rank == -1)
77  {
78  TLOG(TLVL_ERROR) << "Routing Manager rank not specified at startup or in configuration! Aborting";
79  exit(1);
80  }
81 
82  try
83  {
84  policy_pset_ = daq_pset.get<fhicl::ParameterSet>("policy");
85  }
86  catch (...)
87  {
88  TLOG(TLVL_ERROR)
89  << "Unable to find the policy parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
90  return false;
91  }
92 
93  try
94  {
95  token_receiver_pset_ = daq_pset.get<fhicl::ParameterSet>("token_receiver");
96  }
97  catch (...)
98  {
99  TLOG(TLVL_ERROR)
100  << "Unable to find the token_receiver parameters in the DAQ initialization ParameterSet: \"" + daq_pset.to_string() + "\".";
101  return false;
102  }
103 
104  // pull out the Metric part of the ParameterSet
105  fhicl::ParameterSet metric_pset;
106  try
107  {
108  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
109  }
110  catch (...)
111  {} // OK if there's no metrics table defined in the FHiCL
112 
113  if (metric_pset.is_empty())
114  {
115  TLOG(TLVL_INFO) << "No metric plugins appear to be defined";
116  }
117  try
118  {
119  metricMan->initialize(metric_pset, app_name);
120  }
121  catch (...)
122  {
123  ExceptionHandler(ExceptionHandlerRethrow::no,
124  "Error loading metrics in RoutingManagerCore::initialize()");
125  }
126 
127  // create the requested RoutingPolicy
128  auto policy_plugin_spec = policy_pset_.get<std::string>("policy", "");
129  if (policy_plugin_spec.length() == 0)
130  {
131  TLOG(TLVL_ERROR)
132  << "No fragment generator (parameter name = \"policy\") was "
133  << "specified in the policy ParameterSet. The "
134  << "DAQ initialization PSet was \"" << daq_pset.to_string() << "\".";
135  return false;
136  }
137  try
138  {
139  policy_ = artdaq::makeRoutingManagerPolicy(policy_plugin_spec, policy_pset_);
140  }
141  catch (...)
142  {
143  std::stringstream exception_string;
144  exception_string << "Exception thrown during initialization of policy of type \""
145  << policy_plugin_spec << "\"";
146 
147  ExceptionHandler(ExceptionHandlerRethrow::no, exception_string.str());
148 
149  TLOG(TLVL_DEBUG + 32) << "FHiCL parameter set used to initialize the policy which threw an exception: " << policy_pset_.to_string();
150 
151  return false;
152  }
153 
154  rt_priority_ = daq_pset.get<int>("rt_priority", 0);
155  max_table_update_interval_ms_ = daq_pset.get<size_t>("table_update_interval_ms", 1000);
156  current_table_interval_ms_ = max_table_update_interval_ms_;
157  table_update_high_fraction_ = daq_pset.get<double>("table_update_interval_high_frac", 0.75);
158  table_update_low_fraction_ = daq_pset.get<double>("table_update_interval_low_frac", 0.5);
159 
160  // fetch the monitoring parameters and create the MonitoredQuantity instances
161  statsHelperPtr_->createCollectors(daq_pset, 100, 30.0, 60.0, TABLE_UPDATES_STAT_KEY);
162 
163  // create the requested TokenReceiver
164  token_receiver_ = std::make_unique<TokenReceiver>(token_receiver_pset_, policy_, max_table_update_interval_ms_);
165  token_receiver_->setStatsHelper(statsHelperPtr_, TOKENS_RECEIVED_STAT_KEY);
166  token_receiver_->startTokenReception();
167  token_receiver_->pauseTokenReception();
168 
169  table_listen_port_ = daq_pset.get<int>("table_update_port", 35556);
170 
171  shutdown_requested_.store(true);
172  if (listen_thread_ && listen_thread_->joinable())
173  {
174  listen_thread_->join();
175  }
176  shutdown_requested_.store(false);
177  TLOG(TLVL_INFO) << "Starting Listener Thread";
178 
179  try
180  {
181  listen_thread_ = std::make_unique<boost::thread>(&RoutingManagerCore::listen_, this);
182  }
183  catch (const boost::exception& e)
184  {
185  TLOG(TLVL_ERROR) << "Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
186  std::cerr << "Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
187  exit(5);
188  }
189  return true;
190 }
191 
192 bool artdaq::RoutingManagerCore::start(art::RunID id, uint64_t /*unused*/, uint64_t /*unused*/)
193 {
194  run_id_ = id;
195  stop_requested_.store(false);
196  pause_requested_.store(false);
197 
198  statsHelperPtr_->resetStatistics();
199 
200  metricMan->do_start();
201  table_update_count_ = 0;
202  token_receiver_->setRunNumber(run_id_.run());
203  token_receiver_->resumeTokenReception();
204 
205  TLOG(TLVL_INFO) << "Started run " << run_id_.run();
206  return true;
207 }
208 
209 bool artdaq::RoutingManagerCore::stop(uint64_t /*unused*/, uint64_t /*unused*/)
210 {
211  TLOG(TLVL_INFO) << "Stopping run " << run_id_.run()
212  << " after " << table_update_count_ << " table updates."
213  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
214  stop_requested_.store(true);
215  token_receiver_->pauseTokenReception();
216  run_id_ = art::RunID::flushRun();
217  return true;
218 }
219 
220 bool artdaq::RoutingManagerCore::pause(uint64_t /*unused*/, uint64_t /*unused*/)
221 {
222  TLOG(TLVL_INFO) << "Pausing run " << run_id_.run()
223  << " after " << table_update_count_ << " table updates."
224  << " and " << token_receiver_->getReceivedTokenCount() << " received tokens.";
225  pause_requested_.store(true);
226  return true;
227 }
228 
229 bool artdaq::RoutingManagerCore::resume(uint64_t /*unused*/, uint64_t /*unused*/)
230 {
231  TLOG(TLVL_DEBUG + 32) << "Resuming run " << run_id_.run();
232  pause_requested_.store(false);
233  metricMan->do_start();
234  return true;
235 }
236 
237 bool artdaq::RoutingManagerCore::shutdown(uint64_t /*unused*/)
238 {
239  shutdown_requested_.store(true);
240  if (listen_thread_ && listen_thread_->joinable())
241  {
242  listen_thread_->join();
243  }
244  token_receiver_->stopTokenReception();
245  policy_.reset();
246  metricMan->shutdown();
247  return true;
248 }
249 
250 bool artdaq::RoutingManagerCore::soft_initialize(fhicl::ParameterSet const& pset, uint64_t timeout, uint64_t timestamp)
251 {
252  TLOG(TLVL_INFO) << "soft_initialize method called with "
253  << "ParameterSet = \"" << pset.to_string()
254  << "\".";
255  return initialize(pset, timeout, timestamp);
256 }
257 
258 bool artdaq::RoutingManagerCore::reinitialize(fhicl::ParameterSet const& pset, uint64_t timeout, uint64_t timestamp)
259 {
260  TLOG(TLVL_INFO) << "reinitialize method called with "
261  << "ParameterSet = \"" << pset.to_string()
262  << "\".";
263  return initialize(pset, timeout, timestamp);
264 }
265 
267 {
268  if (rt_priority_ > 0)
269  {
270 #pragma GCC diagnostic push
271 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
272  sched_param s_param = {};
273  s_param.sched_priority = rt_priority_;
274  if (pthread_setschedparam(pthread_self(), SCHED_RR, &s_param) != 0)
275  {
276  TLOG(TLVL_WARNING) << "setting realtime priority failed";
277  }
278 #pragma GCC diagnostic pop
279  }
280 
281  // try-catch block here?
282 
283  // how to turn RT PRI off?
284  if (rt_priority_ > 0)
285  {
286 #pragma GCC diagnostic push
287 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
288  sched_param s_param = {};
289  s_param.sched_priority = rt_priority_;
290  int status = pthread_setschedparam(pthread_self(), SCHED_RR, &s_param);
291  if (status != 0)
292  {
293  TLOG(TLVL_ERROR)
294  << "Failed to set realtime priority to " << rt_priority_
295  << ", return code = " << status;
296  }
297 #pragma GCC diagnostic pop
298  }
299 
300  // MPI_Barrier(local_group_comm_);
301 
302  TLOG(TLVL_DEBUG + 32) << "Sending initial table.";
303  auto startTime = artdaq::MonitoredQuantity::getCurrentTime();
304  auto nextSendTime = startTime;
305  double delta_time;
306  while (!stop_requested_ && !pause_requested_)
307  {
308  receive_();
309  if (policy_->GetRoutingMode() == detail::RoutingManagerMode::EventBuilding || policy_->GetRoutingMode() == detail::RoutingManagerMode::RequestBasedEventBuilding)
310  {
311  startTime = artdaq::MonitoredQuantity::getCurrentTime();
312 
313  if (startTime >= nextSendTime)
314  {
315  auto table = policy_->GetCurrentTable();
316 
317  if (table.empty())
318  {
319  TLOG(TLVL_WARNING) << "Routing Policy generated Empty table for this routing interval (" << current_table_interval_ms_ << " ms)! This may indicate issues with the receivers, if it persists."
320  << " Next seqID=" << policy_->GetNextSequenceID() << ", Policy held tokens=" << policy_->GetHeldTokenCount();
321  }
322  else
323  {
324  send_event_table(table);
325  ++table_update_count_;
326  delta_time = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
327  statsHelperPtr_->addSample(TABLE_UPDATES_STAT_KEY, delta_time);
328  TLOG(TLVL_DEBUG + 34) << "process_fragments TABLE_UPDATES_STAT_KEY=" << delta_time;
329 
330  bool readyToReport = statsHelperPtr_->readyToReport();
331  if (readyToReport)
332  {
333  std::string statString = buildStatisticsString_();
334  TLOG(TLVL_INFO) << statString;
335  sendMetrics_();
336  }
337  }
338 
339  auto max_tokens = policy_->GetMaxNumberOfTokens();
340  if (max_tokens > 0)
341  {
342  auto frac = policy_->GetTokensUsedSinceLastUpdate() / static_cast<double>(max_tokens);
343  policy_->ResetTokensUsedSinceLastUpdate();
344  if (frac > table_update_high_fraction_) current_table_interval_ms_ = 9 * current_table_interval_ms_ / 10;
345  if (frac < table_update_low_fraction_) current_table_interval_ms_ = 11 * current_table_interval_ms_ / 10;
346  if (current_table_interval_ms_ > max_table_update_interval_ms_) current_table_interval_ms_ = max_table_update_interval_ms_;
347  if (current_table_interval_ms_ < 1) current_table_interval_ms_ = 1;
348  }
349  nextSendTime = startTime + current_table_interval_ms_ / 1000.0;
350  TLOG(TLVL_DEBUG + 32) << "current_table_interval_ms is now " << current_table_interval_ms_;
351  statsHelperPtr_->addSample(CURRENT_TABLE_INTERVAL_STAT_KEY, current_table_interval_ms_ / 1000.0);
352  }
353  else
354  {
355  usleep(current_table_interval_ms_ * 10); // 1/100 of the table update interval
356  }
357  }
358  }
359 
360  TLOG(TLVL_DEBUG + 32) << "stop_requested_ is " << stop_requested_ << ", pause_requested_ is " << pause_requested_ << ", exiting process_event_table loop";
361  policy_->Reset();
362  metricMan->do_stop();
363 }
364 
366 {
367  std::lock_guard<std::mutex> lk(fd_mutex_);
368  for (auto& dest : connected_fds_)
369  {
370  for (auto& connected_fd : dest.second)
371  {
372  auto header = detail::RoutingPacketHeader(packet.size());
373  TLOG(TLVL_DEBUG + 32) << "Sending table information for " << header.nEntries << " events to destination " << dest.first;
374  TRACE(16, "headerData:0x%016lx%016lx packetData:0x%016lx%016lx", ((unsigned long*)&header)[0], ((unsigned long*)&header)[1], ((unsigned long*)&packet[0])[0], ((unsigned long*)&packet[0])[1]); // NOLINT
375  auto sts = write(connected_fd, &header, sizeof(header));
376  if (sts != sizeof(header))
377  {
378  TLOG(TLVL_ERROR) << "Error sending routing header to fd " << connected_fd << ", rank " << dest.first;
379  }
380  else
381  {
382  sts = write(connected_fd, &packet[0], packet.size() * sizeof(detail::RoutingPacketEntry));
383  if (sts != static_cast<ssize_t>(packet.size() * sizeof(detail::RoutingPacketEntry)))
384  {
385  TLOG(TLVL_ERROR) << "Error sending routing table. sts=" << sts << "/" << packet.size() * sizeof(detail::RoutingPacketEntry) << ", fd=" << connected_fd << ", rank=" << dest.first;
386  }
387  }
388  }
389  }
390 }
391 
392 std::string artdaq::RoutingManagerCore::report(std::string const& /*unused*/) const
393 {
394  std::string resultString;
395 
396  // if we haven't been able to come up with any report so far, say so
397  auto tmpString = app_name + " run number = " + std::to_string(run_id_.run()) + ", table updates sent = " + std::to_string(table_update_count_) + ", Receiver tokens received = " + std::to_string(token_receiver_->getReceivedTokenCount());
398  return tmpString;
399 }
400 
401 std::string artdaq::RoutingManagerCore::buildStatisticsString_() const
402 {
403  std::ostringstream oss;
404  oss << app_name << " statistics:" << std::endl;
405 
406  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
407  if (mqPtr != nullptr)
408  {
409  artdaq::MonitoredQuantityStats stats;
410  mqPtr->getStats(stats);
411  oss << " Table Update statistics: "
412  << stats.recentSampleCount << " table updates sent at "
413  << stats.recentSampleRate << " table updates/sec, , monitor window = "
414  << stats.recentDuration << " sec" << std::endl;
415  oss << " Average times per table update: ";
416  if (stats.recentSampleRate > 0.0)
417  {
418  oss << " elapsed time = "
419  << (1.0 / stats.recentSampleRate) << " sec";
420  }
421  }
422 
423  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
424  if (mqPtr != nullptr)
425  {
426  artdaq::MonitoredQuantityStats stats;
427  mqPtr->getStats(stats);
428  oss << " Received Token statistics: "
429  << stats.recentSampleCount << " tokens received at "
430  << stats.recentSampleRate << " tokens/sec, , monitor window = "
431  << stats.recentDuration << " sec" << std::endl;
432  oss << " Average times per token: ";
433  if (stats.recentSampleRate > 0.0)
434  {
435  oss << " elapsed time = "
436  << (1.0 / stats.recentSampleRate) << " sec";
437  }
438  oss << ", input token wait time = "
439  << mqPtr->getRecentValueSum() << " sec" << std::endl;
440  }
441 
442  return oss.str();
443 }
444 
445 void artdaq::RoutingManagerCore::sendMetrics_()
446 {
447  if (metricMan)
448  {
449  auto mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TABLE_UPDATES_STAT_KEY);
450  if (mqPtr != nullptr)
451  {
452  artdaq::MonitoredQuantityStats stats;
453  mqPtr->getStats(stats);
454  metricMan->sendMetric("Table Update Count", stats.fullSampleCount, "updates", 1, MetricMode::LastPoint);
455  metricMan->sendMetric("Table Update Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
456  }
457 
458  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(TOKENS_RECEIVED_STAT_KEY);
459  if (mqPtr != nullptr)
460  {
461  artdaq::MonitoredQuantityStats stats;
462  mqPtr->getStats(stats);
463  metricMan->sendMetric("Receiver Token Count", stats.fullSampleCount, "updates", 1, MetricMode::LastPoint);
464  metricMan->sendMetric("Receiver Token Rate", stats.recentSampleRate, "updates/sec", 1, MetricMode::Average);
465  metricMan->sendMetric("Total Receiver Token Wait Time", mqPtr->getRecentValueSum(), "seconds", 3, MetricMode::Average);
466  }
467 
468  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(CURRENT_TABLE_INTERVAL_STAT_KEY);
469  if (mqPtr.get() != nullptr)
470  {
471  artdaq::MonitoredQuantityStats stats;
472  mqPtr->getStats(stats);
473  metricMan->sendMetric("Table Update Interval", stats.recentValueAverage, "s", 3, MetricMode::Average);
474  }
475  }
476 }
477 
478 void artdaq::RoutingManagerCore::listen_()
479 {
480  if (epoll_fd_ == -1)
481  {
482  epoll_fd_ = epoll_create1(0);
483  }
484  int listen_fd = -1;
485  while (shutdown_requested_ == false)
486  {
487  TLOG(TLVL_DEBUG + 33) << "listen_: Listening/accepting new connections on port " << table_listen_port_;
488  if (listen_fd == -1)
489  {
490  TLOG(TLVL_DEBUG + 32) << "listen_: Opening listener";
491  listen_fd = TCP_listen_fd(table_listen_port_, 0);
492  }
493  if (listen_fd == -1)
494  {
495  TLOG(TLVL_DEBUG + 32) << "listen_: Error creating listen_fd!";
496  break;
497  }
498 
499  int res;
500  timeval tv = {2, 0}; // maybe increase of some global "debugging" flag set???
501  fd_set rfds;
502  FD_ZERO(&rfds);
503  FD_SET(listen_fd, &rfds); // NOLINT
504 
505  res = select(listen_fd + 1, &rfds, static_cast<fd_set*>(nullptr), static_cast<fd_set*>(nullptr), &tv);
506  if (res > 0)
507  {
508  int sts;
509  sockaddr_un un;
510  socklen_t arglen = sizeof(un);
511  int fd;
512  TLOG(TLVL_DEBUG + 32) << "listen_: Calling accept";
513  fd = accept(listen_fd, reinterpret_cast<sockaddr*>(&un), &arglen); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
514  TLOG(TLVL_DEBUG + 32) << "listen_: Done with accept";
515 
516  TLOG(TLVL_DEBUG + 32) << "listen_: Reading connect message";
517  socklen_t lenlen = sizeof(tv);
518  /*sts=*/
519  setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, lenlen); // see man 7 socket.
520  detail::RoutingRequest rch;
521  uint64_t mark_us = TimeUtils::gettimeofday_us();
522  sts = read(fd, &rch, sizeof(rch));
523  uint64_t delta_us = TimeUtils::gettimeofday_us() - mark_us;
524  TLOG(TLVL_DEBUG + 32) << "listen_: Read of connect message took " << delta_us << " microseconds.";
525  if (sts != sizeof(rch))
526  {
527  TLOG(TLVL_DEBUG + 32) << "listen_: Wrong message header length received!";
528  close(fd);
529  continue;
530  }
531 
532  // check for "magic" and valid source_id(aka rank)
533  if (rch.header != ROUTING_MAGIC || !(rch.mode == detail::RoutingRequest::RequestMode::Connect))
534  {
535  TLOG(TLVL_DEBUG + 32) << "listen_: Wrong magic bytes in header! rch.header: " << std::hex << rch.header;
536  close(fd);
537  continue;
538  }
539 
540  // now add (new) connection
541  std::lock_guard<std::mutex> lk(fd_mutex_);
542  connected_fds_[rch.rank].insert(fd);
543  struct epoll_event ev;
544  ev.data.fd = fd;
545  ev.events = EPOLLIN;
546  epoll_ctl(epoll_fd_, EPOLL_CTL_ADD, fd, &ev);
547  TLOG(TLVL_INFO) << "listen_: New fd is " << fd << " for table receiver rank " << rch.rank;
548  }
549  else
550  {
551  TLOG(TLVL_DEBUG + 34) << "listen_: No connections in timeout interval!";
552  }
553  }
554 
555  TLOG(TLVL_INFO) << "listen_: Shutting down connection listener";
556  if (listen_fd != -1)
557  {
558  close(listen_fd);
559  }
560  std::lock_guard<std::mutex> lk(fd_mutex_);
561  for (auto& fd_set : connected_fds_)
562  {
563  for (auto& fd : fd_set.second)
564  {
565  epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, fd, nullptr);
566  close(fd);
567  }
568  }
569  connected_fds_.clear();
570 
571 } // listen_
572 
573 void artdaq::RoutingManagerCore::receive_()
574 {
575  if (epoll_fd_ == -1)
576  {
577  epoll_fd_ = epoll_create1(0);
578  }
579  std::vector<epoll_event> received_events(10);
580 
581  int nfds = 1;
582  while (nfds > 0)
583  {
584  std::lock_guard<std::mutex> lk(fd_mutex_);
585  nfds = epoll_wait(epoll_fd_, &received_events[0], received_events.size(), 1);
586  if (nfds == -1)
587  {
588  TLOG(TLVL_ERROR) << "Error status received from epoll_wait, exiting with code " << EXIT_FAILURE << ", errno=" << errno << " (" << strerror(errno) << ")";
589  perror("epoll_wait");
590  exit(EXIT_FAILURE);
591  }
592 
593  if (nfds > 0)
594  {
595  TLOG(TLVL_DEBUG + 35) << "Received " << nfds << " events on table sockets";
596  }
597  for (auto n = 0; n < nfds; ++n)
598  {
599  bool reading = true;
600  int sts = 0;
601  while (reading)
602  {
603  if ((received_events[n].events & EPOLLIN) != 0)
604  {
605  detail::RoutingRequest buff;
606  auto stss = read(received_events[n].data.fd, &buff, sizeof(detail::RoutingRequest) - sts);
607  sts += stss;
608  if (stss == 0)
609  {
610  TLOG(TLVL_INFO) << "Received 0-size request from " << find_fd_(received_events[n].data.fd);
611  reading = false;
612  }
613  else if (stss < 0 && errno == EAGAIN)
614  {
615  TLOG(TLVL_DEBUG + 32) << "No more requests from this rank. Continuing poll loop.";
616  reading = false;
617  }
618  else if (stss < 0)
619  {
620  TLOG(TLVL_ERROR) << "Error reading from request socket: sts=" << sts << ", errno=" << errno << " (" << strerror(errno) << ")";
621  close(received_events[n].data.fd);
622  epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd, nullptr);
623  reading = false;
624  }
625  else if (sts == sizeof(detail::RoutingRequest) && buff.header != ROUTING_MAGIC)
626  {
627  TLOG(TLVL_ERROR) << "Received invalid request from " << find_fd_(received_events[n].data.fd) << " sts=" << sts << ", header=" << std::hex << buff.header;
628  reading = false;
629  }
630  else if (sts == sizeof(detail::RoutingRequest))
631  {
632  reading = false;
633  sts = 0;
634  TLOG(TLVL_DEBUG + 33) << "Received request from " << buff.rank << " mode=" << detail::RoutingRequest::RequestModeToString(buff.mode);
635  detail::RoutingPacketEntry reply;
636 
637  switch (buff.mode)
638  {
639  case detail::RoutingRequest::RequestMode::Disconnect:
640  connected_fds_[buff.rank].erase(received_events[n].data.fd);
641  close(received_events[n].data.fd);
642  epoll_ctl(epoll_fd_, EPOLL_CTL_DEL, received_events[n].data.fd, nullptr);
643  break;
644 
645  case detail::RoutingRequest::RequestMode::Request:
646  reply = policy_->GetRouteForSequenceID(buff.sequence_id, buff.rank);
647  if (reply.sequence_id == buff.sequence_id)
648  {
649  TLOG(TLVL_DEBUG + 33) << "Reply to request from " << buff.rank << " with route to " << reply.destination_rank << " for sequence ID " << buff.sequence_id;
650  detail::RoutingPacketHeader hdr(1);
651  write(received_events[n].data.fd, &hdr, sizeof(hdr));
652  write(received_events[n].data.fd, &reply, sizeof(detail::RoutingPacketEntry));
653  }
654  else
655  {
656  TLOG(TLVL_DEBUG + 33) << "Unable to route request, replying with empty RoutingPacket";
657  detail::RoutingPacketHeader hdr(0);
658  write(received_events[n].data.fd, &hdr, sizeof(hdr));
659  }
660  break;
661  default:
662  TLOG(TLVL_WARNING) << "Received request from " << buff.rank << " with invalid mode " << detail::RoutingRequest::RequestModeToString(buff.mode) << " (currently only expecting Disconnect or Request)";
663  break;
664  }
665  }
666  }
667  else
668  {
669  TLOG(TLVL_DEBUG + 32) << "Received event mask " << received_events[n].events << " from table socket rank " << find_fd_(received_events[n].data.fd);
670  }
671  }
672  }
673  }
674 }
675 
676 int artdaq::RoutingManagerCore::find_fd_(int fd) const
677 {
678  for (auto& rank : connected_fds_)
679  {
680  if (rank.second.count(fd) != 0)
681  {
682  return rank.first;
683  }
684  }
685  return -1;
686 }
This class manages MonitoredQuantity instances for the *Core classes.
bool shutdown(uint64_t)
Shuts down the RoutingManagerCore.
bool start(art::RunID id, uint64_t, uint64_t)
Start the RoutingManagerCore.
A row of the Routing Table.
static std::string RequestModeToString(RequestMode m)
Convert a RequestMode enumeration value to string.
static const std::string TABLE_UPDATES_STAT_KEY
Key for the Table Update count MonitoredQuantity.
bool pause(uint64_t, uint64_t)
Pauses the RoutingManagerCore.
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
The header of the Routing Table, containing the magic bytes and the number of entries.
uint64_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Multiple sources sending to a single destination. RoutingManager pushes table updates to all senders...
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
bool initialize(fhicl::ParameterSet const &pset, uint64_t, uint64_t)
Processes the initialize request.
Multiple sources sending to a single destination. Table updates are triggered by senders requesting r...
bool stop(uint64_t, uint64_t)
Stops the RoutingManagerCore.
void process_event_table()
Main loop of the RoutingManagerCore. Determines when to send the next table update, asks the RoutingManagerPolicy for the table to send, and sends it.
bool soft_initialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Soft-Initializes the RoutingManagerCore.
static const std::string TOKENS_RECEIVED_STAT_KEY
Key for the Tokens Received MonitoredQuantity.
std::string report(std::string const &) const
Send a report on the current status of the RoutingManagerCore.
bool resume(uint64_t, uint64_t)
Resumes the RoutingManagerCore.
void send_event_table(detail::RoutingPacket packet)
Sends a detail::RoutingPacket to the table receivers.
RoutingManagerCore()
RoutingManagerCore Constructor.
static const std::string CURRENT_TABLE_INTERVAL_STAT_KEY
Key for the Current Table Interval MonitoredQuantity.
std::shared_ptr< RoutingManagerPolicy > makeRoutingManagerPolicy(std::string const &policy_plugin_spec, fhicl::ParameterSet const &ps)
Load a RoutingManagerPolicy plugin.
bool reinitialize(fhicl::ParameterSet const &pset, uint64_t timeout, uint64_t timestamp)
Reinitializes the RoutingManagerCore.