artdaq  v3_10_02
routingReceiver.cc
1 #define TRACE_NAME "RoutingReceiver"
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include <arpa/inet.h>
5 #include <netinet/in.h>
6 #include <poll.h>
7 #include <sys/socket.h>
8 #include <sys/types.h>
9 #include <chrono>
10 #include <csignal>
11 #include <thread>
12 #include "artdaq/Application/LoadParameterSet.hh"
14 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
15 #include "artdaq/TransferPlugins/detail/HostMap.hh"
16 #include "canvas/Utilities/Exception.h"
17 #include "fhiclcpp/types/Atom.h"
18 #include "fhiclcpp/types/OptionalTable.h"
19 #include "fhiclcpp/types/TableFragment.h"
20 #include "proto/artdaqapp.hh"
21 
22 namespace artdaq {
27 {
28 public:
30  struct Config
31  {
33  fhicl::Atom<size_t> collection_time_ms{fhicl::Name{"collection_time_ms"}, fhicl::Comment{"Time to collect routing table updates between printing summaries"}, 1000};
35  fhicl::Atom<bool> print_verbose_info{fhicl::Name{"print_verbose_info"}, fhicl::Comment{"Print verbose information about each receiver detected in routing tables"}, true};
37  fhicl::Atom<size_t> graph_width{fhicl::Name{"graph_width"}, fhicl::Comment{"Width of the summary graph"}, 40};
38  fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig;
39  };
41  using Parameters = fhicl::WrappedTable<Config>;
42 
47  explicit RoutingReceiver(fhicl::ParameterSet const& pset)
48  : should_stop_(false)
49  , table_socket_(-1)
50  , routing_table_last_(0)
51  {
52  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
53 
54  // Validate parameters
55 
56  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
57  use_routing_manager_ = rmConfig.get<bool>("use_routing_manager", false);
58  table_port_ = rmConfig.get<int>("table_update_port", 35556);
59  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
60 
61  host_map_ = MakeHostMap(pset);
62 
63  if (use_routing_manager_)
64  {
65  startTableReceiverThread_();
66  }
67  }
68 
73  {
74  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN";
75  should_stop_ = true;
76  try
77  {
78  if (routing_thread_.joinable())
79  {
80  routing_thread_.join();
81  }
82  }
83  catch (...)
84  {
85  // IGNORED
86  }
87  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END.";
88  }
89 
94  std::map<Fragment::sequence_id_t, int> GetRoutingTable()
95  {
96  std::unique_lock<std::mutex> lk(routing_mutex_);
97  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
98  return routing_table_copy;
99  }
100 
105  std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable()
106  {
107  std::unique_lock<std::mutex> lk(routing_mutex_);
108  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
109  routing_table_.clear();
110  return routing_table_copy;
111  }
112 
117  hostMap_t GetHostMap() { return host_map_; }
118 
119 private:
120  RoutingReceiver(RoutingReceiver const&) = delete;
121  RoutingReceiver(RoutingReceiver&&) = delete;
122  RoutingReceiver& operator=(RoutingReceiver const&) = delete;
123  RoutingReceiver& operator=(RoutingReceiver&&) = delete;
124 
125  void setupTableListener_()
126  {
127  int sts;
128  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
129  if (table_socket_ < 0)
130  {
131  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
132  exit(1);
133  }
134 
135  struct sockaddr_in si_me_request;
136 
137  int yes = 1;
138  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
139  {
140  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
141  exit(1);
142  }
143  memset(&si_me_request, 0, sizeof(si_me_request));
144  si_me_request.sin_family = AF_INET;
145  si_me_request.sin_port = htons(table_port_);
146  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
147  struct in_addr in_addr_s;
148  sts = inet_aton(table_address_.c_str(), &in_addr_s);
149  if (sts == 0)
150  {
151  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
152  }
153  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
154  if (bind(table_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1) // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
155  {
156  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
157  exit(1);
158  }
159 
160  struct ip_mreq mreq;
161  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
162  if (sts == -1)
163  {
164  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
165  exit(1);
166  }
167  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
168  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
169  {
170  TLOG(TLVL_ERROR) << "Unable to join multicast group";
171  exit(1);
172  }
173  }
174  void startTableReceiverThread_()
175  {
176  if (routing_thread_.joinable())
177  {
178  routing_thread_.join();
179  }
180  TLOG(TLVL_INFO) << "Starting Routing Thread";
181  try
182  {
183  routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this);
184  char tname[16]; // Size 16 - see man page pthread_setname_np(3) and/or prctl(2)
185  snprintf(tname, sizeof(tname) - 1, "%s", "RoutingReceive"); // NOLINT
186  tname[sizeof(tname) - 1] = '\0'; // assure term. snprintf is not too evil :)
187  auto handle = routing_thread_.native_handle();
188  pthread_setname_np(handle, tname);
189  }
190  catch (const boost::exception& e)
191  {
192  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
193  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
194  exit(5);
195  }
196  }
197  void receiveTableUpdatesLoop_()
198  {
199  while (true)
200  {
201  if (should_stop_)
202  {
203  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
204  return;
205  }
206 
207  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
208  if (table_socket_ == -1)
209  {
210  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
211  setupTableListener_();
212  }
213  if (table_socket_ == -1)
214  {
215  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
216  return;
217  }
218 
219  struct pollfd fd;
220  fd.fd = table_socket_;
221  fd.events = POLLIN | POLLPRI;
222 
223  auto res = poll(&fd, 1, 1000);
224  if (res > 0)
225  {
226  auto first = artdaq::Fragment::InvalidSequenceID;
227  auto last = artdaq::Fragment::InvalidSequenceID;
228  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
230 
231  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
232  struct sockaddr_in from;
233  socklen_t len = sizeof(from);
234  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, reinterpret_cast<struct sockaddr*>(&from), &len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
235  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
236 
237  if (stss > static_cast<ssize_t>(sizeof(hdr)))
238  {
239  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
240  }
241  else
242  {
243  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
244  continue;
245  }
246 
247  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]); // NOLINT
248  if (hdr.header != ROUTING_MAGIC)
249  {
250  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
251  }
252  else
253  {
255  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
256  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
257  TRACE(TLVL_DEBUG + 3, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]); // NOLINT
258 
259  first = buffer[0].sequence_id;
260  last = buffer[buffer.size() - 1].sequence_id;
261 
262  if (first + hdr.nEntries - 1 != last)
263  {
264  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
265  continue;
266  }
267  auto thisSeqID = first;
268 
269  {
270  std::unique_lock<std::mutex> lck(routing_mutex_);
271  if (routing_table_.count(last) == 0)
272  {
273  for (auto entry : buffer)
274  {
275  if (thisSeqID != entry.sequence_id)
276  {
277  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
278  last = thisSeqID - 1;
279  break;
280  }
281  thisSeqID++;
282  if (routing_table_.count(entry.sequence_id) != 0u)
283  {
284  if (routing_table_[entry.sequence_id] != entry.destination_rank)
285  {
286  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
287  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
288  << " I will use the original value!";
289  }
290  continue;
291  }
292  if (entry.sequence_id < routing_table_last_)
293  {
294  continue;
295  }
296  routing_table_[entry.sequence_id] = entry.destination_rank;
297  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
298  << " -> Rank " << entry.destination_rank;
299  }
300  }
301 
302  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
303  if (!routing_table_.empty())
304  {
305  TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
306  }
307 
308  auto counter = 0;
309  for (auto& entry : routing_table_)
310  {
311  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
312  counter++;
313  }
314  }
315 
316  if (last > routing_table_last_)
317  {
318  routing_table_last_ = last;
319  }
320  }
321  }
322  }
323  }
324 
325 private:
326  bool use_routing_manager_;
327  std::atomic<bool> should_stop_;
328  int table_port_;
329  std::string table_address_;
330  int table_socket_;
331  std::map<Fragment::sequence_id_t, int> routing_table_;
332  Fragment::sequence_id_t routing_table_last_;
333  mutable std::mutex routing_mutex_;
334  boost::thread routing_thread_;
335  hostMap_t host_map_;
336 };
337 } // namespace artdaq
338 
339 static bool sighandler_init = false;
340 static bool should_stop = false;
341 static void signal_handler(int signum)
342 {
343  // Messagefacility may already be gone at this point, TRACE ONLY!
344 #if TRACE_REVNUM < 1459
345  TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0)
346 #else
347  TRACE_STREAMER(TLVL_ERROR, TLOG2("routingReceiver", 0), 0)
348 #endif
349  << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!";
350 
351  should_stop = true;
352 
353  sigset_t set;
354  pthread_sigmask(SIG_UNBLOCK, nullptr, &set);
355  pthread_sigmask(SIG_UNBLOCK, &set, nullptr);
356 }
357 
358 int main(int argc, char* argv[]) try
359 {
360  artdaq::configureMessageFacility("RoutingReceiver", false, false);
361  static std::mutex sighandler_mutex;
362  std::unique_lock<std::mutex> lk(sighandler_mutex);
363 
364  if (!sighandler_init) //&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection
365  {
366  sighandler_init = true;
367  std::vector<int> signals = {SIGINT, SIGTERM, SIGUSR1, SIGUSR2}; // SIGQUIT is used by art in normal operation
368  for (auto signal : signals)
369  {
370  struct sigaction old_action;
371  sigaction(signal, nullptr, &old_action);
372 
373  //If the old handler wasn't SIG_IGN (it's a handler that just
374  // "ignore" the signal)
375  if (old_action.sa_handler != SIG_IGN) // NOLINT(cppcoreguidelines-pro-type-cstyle-cast)
376  {
377  struct sigaction action;
378  action.sa_handler = signal_handler;
379  sigemptyset(&action.sa_mask);
380  for (auto sigblk : signals)
381  {
382  sigaddset(&action.sa_mask, sigblk);
383  }
384  action.sa_flags = 0;
385 
386  //Replace the signal handler of SIGINT with the one described by new_action
387  sigaction(signal, &action, nullptr);
388  }
389  }
390  }
391 
392  fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers");
393  auto config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps);
394  auto metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps);
395  auto fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps);
396 
397  artdaq::RoutingReceiver rr(fr_ps);
398 
399  auto host_map = rr.GetHostMap();
400 
401  auto collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000);
402  auto max_graph_width = init_ps.get<size_t>("max_graph_width", 100);
403  bool print_verbose = init_ps.get<bool>("print_verbose_info", true);
404  bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true);
405 
406  auto blue = "\033[34m";
407  auto cyan = "\033[36m";
408  auto green = "\033[32m";
409  auto yellow = "\033[93m";
410  auto red = "\033[31m";
411 
412  metricMan->initialize(metric_ps, "RoutingReceiver");
413  metricMan->do_start();
414  if (print_verbose && verbose_clear_screen)
415  {
416  std::cout << "\033[2J";
417  }
418 
419  std::map<int, int> receiver_table = std::map<int, int>();
420 
421  while (!should_stop)
422  {
423  auto start_time = std::chrono::steady_clock::now();
424 
425  auto this_table = rr.GetAndClearRoutingTable();
426 
427  if (!this_table.empty())
428  {
429  auto graph_width = this_table.size();
430  auto n = 1; // n becomes entries per graph character
431  auto graph_width_orig = graph_width;
432  while (graph_width > max_graph_width)
433  {
434  n++;
435  graph_width = graph_width_orig / n;
436  }
437 
438  for (auto& entry : this_table)
439  {
440  receiver_table[entry.second]++;
441  }
442 
443  auto average_entries_per_receiver = this_table.size() / receiver_table.size();
444  auto offset = 2 * n; // Offset is 2 characters, in entries
445 
446  auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n;
447  auto green_threshold = (average_entries_per_receiver - offset) / n;
448  auto yellow_threshold = (average_entries_per_receiver + offset) / n;
449  auto red_threshold = (2 * average_entries_per_receiver) / n;
450 
451  TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold;
452 
453  std::ostringstream report;
454  std::ostringstream verbose_report;
455 
456  if (print_verbose && verbose_clear_screen)
457  {
458  std::cout << "\033[;H\033[J";
459  }
460 
461  report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, ";
462  for (auto& receiver : receiver_table)
463  {
464  auto percent = static_cast<int>(receiver.second * 100 / this_table.size());
465  report << receiver.first << ": " << receiver.second << " (" << percent << "%), ";
466  if (print_verbose)
467  {
468  verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t[";
469 
470  size_t graph_characters = receiver.second / n;
471 
472  for (size_t ii = 0; ii < graph_characters; ++ii)
473  {
474  if (ii < cyan_threshold)
475  {
476  verbose_report << blue;
477  }
478  else if (ii < green_threshold)
479  {
480  verbose_report << cyan;
481  }
482  else if (ii < yellow_threshold)
483  {
484  verbose_report << green;
485  }
486  else if (ii < red_threshold)
487  {
488  verbose_report << yellow;
489  }
490  else
491  {
492  verbose_report << red;
493  }
494  verbose_report << "|";
495  }
496  std::string spaces = std::string(graph_width - graph_characters, ' ');
497  verbose_report << "\033[0m" << spaces << "]" << std::endl;
498  }
499  receiver.second = 0;
500  }
501  TLOG(TLVL_INFO) << report.str();
502  std::cout << report.str() << std::endl;
503  if (print_verbose)
504  {
505  std::cout << verbose_report.str() << std::endl;
506  }
507  }
508  std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms));
509  }
510 
511  metricMan->do_stop();
513 
514  return 0;
515 }
516 catch (...)
517 {
518  return -1;
519 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
~RoutingReceiver()
RoutingReceiver Destructor.
fhicl::Atom< bool > print_verbose_info
"print_verbose_info" (Default: true): Print verbose information about each receiver detected in routing tables
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
fhicl::Atom< size_t > collection_time_ms
"collection_time_ms": Time to collect routing table updates between printing summaries ...
RoutingReceiver(fhicl::ParameterSet const &pset)
RoutingReceiver Constructor.
fhicl::TableFragment< artdaq::artdaqapp::Config > artdaqAppConfig
Configuration for artdaq Application (BoardReader, etc)
Accepted configuration parameters for RoutingReceiver.
std::map< Fragment::sequence_id_t, int > GetRoutingTable()
Get the current routing table.
fhicl::WrappedTable< Config > Parameters
Used for ParameterSet validation (if desired)
The header of the Routing Table, containing the magic bytes and the number of entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Class which receives routing tables and prints updates.
hostMap_t GetHostMap()
Get the host map.
std::map< Fragment::sequence_id_t, int > GetAndClearRoutingTable()
Get the current routing table, additionally clearing all entries.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
fhicl::Atom< size_t > graph_width
"graph_width": Width of the summary graph