artdaq  v3_09_00
routingReceiver.cc
1 #define TRACE_NAME "RoutingReceiver"
#include "artdaq/DAQdata/Globals.hh"

#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>
#include <chrono>
#include <csignal>
#include <exception>
#include <iostream>
#include <thread>
#include "artdaq/Application/LoadParameterSet.hh"
#include "artdaq/DAQrate/detail/RoutingPacket.hh"
#include "artdaq/TransferPlugins/detail/HostMap.hh"
#include "canvas/Utilities/Exception.h"
#include "fhiclcpp/types/Atom.h"
#include "fhiclcpp/types/OptionalTable.h"
#include "fhiclcpp/types/TableFragment.h"
#include "proto/artdaqapp.hh"
21 
22 namespace artdaq {
27 {
28 public:
30  struct Config
31  {
33  fhicl::Atom<size_t> collection_time_ms{fhicl::Name{"collection_time_ms"}, fhicl::Comment{"Time to collect routing table updates between printing summaries"}, 1000};
35  fhicl::Atom<bool> print_verbose_info{fhicl::Name{"print_verbose_info"}, fhicl::Comment{"Print verbose information about each receiver detected in routing tables"}, true};
37  fhicl::Atom<size_t> graph_width{fhicl::Name{"graph_width"}, fhicl::Comment{"Width of the summary graph"}, 40};
38  fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig;
39  };
41  using Parameters = fhicl::WrappedTable<Config>;
42 
47  explicit RoutingReceiver(fhicl::ParameterSet const& pset)
48  : should_stop_(false)
49  , table_socket_(-1)
50  , routing_table_last_(0)
51  {
52  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
53 
54  // Validate parameters
55 
56  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
57  use_routing_manager_ = rmConfig.get<bool>("use_routing_manager", false);
58  table_port_ = rmConfig.get<int>("table_update_port", 35556);
59  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
60 
61  host_map_ = MakeHostMap(pset);
62 
63  if (use_routing_manager_)
64  {
65  startTableReceiverThread_();
66  }
67  }
68 
73  {
74  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN";
75  should_stop_ = true;
76  try
77  {
78  if (routing_thread_.joinable())
79  {
80  routing_thread_.join();
81  }
82  }
83  catch (...)
84  {
85  // IGNORED
86  }
87  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END.";
88  }
89 
94  std::map<Fragment::sequence_id_t, int> GetRoutingTable()
95  {
96  std::unique_lock<std::mutex> lk(routing_mutex_);
97  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
98  return routing_table_copy;
99  }
100 
105  std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable()
106  {
107  std::unique_lock<std::mutex> lk(routing_mutex_);
108  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
109  routing_table_.clear();
110  return routing_table_copy;
111  }
112 
117  hostMap_t GetHostMap() { return host_map_; }
118 
119 private:
120  RoutingReceiver(RoutingReceiver const&) = delete;
121  RoutingReceiver(RoutingReceiver&&) = delete;
122  RoutingReceiver& operator=(RoutingReceiver const&) = delete;
123  RoutingReceiver& operator=(RoutingReceiver&&) = delete;
124 
125  void setupTableListener_()
126  {
127  int sts;
128  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
129  if (table_socket_ < 0)
130  {
131  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
132  exit(1);
133  }
134 
135  struct sockaddr_in si_me_request;
136 
137  int yes = 1;
138  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
139  {
140  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
141  exit(1);
142  }
143  memset(&si_me_request, 0, sizeof(si_me_request));
144  si_me_request.sin_family = AF_INET;
145  si_me_request.sin_port = htons(table_port_);
146  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
147  struct in_addr in_addr_s;
148  sts = inet_aton(table_address_.c_str(), &in_addr_s);
149  if (sts == 0)
150  {
151  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
152  }
153  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
154  if (bind(table_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1) // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
155  {
156  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
157  exit(1);
158  }
159 
160  struct ip_mreq mreq;
161  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
162  if (sts == -1)
163  {
164  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
165  exit(1);
166  }
167  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
168  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
169  {
170  TLOG(TLVL_ERROR) << "Unable to join multicast group";
171  exit(1);
172  }
173  }
174  void startTableReceiverThread_()
175  {
176  if (routing_thread_.joinable())
177  {
178  routing_thread_.join();
179  }
180  TLOG(TLVL_INFO) << "Starting Routing Thread";
181  try
182  {
183  routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this);
184  }
185  catch (const boost::exception& e)
186  {
187  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
188  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
189  exit(5);
190  }
191  }
192  void receiveTableUpdatesLoop_()
193  {
194  while (true)
195  {
196  if (should_stop_)
197  {
198  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
199  return;
200  }
201 
202  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
203  if (table_socket_ == -1)
204  {
205  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
206  setupTableListener_();
207  }
208  if (table_socket_ == -1)
209  {
210  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
211  return;
212  }
213 
214  struct pollfd fd;
215  fd.fd = table_socket_;
216  fd.events = POLLIN | POLLPRI;
217 
218  auto res = poll(&fd, 1, 1000);
219  if (res > 0)
220  {
221  auto first = artdaq::Fragment::InvalidSequenceID;
222  auto last = artdaq::Fragment::InvalidSequenceID;
223  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
225 
226  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
227  struct sockaddr_in from;
228  socklen_t len = sizeof(from);
229  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, reinterpret_cast<struct sockaddr*>(&from), &len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
230  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
231 
232  if (stss > static_cast<ssize_t>(sizeof(hdr)))
233  {
234  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
235  }
236  else
237  {
238  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
239  continue;
240  }
241 
242  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]); // NOLINT
243  if (hdr.header != ROUTING_MAGIC)
244  {
245  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
246  }
247  else
248  {
250  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
251  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
252  TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]); // NOLINT
253 
254  first = buffer[0].sequence_id;
255  last = buffer[buffer.size() - 1].sequence_id;
256 
257  if (first + hdr.nEntries - 1 != last)
258  {
259  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
260  continue;
261  }
262  auto thisSeqID = first;
263 
264  {
265  std::unique_lock<std::mutex> lck(routing_mutex_);
266  if (routing_table_.count(last) == 0)
267  {
268  for (auto entry : buffer)
269  {
270  if (thisSeqID != entry.sequence_id)
271  {
272  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
273  last = thisSeqID - 1;
274  break;
275  }
276  thisSeqID++;
277  if (routing_table_.count(entry.sequence_id) != 0u)
278  {
279  if (routing_table_[entry.sequence_id] != entry.destination_rank)
280  {
281  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
282  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
283  << " I will use the original value!";
284  }
285  continue;
286  }
287  if (entry.sequence_id < routing_table_last_)
288  {
289  continue;
290  }
291  routing_table_[entry.sequence_id] = entry.destination_rank;
292  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
293  << " -> Rank " << entry.destination_rank;
294  }
295  }
296 
297  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
298  if (!routing_table_.empty())
299  {
300  TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
301  }
302 
303  auto counter = 0;
304  for (auto& entry : routing_table_)
305  {
306  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
307  counter++;
308  }
309  }
310 
311  if (last > routing_table_last_)
312  {
313  routing_table_last_ = last;
314  }
315  }
316  }
317  }
318  }
319 
320 private:
321  bool use_routing_manager_;
322  std::atomic<bool> should_stop_;
323  int table_port_;
324  std::string table_address_;
325  int table_socket_;
326  std::map<Fragment::sequence_id_t, int> routing_table_;
327  Fragment::sequence_id_t routing_table_last_;
328  mutable std::mutex routing_mutex_;
329  boost::thread routing_thread_;
330  hostMap_t host_map_;
331 };
332 } // namespace artdaq
333 
334 static bool sighandler_init = false;
335 static bool should_stop = false;
336 static void signal_handler(int signum)
337 {
338  // Messagefacility may already be gone at this point, TRACE ONLY!
339  TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!";
340 
341  should_stop = true;
342 
343  sigset_t set;
344  pthread_sigmask(SIG_UNBLOCK, nullptr, &set);
345  pthread_sigmask(SIG_UNBLOCK, &set, nullptr);
346 }
347 
348 int main(int argc, char* argv[])
349 try
350 {
351  artdaq::configureMessageFacility("RoutingReceiver", false, false);
352  static std::mutex sighandler_mutex;
353  std::unique_lock<std::mutex> lk(sighandler_mutex);
354 
355  if (!sighandler_init) //&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection
356  {
357  sighandler_init = true;
358  std::vector<int> signals = {SIGINT, SIGTERM, SIGUSR1, SIGUSR2}; // SIGQUIT is used by art in normal operation
359  for (auto signal : signals)
360  {
361  struct sigaction old_action;
362  sigaction(signal, nullptr, &old_action);
363 
364  //If the old handler wasn't SIG_IGN (it's a handler that just
365  // "ignore" the signal)
366  if (old_action.sa_handler != SIG_IGN) // NOLINT(cppcoreguidelines-pro-type-cstyle-cast)
367  {
368  struct sigaction action;
369  action.sa_handler = signal_handler;
370  sigemptyset(&action.sa_mask);
371  for (auto sigblk : signals)
372  {
373  sigaddset(&action.sa_mask, sigblk);
374  }
375  action.sa_flags = 0;
376 
377  //Replace the signal handler of SIGINT with the one described by new_action
378  sigaction(signal, &action, nullptr);
379  }
380  }
381  }
382 
383  fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers");
384  auto config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps);
385  auto metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps);
386  auto fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps);
387 
388  artdaq::RoutingReceiver rr(fr_ps);
389 
390  auto host_map = rr.GetHostMap();
391 
392  auto collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000);
393  auto max_graph_width = init_ps.get<size_t>("max_graph_width", 100);
394  bool print_verbose = init_ps.get<bool>("print_verbose_info", true);
395  bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true);
396 
397  auto blue = "\033[34m";
398  auto cyan = "\033[36m";
399  auto green = "\033[32m";
400  auto yellow = "\033[93m";
401  auto red = "\033[31m";
402 
403  metricMan->initialize(metric_ps, "RoutingReceiver");
404  metricMan->do_start();
405  if (print_verbose && verbose_clear_screen)
406  {
407  std::cout << "\033[2J";
408  }
409 
410  std::map<int, int> receiver_table = std::map<int, int>();
411 
412  while (!should_stop)
413  {
414  auto start_time = std::chrono::steady_clock::now();
415 
416  auto this_table = rr.GetAndClearRoutingTable();
417 
418  if (!this_table.empty())
419  {
420  auto graph_width = this_table.size();
421  auto n = 1; // n becomes entries per graph character
422  auto graph_width_orig = graph_width;
423  while (graph_width > max_graph_width)
424  {
425  n++;
426  graph_width = graph_width_orig / n;
427  }
428 
429  for (auto& entry : this_table)
430  {
431  receiver_table[entry.second]++;
432  }
433 
434  auto average_entries_per_receiver = this_table.size() / receiver_table.size();
435  auto offset = 2 * n; // Offset is 2 characters, in entries
436 
437  auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n;
438  auto green_threshold = (average_entries_per_receiver - offset) / n;
439  auto yellow_threshold = (average_entries_per_receiver + offset) / n;
440  auto red_threshold = (2 * average_entries_per_receiver) / n;
441 
442  TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold;
443 
444  std::ostringstream report;
445  std::ostringstream verbose_report;
446 
447  if (print_verbose && verbose_clear_screen)
448  {
449  std::cout << "\033[;H\033[J";
450  }
451 
452  report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, ";
453  for (auto& receiver : receiver_table)
454  {
455  auto percent = static_cast<int>(receiver.second * 100 / this_table.size());
456  report << receiver.first << ": " << receiver.second << " (" << percent << "%), ";
457  if (print_verbose)
458  {
459  verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t[";
460 
461  size_t graph_characters = receiver.second / n;
462 
463  for (size_t ii = 0; ii < graph_characters; ++ii)
464  {
465  if (ii < cyan_threshold)
466  {
467  verbose_report << blue;
468  }
469  else if (ii < green_threshold)
470  {
471  verbose_report << cyan;
472  }
473  else if (ii < yellow_threshold)
474  {
475  verbose_report << green;
476  }
477  else if (ii < red_threshold)
478  {
479  verbose_report << yellow;
480  }
481  else
482  {
483  verbose_report << red;
484  }
485  verbose_report << "|";
486  }
487  std::string spaces = std::string(graph_width - graph_characters, ' ');
488  verbose_report << "\033[0m" << spaces << "]" << std::endl;
489  }
490  receiver.second = 0;
491  }
492  TLOG(TLVL_INFO) << report.str();
493  std::cout << report.str() << std::endl;
494  if (print_verbose)
495  {
496  std::cout << verbose_report.str() << std::endl;
497  }
498  }
499  std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms));
500  }
501 
502  metricMan->do_stop();
504 
505  return 0;
506 }
507 catch (...)
508 {
509  return -1;
510 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
~RoutingReceiver()
RoutingReceiver Destructor.
fhicl::Atom< bool > print_verbose_info
"print_verbose_info" (Default: true): Print verbose information about each receiver detected in routing tables
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
fhicl::Atom< size_t > collection_time_ms
"collection_time_ms": Time to collect routing table updates between printing summaries ...
RoutingReceiver(fhicl::ParameterSet const &pset)
RoutingReceiver Constructor.
fhicl::TableFragment< artdaq::artdaqapp::Config > artdaqAppConfig
Configuration for artdaq Application (BoardReader, etc)
Accepted configuration parameters for RoutingReceiver.
std::map< Fragment::sequence_id_t, int > GetRoutingTable()
Get the current routing table.
fhicl::WrappedTable< Config > Parameters
Used for ParameterSet validation (if desired)
The header of the Routing Table, containing the magic bytes and the number of entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Class which receives routing tables and prints updates.
hostMap_t GetHostMap()
Get the host map.
std::map< Fragment::sequence_id_t, int > GetAndClearRoutingTable()
Get the current routing table, additionally clearing all entries.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
fhicl::Atom< size_t > graph_width
"graph_width": Width of the summary graph