artdaq  v3_08_00
routingReceiver.cc
1 #define TRACE_NAME "RoutingReceiver"
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include <arpa/inet.h>
5 #include <netinet/in.h>
6 #include <poll.h>
7 #include <signal.h>
8 #include <sys/socket.h>
9 #include <sys/types.h>
10 #include <chrono>
11 #include <thread>
12 #include "artdaq/Application/LoadParameterSet.hh"
14 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
15 #include "artdaq/TransferPlugins/detail/HostMap.hh"
16 #include "canvas/Utilities/Exception.h"
17 #include "fhiclcpp/types/Atom.h"
18 #include "fhiclcpp/types/OptionalTable.h"
19 #include "fhiclcpp/types/TableFragment.h"
20 #include "proto/artdaqapp.hh"
21 
22 namespace artdaq {
27 {
28 public:
30  struct Config
31  {
33  fhicl::Atom<size_t> collection_time_ms{fhicl::Name{"collection_time_ms"}, fhicl::Comment{"Time to collect routing table updates between printing summaries"}, 1000};
35  fhicl::Atom<bool> print_verbose_info{fhicl::Name{"print_verbose_info"}, fhicl::Comment{"Print verbose information about each receiver detected in routing tables"}, true};
37  fhicl::Atom<size_t> graph_width{fhicl::Name{"graph_width"}, fhicl::Comment{"Width of the summary graph"}, 40};
38  fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig;
39  };
41  using Parameters = fhicl::WrappedTable<Config>;
42 
47  explicit RoutingReceiver(fhicl::ParameterSet const& pset)
48  : should_stop_(false)
49  , table_socket_(-1)
50  , routing_table_last_(0)
51  {
52  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
53 
54  // Validate parameters
55 
56  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
57  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
58  table_port_ = rmConfig.get<int>("table_update_port", 35556);
59  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
60 
61  host_map_ = MakeHostMap(pset);
62 
63  if (use_routing_master_) startTableReceiverThread_();
64  }
65 
70  {
71  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN";
72  should_stop_ = true;
73  if (routing_thread_.joinable()) routing_thread_.join();
74  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END.";
75  }
76 
81  std::map<Fragment::sequence_id_t, int> GetRoutingTable()
82  {
83  std::unique_lock<std::mutex> lk(routing_mutex_);
84  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
85  return routing_table_copy;
86  }
87 
92  std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable()
93  {
94  std::unique_lock<std::mutex> lk(routing_mutex_);
95  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
96  routing_table_.clear();
97  return routing_table_copy;
98  }
99 
104  hostMap_t GetHostMap() { return host_map_; }
105 
106 private:
107  void setupTableListener_()
108  {
109  int sts;
110  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
111  if (table_socket_ < 0)
112  {
113  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
114  exit(1);
115  }
116 
117  struct sockaddr_in si_me_request;
118 
119  int yes = 1;
120  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
121  {
122  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
123  exit(1);
124  }
125  memset(&si_me_request, 0, sizeof(si_me_request));
126  si_me_request.sin_family = AF_INET;
127  si_me_request.sin_port = htons(table_port_);
128  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
129  struct in_addr in_addr_s;
130  sts = inet_aton(table_address_.c_str(), &in_addr_s);
131  if (sts == 0)
132  {
133  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
134  }
135  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
136  if (bind(table_socket_, (struct sockaddr*)&si_me_request, sizeof(si_me_request)) == -1)
137  {
138  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
139  exit(1);
140  }
141 
142  struct ip_mreq mreq;
143  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
144  if (sts == -1)
145  {
146  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
147  exit(1);
148  }
149  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
150  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
151  {
152  TLOG(TLVL_ERROR) << "Unable to join multicast group";
153  exit(1);
154  }
155  }
156  void startTableReceiverThread_()
157  {
158  if (routing_thread_.joinable()) routing_thread_.join();
159  TLOG(TLVL_INFO) << "Starting Routing Thread";
160  try
161  {
162  routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this);
163  }
164  catch (const boost::exception& e)
165  {
166  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
167  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
168  exit(5);
169  }
170  }
171  void receiveTableUpdatesLoop_()
172  {
173  while (true)
174  {
175  if (should_stop_)
176  {
177  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
178  return;
179  }
180 
181  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
182  if (table_socket_ == -1)
183  {
184  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
185  setupTableListener_();
186  }
187  if (table_socket_ == -1)
188  {
189  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
190  return;
191  }
192 
193  struct pollfd fd;
194  fd.fd = table_socket_;
195  fd.events = POLLIN | POLLPRI;
196 
197  auto res = poll(&fd, 1, 1000);
198  if (res > 0)
199  {
200  auto first = artdaq::Fragment::InvalidSequenceID;
201  auto last = artdaq::Fragment::InvalidSequenceID;
202  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
204 
205  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
206  struct sockaddr_in from;
207  socklen_t len = sizeof(from);
208  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len);
209  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
210 
211  if (stss > static_cast<ssize_t>(sizeof(hdr)))
212  {
213  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
214  }
215  else
216  {
217  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
218  continue;
219  }
220 
221  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]);
222  if (hdr.header != ROUTING_MAGIC)
223  {
224  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
225  }
226  else
227  {
229  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
230  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
231  TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]);
232 
233  first = buffer[0].sequence_id;
234  last = buffer[buffer.size() - 1].sequence_id;
235 
236  if (first + hdr.nEntries - 1 != last)
237  {
238  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
239  continue;
240  }
241  auto thisSeqID = first;
242 
243  {
244  std::unique_lock<std::mutex> lck(routing_mutex_);
245  if (routing_table_.count(last) == 0)
246  {
247  for (auto entry : buffer)
248  {
249  if (thisSeqID != entry.sequence_id)
250  {
251  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
252  last = thisSeqID - 1;
253  break;
254  }
255  thisSeqID++;
256  if (routing_table_.count(entry.sequence_id))
257  {
258  if (routing_table_[entry.sequence_id] != entry.destination_rank)
259  {
260  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
261  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
262  << " I will use the original value!";
263  }
264  continue;
265  }
266  if (entry.sequence_id < routing_table_last_) continue;
267  routing_table_[entry.sequence_id] = entry.destination_rank;
268  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
269  << " -> Rank " << entry.destination_rank;
270  }
271  }
272 
273  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
274  if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
275 
276  auto counter = 0;
277  for (auto& entry : routing_table_)
278  {
279  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
280  counter++;
281  }
282  }
283 
284  if (last > routing_table_last_) routing_table_last_ = last;
285  }
286  }
287  }
288  }
289 
290 private:
291  bool use_routing_master_; ///< Read from routing_table_config.use_routing_master; when true the constructor starts the receiver thread
292  std::atomic<bool> should_stop_; ///< Set by the destructor; checked at the top of every receive loop pass
293  int table_port_; ///< Read from routing_table_config.table_update_port (default 35556)
294  std::string table_address_; ///< Read from routing_table_config.table_update_address (default "227.128.12.28")
295  int table_socket_; ///< Descriptor of the table update socket; -1 until setupTableListener_ opens it
296  std::map<Fragment::sequence_id_t, int> routing_table_; ///< Sequence ID -> destination rank; accessed under routing_mutex_
297  Fragment::sequence_id_t routing_table_last_; ///< Highest "last" sequence ID seen in an accepted packet; older entries are ignored
298  mutable std::mutex routing_mutex_; ///< Protects routing_table_
299  boost::thread routing_thread_; ///< Thread running receiveTableUpdatesLoop_
300  hostMap_t host_map_; ///< Built by MakeHostMap from the constructor's ParameterSet
301 };
302 } // namespace artdaq
303 
304 static bool sighandler_init = false;
305 static bool should_stop = false;
306 static void signal_handler(int signum)
307 {
308  // Messagefacility may already be gone at this point, TRACE ONLY!
309  TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!";
310 
311  should_stop = true;
312 
313  sigset_t set;
314  pthread_sigmask(SIG_UNBLOCK, NULL, &set);
315  pthread_sigmask(SIG_UNBLOCK, &set, NULL);
316 }
317 
318 int main(int argc, char* argv[])
319 {
320  artdaq::configureMessageFacility("RoutingReceiver", false, false);
321  static std::mutex sighandler_mutex;
322  std::unique_lock<std::mutex> lk(sighandler_mutex);
323 
324  if (!sighandler_init) //&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection
325  {
326  sighandler_init = true;
327  std::vector<int> signals = {SIGINT, SIGTERM, SIGUSR1, SIGUSR2}; // SIGQUIT is used by art in normal operation
328  for (auto signal : signals)
329  {
330  struct sigaction old_action;
331  sigaction(signal, NULL, &old_action);
332 
333  //If the old handler wasn't SIG_IGN (it's a handler that just
334  // "ignore" the signal)
335  if (old_action.sa_handler != SIG_IGN)
336  {
337  struct sigaction action;
338  action.sa_handler = signal_handler;
339  sigemptyset(&action.sa_mask);
340  for (auto sigblk : signals)
341  {
342  sigaddset(&action.sa_mask, sigblk);
343  }
344  action.sa_flags = 0;
345 
346  //Replace the signal handler of SIGINT with the one described by new_action
347  sigaction(signal, &action, NULL);
348  }
349  }
350  }
351 
352  fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers");
353  fhicl::ParameterSet config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps);
354  fhicl::ParameterSet metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps);
355  fhicl::ParameterSet fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps);
356 
357  artdaq::RoutingReceiver rr(fr_ps);
358 
359  auto host_map = rr.GetHostMap();
360 
361  size_t collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000);
362  size_t max_graph_width = init_ps.get<size_t>("max_graph_width", 100);
363  bool print_verbose = init_ps.get<bool>("print_verbose_info", true);
364  bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true);
365 
366  auto blue = "\033[34m";
367  auto cyan = "\033[36m";
368  auto green = "\033[32m";
369  auto yellow = "\033[93m";
370  auto red = "\033[31m";
371 
372  metricMan->initialize(metric_ps, "RoutingReceiver");
373  metricMan->do_start();
374  if (print_verbose && verbose_clear_screen) std::cout << "\033[2J";
375 
376  std::map<int, int> receiver_table = std::map<int, int>();
377 
378  while (!should_stop)
379  {
380  auto start_time = std::chrono::steady_clock::now();
381 
382  auto this_table = rr.GetAndClearRoutingTable();
383 
384  if (this_table.size() > 0)
385  {
386  auto graph_width = this_table.size();
387  auto n = 1; // n becomes entries per graph character
388  auto graph_width_orig = graph_width;
389  while (graph_width > max_graph_width)
390  {
391  n++;
392  graph_width = graph_width_orig / n;
393  }
394 
395  for (auto& entry : this_table)
396  {
397  receiver_table[entry.second]++;
398  }
399 
400  auto average_entries_per_receiver = this_table.size() / receiver_table.size();
401  auto offset = 2 * n; // Offset is 2 characters, in entries
402 
403  auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n;
404  auto green_threshold = (average_entries_per_receiver - offset) / n;
405  auto yellow_threshold = (average_entries_per_receiver + offset) / n;
406  auto red_threshold = (2 * average_entries_per_receiver) / n;
407 
408  TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold;
409 
410  std::ostringstream report;
411  std::ostringstream verbose_report;
412 
413  if (print_verbose && verbose_clear_screen) std::cout << "\033[;H\033[J";
414 
415  report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, ";
416  for (auto& receiver : receiver_table)
417  {
418  auto percent = static_cast<int>(receiver.second * 100 / this_table.size());
419  report << receiver.first << ": " << receiver.second << " (" << percent << "%), ";
420  if (print_verbose)
421  {
422  verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t[";
423 
424  size_t graph_characters = receiver.second / n;
425 
426  for (size_t ii = 0; ii < graph_characters; ++ii)
427  {
428  if (ii < cyan_threshold)
429  {
430  verbose_report << blue;
431  }
432  else if (ii < green_threshold)
433  {
434  verbose_report << cyan;
435  }
436  else if (ii < yellow_threshold)
437  {
438  verbose_report << green;
439  }
440  else if (ii < red_threshold)
441  {
442  verbose_report << yellow;
443  }
444  else
445  {
446  verbose_report << red;
447  }
448  verbose_report << "|";
449  }
450  std::string spaces = std::string(graph_width - graph_characters, ' ');
451  verbose_report << "\033[0m" << spaces << "]" << std::endl;
452  }
453  receiver.second = 0;
454  }
455  TLOG(TLVL_INFO) << report.str();
456  std::cout << report.str() << std::endl;
457  if (print_verbose) std::cout << verbose_report.str() << std::endl;
458  }
459  std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms));
460  }
461 
462  metricMan->do_stop();
464 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
~RoutingReceiver()
RoutingReceiver Destructor.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:66
fhicl::Atom< bool > print_verbose_info
"print_verbose_info" (Default: true): Print verbose information about each receiver detected in routi...
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
fhicl::Atom< size_t > collection_time_ms
"collection_time_ms": Time to collect routing table updates between printing summaries ...
RoutingReceiver(fhicl::ParameterSet const &pset)
RoutingReceiver Constructor.
fhicl::TableFragment< artdaq::artdaqapp::Config > artdaqAppConfig
Configuration for artdaq Application (BoardReader, etc)
Accepted configuration parameters for RoutingReceiver.
std::map< Fragment::sequence_id_t, int > GetRoutingTable()
Get the current routing table.
fhicl::WrappedTable< Config > Parameters
Used for ParameterSet validation (if desired)
The header of the Routing Table, containing the magic bytes and the number of entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Class which receives routing tables and prints updates.
hostMap_t GetHostMap()
Get the host map.
std::map< Fragment::sequence_id_t, int > GetAndClearRoutingTable()
Get the current routing table, additionally clearing all entries.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
fhicl::Atom< size_t > graph_width
"graph_width": Width of the summary graph