artdaq  v3_04_01
routingReceiver.cc
1 #define TRACE_NAME "RoutingReceiver"
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include <chrono>
5 #include "canvas/Utilities/Exception.h"
6 #include <arpa/inet.h>
7 #include <netinet/in.h>
8 #include <sys/types.h>
9 #include <poll.h>
10 #include <signal.h>
11 #include <thread>
12 #include <sys/socket.h>
14 #include "artdaq/Application/LoadParameterSet.hh"
15 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
16 #include "artdaq/TransferPlugins/detail/HostMap.hh"
17 #include "proto/artdaqapp.hh"
18 #include "fhiclcpp/types/Atom.h"
19 #include "fhiclcpp/types/OptionalTable.h"
20 #include "fhiclcpp/types/TableFragment.h"
21 
22 namespace artdaq {
27  {
28  public:
30  struct Config
31  {
33  fhicl::Atom<size_t> collection_time_ms{ fhicl::Name{ "collection_time_ms" }, fhicl::Comment{ "Time to collect routing table updates between printing summaries" }, 1000 };
35  fhicl::Atom<bool> print_verbose_info{ fhicl::Name{ "print_verbose_info" }, fhicl::Comment{ "Print verbose information about each receiver detected in routing tables" }, true };
37  fhicl::Atom<size_t> graph_width{ fhicl::Name{ "graph_width" }, fhicl::Comment{ "Width of the summary graph" }, 40 };
38  fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig;
39  };
41  using Parameters = fhicl::WrappedTable<Config>;
42 
47  explicit RoutingReceiver(fhicl::ParameterSet const& pset)
48  : should_stop_(false)
49  , table_socket_(-1)
50  , routing_table_last_(0)
51  {
52  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
53 
54  // Validate parameters
55 
56  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
57  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
58  table_port_ = rmConfig.get<int>("table_update_port", 35556);
59  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
60 
61  host_map_ = MakeHostMap(pset);
62 
63  if (use_routing_master_) startTableReceiverThread_();
64  }
65 
70  {
71  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN";
72  should_stop_ = true;
73  if (routing_thread_.joinable()) routing_thread_.join();
74  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END.";
75  }
76 
81  std::map<Fragment::sequence_id_t, int> GetRoutingTable()
82  {
83  std::unique_lock<std::mutex> lk(routing_mutex_);
84  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
85  return routing_table_copy;
86  }
87 
92  std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable()
93  {
94  std::unique_lock<std::mutex> lk(routing_mutex_);
95  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
96  routing_table_.clear();
97  return routing_table_copy;
98  }
99 
104  hostMap_t GetHostMap() { return host_map_; }
105 
106  private:
107  void setupTableListener_()
108  {
109  int sts;
110  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
111  if (table_socket_ < 0)
112  {
113  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
114  exit(1);
115  }
116 
117  struct sockaddr_in si_me_request;
118 
119  int yes = 1;
120  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
121  {
122  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
123  exit(1);
124  }
125  memset(&si_me_request, 0, sizeof(si_me_request));
126  si_me_request.sin_family = AF_INET;
127  si_me_request.sin_port = htons(table_port_);
128  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
129  struct in_addr in_addr_s;
130  sts = inet_aton(table_address_.c_str(), &in_addr_s);
131  if (sts == 0)
132  {
133  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
134  }
135  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
136  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
137  {
138  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
139  exit(1);
140  }
141 
142  struct ip_mreq mreq;
143  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
144  if (sts == -1)
145  {
146  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
147  exit(1);
148  }
149  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
150  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
151  {
152  TLOG(TLVL_ERROR) << "Unable to join multicast group";
153  exit(1);
154  }
155  }
156  void startTableReceiverThread_()
157  {
158  if (routing_thread_.joinable()) routing_thread_.join();
159  TLOG(TLVL_INFO) << "Starting Routing Thread";
160  try {
161  routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this);
162  }
163  catch (const boost::exception& e)
164  {
165  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
166  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
167  exit(5);
168  }
169  }
170  void receiveTableUpdatesLoop_()
171  {
172  while (true)
173  {
174  if (should_stop_)
175  {
176  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
177  return;
178  }
179 
180  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
181  if (table_socket_ == -1)
182  {
183  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
184  setupTableListener_();
185  }
186  if (table_socket_ == -1)
187  {
188  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
189  return;
190  }
191 
192  struct pollfd fd;
193  fd.fd = table_socket_;
194  fd.events = POLLIN | POLLPRI;
195 
196  auto res = poll(&fd, 1, 1000);
197  if (res > 0)
198  {
199  auto first = artdaq::Fragment::InvalidSequenceID;
200  auto last = artdaq::Fragment::InvalidSequenceID;
201  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
203 
204  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
205  struct sockaddr_in from;
206  socklen_t len = sizeof(from);
207  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len);
208  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
209 
210  if (stss > static_cast<ssize_t>(sizeof(hdr)))
211  {
212  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
213  }
214  else
215  {
216  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
217  continue;
218  }
219 
220  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]);
221  if (hdr.header != ROUTING_MAGIC)
222  {
223  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
224  }
225  else
226  {
228  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
229  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
230  TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]);
231 
232  first = buffer[0].sequence_id;
233  last = buffer[buffer.size() - 1].sequence_id;
234 
235  if (first + hdr.nEntries - 1 != last)
236  {
237  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
238  continue;
239  }
240  auto thisSeqID = first;
241 
242  {
243  std::unique_lock<std::mutex> lck(routing_mutex_);
244  if (routing_table_.count(last) == 0)
245  {
246  for (auto entry : buffer)
247  {
248  if (thisSeqID != entry.sequence_id)
249  {
250  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
251  last = thisSeqID - 1;
252  break;
253  }
254  thisSeqID++;
255  if (routing_table_.count(entry.sequence_id))
256  {
257  if (routing_table_[entry.sequence_id] != entry.destination_rank)
258  {
259  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
260  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
261  << " I will use the original value!";
262  }
263  continue;
264  }
265  if (entry.sequence_id < routing_table_last_) continue;
266  routing_table_[entry.sequence_id] = entry.destination_rank;
267  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
268  << " -> Rank " << entry.destination_rank;
269  }
270  }
271 
272  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
273  if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
274 
275  auto counter = 0;
276  for (auto& entry : routing_table_)
277  {
278  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
279  counter++;
280  }
281  }
282 
283  if (last > routing_table_last_) routing_table_last_ = last;
284  }
285  }
286  }
287  }
288 
289  private:
290  bool use_routing_master_; ///< routing_table_config.use_routing_master (default false); when true the constructor starts the receiver thread
291  std::atomic<bool> should_stop_; ///< Set by the destructor; checked at the top of every receive-loop iteration
292  int table_port_; ///< routing_table_config.table_update_port (default 35556); UDP port the listener binds to
293  std::string table_address_; ///< routing_table_config.table_update_address (default "227.128.12.28"); bind address and multicast group
294  int table_socket_; ///< Listener socket descriptor; -1 until setupTableListener_ opens it
295  std::map<Fragment::sequence_id_t, int> routing_table_; ///< Sequence ID -> destination rank; accessed under routing_mutex_
296  Fragment::sequence_id_t routing_table_last_; ///< Highest "last" sequence ID taken from an accepted packet; older entries are skipped
297  mutable std::mutex routing_mutex_; ///< Protects routing_table_
298  boost::thread routing_thread_; ///< Thread running receiveTableUpdatesLoop_
299  hostMap_t host_map_; ///< Built by MakeHostMap in the constructor; returned by GetHostMap
300  };
301 }
302 
303 
/// True once main() has installed the signal handlers; guards against installing them twice.
static bool sighandler_init = false;
/// Set by signal_handler and polled by the main loop. It is atomic because it
/// is written from a signal handler while the main loop reads it; a plain bool
/// gives no guarantee that the loop ever observes the store.
/// NOTE(review): assumes std::atomic<bool> is lock-free on the target platform.
static std::atomic<bool> should_stop{ false };
306 static void signal_handler(int signum)
307 {
308  // Messagefacility may already be gone at this point, TRACE ONLY!
309  TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!";
310 
311  should_stop = true;
312 
313  sigset_t set;
314  pthread_sigmask(SIG_UNBLOCK, NULL, &set);
315  pthread_sigmask(SIG_UNBLOCK, &set, NULL);
316 
317 }
318 
319 int main(int argc, char* argv[])
320 {
321  artdaq::configureMessageFacility("RoutingReceiver", false, false);
322  static std::mutex sighandler_mutex;
323  std::unique_lock<std::mutex> lk(sighandler_mutex);
324 
325  if (!sighandler_init)//&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection
326  {
327  sighandler_init = true;
328  std::vector<int> signals = { SIGINT, SIGTERM, SIGUSR1, SIGUSR2 }; // SIGQUIT is used by art in normal operation
329  for (auto signal : signals)
330  {
331  struct sigaction old_action;
332  sigaction(signal, NULL, &old_action);
333 
334  //If the old handler wasn't SIG_IGN (it's a handler that just
335  // "ignore" the signal)
336  if (old_action.sa_handler != SIG_IGN)
337  {
338  struct sigaction action;
339  action.sa_handler = signal_handler;
340  sigemptyset(&action.sa_mask);
341  for (auto sigblk : signals)
342  {
343  sigaddset(&action.sa_mask, sigblk);
344  }
345  action.sa_flags = 0;
346 
347  //Replace the signal handler of SIGINT with the one described by new_action
348  sigaction(signal, &action, NULL);
349  }
350  }
351  }
352 
353  fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers");
354  fhicl::ParameterSet config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps);
355  fhicl::ParameterSet metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps);
356  fhicl::ParameterSet fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps);
357 
358  artdaq::RoutingReceiver rr(fr_ps);
359 
360  auto host_map = rr.GetHostMap();
361 
362  size_t collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000);
363  size_t max_graph_width = init_ps.get<size_t>("max_graph_width", 100);
364  bool print_verbose = init_ps.get<bool>("print_verbose_info", true);
365  bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true);
366 
367  auto blue = "\033[34m";
368  auto cyan = "\033[36m";
369  auto green = "\033[32m";
370  auto yellow = "\033[93m";
371  auto red = "\033[31m";
372 
373  metricMan->initialize(metric_ps, "RoutingReceiver");
374  metricMan->do_start();
375  if (print_verbose && verbose_clear_screen) std::cout << "\033[2J";
376 
377  std::map<int, int> receiver_table = std::map<int,int>();
378 
379  while (!should_stop)
380  {
381  auto start_time = std::chrono::steady_clock::now();
382 
383  auto this_table = rr.GetAndClearRoutingTable();
384 
385  if (this_table.size() > 0)
386  {
387  auto graph_width = this_table.size();
388  auto n = 1; // n becomes entries per graph character
389  auto graph_width_orig = graph_width;
390  while (graph_width > max_graph_width)
391  {
392  n++;
393  graph_width = graph_width_orig / n;
394  }
395 
396  for (auto& entry : this_table)
397  {
398  receiver_table[entry.second]++;
399  }
400 
401  auto average_entries_per_receiver = this_table.size() / receiver_table.size();
402  auto offset = 2 * n; // Offset is 2 characters, in entries
403 
404  auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n;
405  auto green_threshold = (average_entries_per_receiver - offset) / n;
406  auto yellow_threshold = (average_entries_per_receiver + offset) / n;
407  auto red_threshold = (2 * average_entries_per_receiver) / n;
408 
409  TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold;
410 
411  std::ostringstream report;
412  std::ostringstream verbose_report;
413 
414  if (print_verbose && verbose_clear_screen) std::cout << "\033[;H\033[J";
415 
416  report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, ";
417  for (auto& receiver : receiver_table)
418  {
419  auto percent = static_cast<int>(receiver.second * 100 / this_table.size());
420  report << receiver.first << ": " << receiver.second << " (" << percent << "%), ";
421  if (print_verbose)
422  {
423  verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t[";
424 
425  size_t graph_characters = receiver.second / n;
426 
427  for (size_t ii = 0; ii < graph_characters; ++ii)
428  {
429  if (ii < cyan_threshold)
430  {
431  verbose_report << blue;
432  }
433  else if (ii < green_threshold)
434  {
435  verbose_report << cyan;
436  }
437  else if (ii < yellow_threshold)
438  {
439  verbose_report << green;
440  }
441  else if (ii < red_threshold)
442  {
443  verbose_report << yellow;
444  }
445  else
446  {
447  verbose_report << red;
448  }
449  verbose_report << "|";
450  }
451  std::string spaces = std::string(graph_width - graph_characters, ' ');
452  verbose_report << "\033[0m" << spaces << "]" << std::endl;
453  }
454  receiver.second = 0;
455  }
456  TLOG(TLVL_INFO) << report.str();
457  std::cout << report.str() << std::endl;
458  if(print_verbose) std::cout << verbose_report.str() << std::endl;
459  }
460  std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms));
461  }
462 
463  metricMan->do_stop();
465 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
~RoutingReceiver()
RoutingReceiver Destructor.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:66
fhicl::Atom< bool > print_verbose_info
"print_verbose_info" (Default: true): Print verbose information about each receiver detected in routing tables
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
fhicl::Atom< size_t > collection_time_ms
"collection_time_ms": Time to collect routing table updates between printing summaries
RoutingReceiver(fhicl::ParameterSet const &pset)
RoutingReceiver Constructor.
fhicl::TableFragment< artdaq::artdaqapp::Config > artdaqAppConfig
Configuration for artdaq Application (BoardReader, etc)
Accepted configuration parameters for RoutingReceiver.
std::map< Fragment::sequence_id_t, int > GetRoutingTable()
Get the current routing table.
fhicl::WrappedTable< Config > Parameters
Used for ParameterSet validation (if desired)
The header of the Routing Table, containing the magic bytes and the number of entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Class which receives routing tables and prints updates.
hostMap_t GetHostMap()
Get the host map.
std::map< Fragment::sequence_id_t, int > GetAndClearRoutingTable()
Get the current routing table, additionally clearing all entries.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
fhicl::Atom< size_t > graph_width
"graph_width": Width of the summary graph