artdaq  v3_04_00
routingReceiver.cc
1 #define TRACE_NAME "RoutingReceiver"
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include <chrono>
5 #include "canvas/Utilities/Exception.h"
6 #include <arpa/inet.h>
7 #include <netinet/in.h>
8 #include <sys/types.h>
9 #include <poll.h>
10 #include <signal.h>
11 #include <thread>
12 #include <sys/socket.h>
14 #include "artdaq/Application/LoadParameterSet.hh"
15 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
16 #include "artdaq/TransferPlugins/detail/HostMap.hh"
17 #include "proto/artdaqapp.hh"
18 #include "fhiclcpp/types/Atom.h"
19 #include "fhiclcpp/types/OptionalTable.h"
20 #include "fhiclcpp/types/TableFragment.h"
21 
22 namespace artdaq
23 {
25  {
26  public:
27  struct Config
28  {
30  fhicl::Atom<size_t> collection_time_ms{ fhicl::Name{ "collection_time_ms" }, fhicl::Comment{ "Time to collect routing table updates between printing summaries" }, 1000 };
32  fhicl::Atom<bool> print_verbose_info{ fhicl::Name{ "print_verbose_info" }, fhicl::Comment{ "Print verbose information about each receiver detected in routing tables" }, true };
34  fhicl::Atom<size_t> graph_width{ fhicl::Name{ "graph_width" }, fhicl::Comment{ "Width of the summary graph" }, 40 };
35  fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig;
36  };
37  using Parameters = fhicl::WrappedTable<Config>;
38 
39  explicit RoutingReceiver(fhicl::ParameterSet const& pset)
40  : should_stop_(false)
41  , table_socket_(-1)
42  , routing_table_last_(0)
43  {
44  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
45 
46  // Validate parameters
47 
48  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
49  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
50  table_port_ = rmConfig.get<int>("table_update_port", 35556);
51  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
52 
53  host_map_ = MakeHostMap(pset);
54 
55  if (use_routing_master_) startTableReceiverThread_();
56  }
57 
59  {
60  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN";
61  should_stop_ = true;
62  if (routing_thread_.joinable()) routing_thread_.join();
63  TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END.";
64  }
65 
66  std::map<Fragment::sequence_id_t, int> GetRoutingTable()
67  {
68  std::unique_lock<std::mutex> lk(routing_mutex_);
69  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
70  return routing_table_copy;
71  }
72 
73  std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable()
74  {
75  std::unique_lock<std::mutex> lk(routing_mutex_);
76  std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
77  routing_table_.clear();
78  return routing_table_copy;
79  }
80 
81  hostMap_t GetHostMap() { return host_map_; }
82 
83  private:
84  void setupTableListener_()
85  {
86  int sts;
87  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
88  if (table_socket_ < 0)
89  {
90  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
91  exit(1);
92  }
93 
94  struct sockaddr_in si_me_request;
95 
96  int yes = 1;
97  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
98  {
99  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
100  exit(1);
101  }
102  memset(&si_me_request, 0, sizeof(si_me_request));
103  si_me_request.sin_family = AF_INET;
104  si_me_request.sin_port = htons(table_port_);
105  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
106  struct in_addr in_addr_s;
107  sts = inet_aton(table_address_.c_str(), &in_addr_s);
108  if (sts == 0)
109  {
110  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
111  }
112  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
113  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
114  {
115  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
116  exit(1);
117  }
118 
119  struct ip_mreq mreq;
120  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
121  if (sts == -1)
122  {
123  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
124  exit(1);
125  }
126  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
127  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
128  {
129  TLOG(TLVL_ERROR) << "Unable to join multicast group";
130  exit(1);
131  }
132  }
133  void startTableReceiverThread_()
134  {
135  if (routing_thread_.joinable()) routing_thread_.join();
136  TLOG(TLVL_INFO) << "Starting Routing Thread";
137  try {
138  routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this);
139  }
140  catch (const boost::exception& e)
141  {
142  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
143  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
144  exit(5);
145  }
146  }
147  void receiveTableUpdatesLoop_()
148  {
149  while (true)
150  {
151  if (should_stop_)
152  {
153  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
154  return;
155  }
156 
157  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
158  if (table_socket_ == -1)
159  {
160  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
161  setupTableListener_();
162  }
163  if (table_socket_ == -1)
164  {
165  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
166  return;
167  }
168 
169  struct pollfd fd;
170  fd.fd = table_socket_;
171  fd.events = POLLIN | POLLPRI;
172 
173  auto res = poll(&fd, 1, 1000);
174  if (res > 0)
175  {
176  auto first = artdaq::Fragment::InvalidSequenceID;
177  auto last = artdaq::Fragment::InvalidSequenceID;
178  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
180 
181  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
182  struct sockaddr_in from;
183  socklen_t len = sizeof(from);
184  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len);
185  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
186 
187  if (stss > static_cast<ssize_t>(sizeof(hdr)))
188  {
189  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
190  }
191  else
192  {
193  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
194  continue;
195  }
196 
197  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]);
198  if (hdr.header != ROUTING_MAGIC)
199  {
200  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
201  }
202  else
203  {
205  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
206  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
207  TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]);
208 
209  first = buffer[0].sequence_id;
210  last = buffer[buffer.size() - 1].sequence_id;
211 
212  if (first + hdr.nEntries - 1 != last)
213  {
214  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
215  continue;
216  }
217  auto thisSeqID = first;
218 
219  {
220  std::unique_lock<std::mutex> lck(routing_mutex_);
221  if (routing_table_.count(last) == 0)
222  {
223  for (auto entry : buffer)
224  {
225  if (thisSeqID != entry.sequence_id)
226  {
227  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
228  last = thisSeqID - 1;
229  break;
230  }
231  thisSeqID++;
232  if (routing_table_.count(entry.sequence_id))
233  {
234  if (routing_table_[entry.sequence_id] != entry.destination_rank)
235  {
236  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
237  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
238  << " I will use the original value!";
239  }
240  continue;
241  }
242  if (entry.sequence_id < routing_table_last_) continue;
243  routing_table_[entry.sequence_id] = entry.destination_rank;
244  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
245  << " -> Rank " << entry.destination_rank;
246  }
247  }
248 
249  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
250  if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
251 
252  auto counter = 0;
253  for (auto& entry : routing_table_)
254  {
255  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
256  counter++;
257  }
258  }
259 
260  if (last > routing_table_last_) routing_table_last_ = last;
261  }
262  }
263  }
264  }
265 
266  private:
267  bool use_routing_master_;
268  std::atomic<bool> should_stop_;
269  int table_port_;
270  std::string table_address_;
271  int table_socket_;
272  std::map<Fragment::sequence_id_t, int> routing_table_;
273  Fragment::sequence_id_t routing_table_last_;
274  mutable std::mutex routing_mutex_;
275  boost::thread routing_thread_;
276  hostMap_t host_map_;
277  };
278 }
279 
280 
281 static bool sighandler_init = false;
282 static bool should_stop = false;
283 static void signal_handler(int signum)
284 {
285  // Messagefacility may already be gone at this point, TRACE ONLY!
286  TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!";
287 
288  should_stop = true;
289 
290  sigset_t set;
291  pthread_sigmask(SIG_UNBLOCK, NULL, &set);
292  pthread_sigmask(SIG_UNBLOCK, &set, NULL);
293 
294 }
295 
296 int main(int argc, char* argv[])
297 {
298  artdaq::configureMessageFacility("RoutingReceiver", false, false);
299  static std::mutex sighandler_mutex;
300  std::unique_lock<std::mutex> lk(sighandler_mutex);
301 
302  if (!sighandler_init)//&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection
303  {
304  sighandler_init = true;
305  std::vector<int> signals = { SIGINT, SIGTERM, SIGUSR1, SIGUSR2 }; // SIGQUIT is used by art in normal operation
306  for (auto signal : signals)
307  {
308  struct sigaction old_action;
309  sigaction(signal, NULL, &old_action);
310 
311  //If the old handler wasn't SIG_IGN (it's a handler that just
312  // "ignore" the signal)
313  if (old_action.sa_handler != SIG_IGN)
314  {
315  struct sigaction action;
316  action.sa_handler = signal_handler;
317  sigemptyset(&action.sa_mask);
318  for (auto sigblk : signals)
319  {
320  sigaddset(&action.sa_mask, sigblk);
321  }
322  action.sa_flags = 0;
323 
324  //Replace the signal handler of SIGINT with the one described by new_action
325  sigaction(signal, &action, NULL);
326  }
327  }
328  }
329 
330  fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers");
331  fhicl::ParameterSet config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps);
332  fhicl::ParameterSet metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps);
333  fhicl::ParameterSet fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps);
334 
335  artdaq::RoutingReceiver rr(fr_ps);
336 
337  auto host_map = rr.GetHostMap();
338 
339  size_t collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000);
340  size_t max_graph_width = init_ps.get<size_t>("max_graph_width", 100);
341  bool print_verbose = init_ps.get<bool>("print_verbose_info", true);
342  bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true);
343 
344  auto blue = "\033[34m";
345  auto cyan = "\033[36m";
346  auto green = "\033[32m";
347  auto yellow = "\033[93m";
348  auto red = "\033[31m";
349 
350  metricMan->initialize(metric_ps, "RoutingReceiver");
351  metricMan->do_start();
352  if (print_verbose && verbose_clear_screen) std::cout << "\033[2J";
353 
354  std::map<int, int> receiver_table = std::map<int,int>();
355 
356  while (!should_stop)
357  {
358  auto start_time = std::chrono::steady_clock::now();
359 
360  auto this_table = rr.GetAndClearRoutingTable();
361 
362  if (this_table.size() > 0)
363  {
364  auto graph_width = this_table.size();
365  auto n = 1; // n becomes entries per graph character
366  auto graph_width_orig = graph_width;
367  while (graph_width > max_graph_width)
368  {
369  n++;
370  graph_width = graph_width_orig / n;
371  }
372 
373  for (auto& entry : this_table)
374  {
375  receiver_table[entry.second]++;
376  }
377 
378  auto average_entries_per_receiver = this_table.size() / receiver_table.size();
379  auto offset = 2 * n; // Offset is 2 characters, in entries
380 
381  auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n;
382  auto green_threshold = (average_entries_per_receiver - offset) / n;
383  auto yellow_threshold = (average_entries_per_receiver + offset) / n;
384  auto red_threshold = (2 * average_entries_per_receiver) / n;
385 
386  TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold;
387 
388  std::ostringstream report;
389  std::ostringstream verbose_report;
390 
391  if (print_verbose && verbose_clear_screen) std::cout << "\033[;H\033[J";
392 
393  report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, ";
394  for (auto& receiver : receiver_table)
395  {
396  auto percent = static_cast<int>(receiver.second * 100 / this_table.size());
397  report << receiver.first << ": " << receiver.second << " (" << percent << "%), ";
398  if (print_verbose)
399  {
400  verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t[";
401 
402  size_t graph_characters = receiver.second / n;
403 
404  for (size_t ii = 0; ii < graph_characters; ++ii)
405  {
406  if (ii < cyan_threshold)
407  {
408  verbose_report << blue;
409  }
410  else if (ii < green_threshold)
411  {
412  verbose_report << cyan;
413  }
414  else if (ii < yellow_threshold)
415  {
416  verbose_report << green;
417  }
418  else if (ii < red_threshold)
419  {
420  verbose_report << yellow;
421  }
422  else
423  {
424  verbose_report << red;
425  }
426  verbose_report << "|";
427  }
428  std::string spaces = std::string(graph_width - graph_characters, ' ');
429  verbose_report << "\033[0m" << spaces << "]" << std::endl;
430  }
431  receiver.second = 0;
432  }
433  TLOG(TLVL_INFO) << report.str();
434  std::cout << report.str() << std::endl;
435  if(print_verbose) std::cout << verbose_report.str() << std::endl;
436  }
437  std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms));
438  }
439 
440  metricMan->do_stop();
441  artdaq::Globals::CleanUpGlobals();
442 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:66
fhicl::Atom< bool > print_verbose_info
"print_verbose_info" (Default: true): Print verbose information about each receiver detected in routing tables
fhicl::Atom< size_t > collection_time_ms
"collection_time_ms": Time to collect routing table updates between printing summaries ...
fhicl::TableFragment< artdaq::artdaqapp::Config > artdaqAppConfig
Configuration for artdaq Application (BoardReader, etc)
The header of the Routing Table, containing the magic bytes and the number of entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
fhicl::Atom< size_t > graph_width
"graph_width": Width of the summary graph