artdaq_mpich_plugin  v1_00_13
routing_master_t.cc
1 #define TRACE_NAME "routing_master_t"
2 #include "MPIProg.hh"
4 #include "artdaq/DAQdata/TCPConnect.hh"
5 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
6 #include "cetlib/filepath_maker.h"
7 #include "fhiclcpp/ParameterSet.h"
8 #include "fhiclcpp/make_ParameterSet.h"
9 
10 #include <netdb.h>
11 #include <boost/program_options.hpp>
12 #include "artdaq/Application/RoutingMasterApp.hh"
13 #include "artdaq/Application/RoutingMasterCore.hh"
14 namespace bpo = boost::program_options;
15 
16 #include <algorithm>
17 #include <cmath>
18 #include <cstdio>
19 
20 extern "C" {
21 #include <unistd.h>
22 }
23 
24 #include <arpa/inet.h>
25 #include <netinet/in.h>
26 #include <sys/socket.h>
27 #include <sys/types.h>
28 #include <iostream>
29 #include <memory>
30 #include <utility>
31 
32 extern "C" {
33 #include <sys/resource.h>
34 #include <sys/time.h>
35 }
36 
40 class RoutingMasterTest : public MPIProg
41 {
42 public:
57  RoutingMasterTest(int argc, char* argv[]);
58 
62  void go();
63 
67  void generate_tokens();
68 
73  void routing_master();
74 
78  void table_receiver();
79 
86  fhicl::ParameterSet getPset(int argc, char* argv[]) const;
87 
88 private:
89  enum class TestRole_t : int
90  {
91  TOKEN_GEN = 0,
92  ROUTING_MASTER = 1,
93  TABLE_RECEIVER = 2
94  };
95 
96  void printHost(const std::string& functionName) const;
97 
98  fhicl::ParameterSet const pset_;
99  fhicl::ParameterSet const daq_pset_;
100  TestRole_t role_;
101 
102  std::string routing_master_address_;
103  std::string multicast_address_;
104  int token_port_;
105  int table_port_;
106  int ack_port_;
107  std::vector<int> eb_ranks_;
108  int token_count_;
109  size_t token_interval_us_;
110  size_t run_number_;
111 };
112 
114  : MPIProg(argc, argv), pset_(getPset(argc, argv)), daq_pset_(pset_.get<fhicl::ParameterSet>("daq")), routing_master_address_(daq_pset_.get<std::string>("routing_master_hostname", "localhost")), multicast_address_(daq_pset_.get<std::string>("table_update_address", "227.128.12.28")), token_port_(daq_pset_.get<int>("routing_token_port", 35555)), table_port_(daq_pset_.get<int>("table_update_port", 35556)), ack_port_(daq_pset_.get<int>("table_acknowledge_port", 35557)), token_count_(pset_.get<int>("token_count", 1000)), token_interval_us_(pset_.get<size_t>("token_interval_us", 5000)), run_number_(pset_.get<size_t>("run_number"))
115 {
116  assert(!(my_rank < 0));
117  switch (my_rank)
118  {
119  case 0:
120  role_ = TestRole_t::TOKEN_GEN;
121  break;
122  case 1:
123  role_ = TestRole_t::ROUTING_MASTER;
124  break;
125  default:
126  role_ = TestRole_t::TABLE_RECEIVER;
127  break;
128  }
129  auto policy_pset = daq_pset_.get<fhicl::ParameterSet>("policy");
130  eb_ranks_ = policy_pset.get<std::vector<int>>("receiver_ranks");
131 }
132 
133 fhicl::ParameterSet RoutingMasterTest::getPset(int argc, char* argv[]) const
134 {
135  std::ostringstream descstr;
136  descstr << "-- <-c <config-file>>";
137  bpo::options_description desc(descstr.str());
138  desc.add_options()("config,c", bpo::value<std::string>(), "Configuration file.");
139  bpo::variables_map vm;
140  try
141  {
142  bpo::store(bpo::command_line_parser(argc, argv).options(desc).allow_unregistered().run(), vm);
143  bpo::notify(vm);
144  }
145  catch (bpo::error const& e)
146  {
147  std::cerr << "Exception from command line processing in Config::getArtPset: " << e.what() << "\n";
148  throw "cmdline parsing error.";
149  }
150  if (!vm.count("config"))
151  {
152  std::cerr << "Expected \"-- -c <config-file>\" fhicl file specification.\n";
153  throw "cmdline parsing error.";
154  }
155  fhicl::ParameterSet pset;
156  cet::filepath_lookup lookup_policy("FHICL_FILE_PATH");
157  fhicl::make_ParameterSet(vm["config"].as<std::string>(), lookup_policy, pset);
158 
159  return pset;
160 }
161 
163 {
164  TLOG(TLVL_INFO) << "Entering MPI_Barrier";
165  MPI_Barrier(MPI_COMM_WORLD);
166  TLOG(TLVL_INFO) << "Done with Barrier";
167  // std::cout << "daq_pset_: " << daq_pset_.to_string() << std::endl << "conf_.makeParameterSet(): " <<
168  // conf_.makeParameterSet().to_string() << std::endl;
169 
170  switch (role_)
171  {
172  case TestRole_t::TABLE_RECEIVER:
173  table_receiver();
174  break;
175  case TestRole_t::ROUTING_MASTER:
176  routing_master();
177  break;
178  case TestRole_t::TOKEN_GEN:
179  generate_tokens();
180  break;
181  default:
182  throw "No such node type";
183  }
184  TLOG(TLVL_INFO) << "Rank " << my_rank << " complete.";
185 }
186 
188 {
189  TLOG(TLVL_INFO) << "generate_tokens(): Init";
190  printHost("generate_tokens");
191  sleep(1);
192 
193  int token_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
194  if (token_socket < 0)
195  {
196  TLOG(TLVL_ERROR) << "generate_tokens(): I failed to create the socket for sending Routing Tokens!";
197  exit(1);
198  }
199  struct sockaddr_in token_addr;
200  auto sts = ResolveHost(routing_master_address_.c_str(), token_port_, token_addr);
201  if (sts == -1)
202  {
203  TLOG(TLVL_ERROR) << "generate_tokens(): Could not resolve host name";
204  }
205 
206  connect(token_socket, (struct sockaddr*)&token_addr, sizeof(token_addr));
207 
208  int sent_tokens = 0;
209  std::map<int, int> token_counter;
210  for (auto rank : eb_ranks_)
211  {
212  token_counter[rank] = 0;
213  }
214  while (sent_tokens < token_count_)
215  {
216  int this_rank = eb_ranks_[seedAndRandom() % eb_ranks_.size()];
217  token_counter[this_rank]++;
218  artdaq::detail::RoutingToken token;
219  token.header = TOKEN_MAGIC;
220  token.rank = this_rank;
221  token.new_slots_free = 1;
222  token.run_number = run_number_;
223 
224  TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << this_rank
225  << " to " << routing_master_address_;
226  send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
227  usleep(token_interval_us_);
228  }
229  auto max_rank = 0;
230  for (auto rank : token_counter)
231  {
232  if (rank.second > max_rank) max_rank = rank.second;
233  }
234  for (auto rank : token_counter)
235  {
236  artdaq::detail::RoutingToken token;
237  token.header = TOKEN_MAGIC;
238  token.rank = rank.first;
239  token.new_slots_free = max_rank - rank.second;
240  token.run_number = run_number_;
241 
242  TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << rank.first
243  << " to " << routing_master_address_;
244  send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
245  usleep(token_interval_us_);
246  }
247 
248  TLOG(TLVL_INFO) << "generate_tokens(): Waiting at MPI_Barrier";
249  MPI_Barrier(MPI_COMM_WORLD);
250  TLOG(TLVL_INFO) << "generate_tokens(): Done with MPI_Barrier";
251 }
252 
254 {
255  TLOG(TLVL_INFO) << "table_receiver(): Init";
256  printHost("table_receiver");
257 
258  auto table_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
259  if (table_socket < 0)
260  {
261  TLOG(TLVL_ERROR) << "table_receiver(): Error creating socket for receiving data requests!";
262  exit(1);
263  }
264 
265  struct sockaddr_in si_me_request;
266 
267  int yes = 1;
268  if (setsockopt(table_socket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
269  {
270  TLOG(TLVL_ERROR) << "table_receiver(): Unable to enable port reuse on request socket";
271  exit(1);
272  }
273  memset(&si_me_request, 0, sizeof(si_me_request));
274  si_me_request.sin_family = AF_INET;
275  si_me_request.sin_port = htons(table_port_);
276  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
277  if (bind(table_socket, (struct sockaddr*)&si_me_request, sizeof(si_me_request)) == -1)
278  {
279  TLOG(TLVL_ERROR) << "table_receiver(): Cannot bind request socket to port " << table_port_;
280  exit(1);
281  }
282 
283  struct ip_mreq mreq;
284  long int sts = ResolveHost(multicast_address_.c_str(), mreq.imr_multiaddr);
285  if (sts == -1)
286  {
287  TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve multicast hostname";
288  exit(1);
289  }
290  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
291  if (setsockopt(table_socket, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
292  {
293  TLOG(TLVL_ERROR) << "table_receiver(): Unable to join multicast group";
294  exit(1);
295  }
296 
297  struct epoll_event ev;
298  int table_epoll_fd = epoll_create1(0);
299  ev.events = EPOLLIN | EPOLLPRI;
300  ev.data.fd = table_socket;
301  if (epoll_ctl(table_epoll_fd, EPOLL_CTL_ADD, table_socket, &ev) == -1)
302  {
303  TLOG(TLVL_ERROR) << "table_receiver(): Could not register listen socket to epoll fd";
304  exit(3);
305  }
306 
307  auto ack_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
308  struct sockaddr_in ack_addr;
309  sts = ResolveHost(routing_master_address_.c_str(), ack_port_, ack_addr);
310  if (sts == -1)
311  {
312  TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve routing master hostname";
313  exit(1);
314  }
315 
316  if (table_socket == -1 || table_epoll_fd == -1 || ack_socket == -1)
317  {
318  TLOG(TLVL_INFO) << "table_receiver(): One of the listen sockets was not opened successfully.";
319  exit(4);
320  }
321  artdaq::Fragment::sequence_id_t max_sequence_id = token_count_;
322  artdaq::Fragment::sequence_id_t current_sequence_id = 0;
323  std::map<artdaq::Fragment::sequence_id_t, int> routing_table;
324  TLOG(TLVL_INFO) << "table_receiver(): Expecting " << max_sequence_id << " as the last Sequence ID in this run";
325  while (current_sequence_id < max_sequence_id)
326  {
327  std::vector<epoll_event> table_events_(4);
328  TLOG(TLVL_INFO) << "table_receiver(): Waiting for event on table socket";
329  auto nfds = epoll_wait(table_epoll_fd, &table_events_[0], table_events_.size(), -1);
330  if (nfds == -1)
331  {
332  perror("epoll_wait");
333  exit(EXIT_FAILURE);
334  }
335 
336  TLOG(TLVL_INFO) << "table_receiver(): Received " << nfds << " table update(s)";
337  for (auto n = 0; n < nfds; ++n)
338  {
339  auto first = artdaq::Fragment::InvalidSequenceID;
340  auto last = artdaq::Fragment::InvalidSequenceID;
341 
342  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
343  artdaq::detail::RoutingPacketHeader hdr;
344  auto stss = recv(table_events_[n].data.fd, &buf[0], MAX_ROUTING_TABLE_SIZE, 0);
345 
346  if (stss > static_cast<ssize_t>(sizeof(hdr)))
347  {
348  memcpy(&hdr, &buf[0], sizeof(hdr));
349  }
350  else
351  {
352  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
353  continue;
354  }
355 
356  TLOG(TLVL_INFO) << "table_receiver(): Checking for valid header";
357  if (hdr.header == ROUTING_MAGIC)
358  {
359  artdaq::detail::RoutingPacket buffer(hdr.nEntries);
360  TLOG(TLVL_INFO) << "table_receiver(): Receiving data buffer";
361  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)],
362  sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
363 
364  first = buffer[0].sequence_id;
365  last = buffer[buffer.size() - 1].sequence_id;
366 
367  for (auto entry : buffer)
368  {
369  if (routing_table.count(entry.sequence_id))
370  {
371  assert(routing_table[entry.sequence_id] == entry.destination_rank);
372  continue;
373  }
374  routing_table[entry.sequence_id] = entry.destination_rank;
375  TLOG(TLVL_INFO) << "table_receiver(): table_receiver " << my_rank << ": received update: SeqID "
376  << entry.sequence_id << " -> Rank " << entry.destination_rank;
377  }
378 
379  artdaq::detail::RoutingAckPacket ack;
380  ack.rank = my_rank;
381  ack.first_sequence_id = first;
382  ack.last_sequence_id = last;
383 
384  TLOG(TLVL_INFO) << "table_receiver(): Sending RoutingAckPacket with first= " << first << " and last= " << last
385  << " to " << routing_master_address_ << ", port " << ack_port_;
386  sendto(ack_socket, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr*)&ack_addr,
387  sizeof(ack_addr));
388  current_sequence_id = last;
389  }
390  }
391  }
392 
393  TLOG(TLVL_INFO) << "table_receiver(): Waiting at MPI_Barrier";
394  MPI_Barrier(MPI_COMM_WORLD);
395  TLOG(TLVL_INFO) << "table_receiver(): Done with MPI_Barrier";
396 }
397 
399 {
400  TLOG(TLVL_INFO) << "routing_master: Init";
401  printHost("routing_master");
402 
403  app_name = "RoutingMaster";
404 
405  auto app = std::make_unique<artdaq::RoutingMasterApp>();
406 
407  auto sts = app->initialize(pset_, 0, 0);
408  if (!sts)
409  {
410  TLOG(TLVL_ERROR) << "routing_master: Failed to initalize!";
411  }
412  app->do_start(art::RunID(run_number_), 0, 0);
413  TLOG(TLVL_INFO) << "routing_master: Waiting at MPI_Barrier";
414  MPI_Barrier(MPI_COMM_WORLD);
415  TLOG(TLVL_INFO) << "routing_master: Done with MPI_Barrier, calling RoutingMasterCore::stop";
416  app->do_stop(0, 0);
417  TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::stop, calling shutdown";
418  app->do_shutdown(0);
419  TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::shutdown";
420 }
421 
422 void RoutingMasterTest::printHost(const std::string& functionName) const
423 {
424  char* doPrint = getenv("PRINT_HOST");
425  if (doPrint == 0)
426  {
427  return;
428  }
429  const int ARRSIZE = 80;
430  char hostname[ARRSIZE];
431  std::string hostString;
432  if (!gethostname(hostname, ARRSIZE))
433  {
434  hostString = hostname;
435  }
436  else
437  {
438  hostString = "unknown";
439  }
440  TLOG(TLVL_INFO) << "Running " << functionName << " on host " << hostString << " with rank " << my_rank << ".";
441 }
442 
443 void printUsage()
444 {
445  int myid = 0;
446  struct rusage usage;
447  getrusage(RUSAGE_SELF, &usage);
448  std::cout << myid << ":"
449  << " user=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_utime)
450  << " sys=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_stime) << std::endl;
451 }
452 
453 int main(int argc, char* argv[])
454 {
455  artdaq::configureMessageFacility("routing_master", true);
456  int rc = 1;
457 
458 #if 0
459  std::cerr << "PID: " << getpid() << std::endl;
460  volatile bool attach = true;
461  while (attach)
462  {
463  usleep(100000);
464  }
465 #endif
466 
467  try
468  {
469  RoutingMasterTest p(argc, argv);
470  std::cerr << "Started process " << my_rank << " of " << p.procs_ << ".\n";
471  p.go();
472  rc = 0;
473  }
474  catch (std::string& x)
475  {
476  std::cerr << "Exception (type string) caught in routing_master: " << x << '\n';
477  return 1;
478  }
479  catch (char const* m)
480  {
481  std::cerr << "Exception (type char const*) caught in routing_master: ";
482  if (m)
483  {
484  std::cerr << m;
485  }
486  else
487  {
488  std::cerr << "[the value was a null pointer, so no message is available]";
489  }
490  std::cerr << '\n';
491  }
492  return rc;
493 }
The RoutingMasterTest class runs the routing_master test.
void routing_master()
Load a RoutingMasterCore instance, receive tokens from the token generators, and send table updates to the table receivers.
A wrapper for a MPI program. Similar to MPISentry.
Definition: MPIProg.hh:10
void go()
Start the test, using the role assigned.
RoutingMasterTest(int argc, char *argv[])
RoutingMasterTest Constructor.
void table_receiver()
Receive Routing Tables from the Routing Master and send acknowledgement packets back.
fhicl::ParameterSet getPset(int argc, char *argv[]) const
Parse the command line arguments and load a configuration FHiCL file.
void generate_tokens()
Generate tokens and send them to the Routing Master.