artdaq_mpich_plugin  v1_00_08a
routing_master_t.cc
#define TRACE_NAME "routing_master_t"
#include "MPIProg.hh"
#include "artdaq/DAQdata/TCPConnect.hh"
#include "artdaq/DAQrate/detail/RoutingPacket.hh"
#include "cetlib/filepath_maker.h"
#include "fhiclcpp/ParameterSet.h"
#include "fhiclcpp/make_ParameterSet.h"

#include <netdb.h>
#include <boost/program_options.hpp>
#include "artdaq/Application/RoutingMasterApp.hh"
#include "artdaq/Application/RoutingMasterCore.hh"
namespace bpo = boost::program_options;

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <sstream>
#include <string>
#include <vector>

extern "C" {
#include <unistd.h>
}

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <iostream>
#include <memory>
#include <utility>

extern "C" {
#include <sys/resource.h>
#include <sys/time.h>
}
36 
40 class RoutingMasterTest : public MPIProg {
41  public:
56  RoutingMasterTest(int argc, char* argv[]);
57 
61  void go();
62 
66  void generate_tokens();
67 
72  void routing_master();
73 
77  void table_receiver();
78 
85  fhicl::ParameterSet getPset(int argc, char* argv[]) const;
86 
87  private:
88  enum class TestRole_t : int { TOKEN_GEN = 0, ROUTING_MASTER = 1, TABLE_RECEIVER = 2 };
89 
90  void printHost(const std::string& functionName) const;
91 
92  fhicl::ParameterSet const pset_;
93  fhicl::ParameterSet const daq_pset_;
94  TestRole_t role_;
95 
96  std::string routing_master_address_;
97  std::string multicast_address_;
98  int token_port_;
99  int table_port_;
100  int ack_port_;
101  std::vector<int> eb_ranks_;
102  int token_count_;
103  size_t token_interval_us_;
104  size_t run_number_;
105 };
106 
108  : MPIProg(argc, argv),
109  pset_(getPset(argc, argv)),
110  daq_pset_(pset_.get<fhicl::ParameterSet>("daq")),
111  routing_master_address_(daq_pset_.get<std::string>("routing_master_hostname", "localhost")),
112  multicast_address_(daq_pset_.get<std::string>("table_update_address", "227.128.12.28")),
113  token_port_(daq_pset_.get<int>("routing_token_port", 35555)),
114  table_port_(daq_pset_.get<int>("table_update_port", 35556)),
115  ack_port_(daq_pset_.get<int>("table_acknowledge_port", 35557)),
116  token_count_(pset_.get<int>("token_count", 1000)),
117  token_interval_us_(pset_.get<size_t>("token_interval_us", 5000)),
118  run_number_(pset_.get<size_t>("run_number")) {
119  assert(!(my_rank < 0));
120  switch (my_rank) {
121  case 0:
122  role_ = TestRole_t::TOKEN_GEN;
123  break;
124  case 1:
125  role_ = TestRole_t::ROUTING_MASTER;
126  break;
127  default:
128  role_ = TestRole_t::TABLE_RECEIVER;
129  break;
130  }
131  auto policy_pset = daq_pset_.get<fhicl::ParameterSet>("policy");
132  eb_ranks_ = policy_pset.get<std::vector<int>>("receiver_ranks");
133 }
134 
135 fhicl::ParameterSet RoutingMasterTest::getPset(int argc, char* argv[]) const {
136  std::ostringstream descstr;
137  descstr << "-- <-c <config-file>>";
138  bpo::options_description desc(descstr.str());
139  desc.add_options()("config,c", bpo::value<std::string>(), "Configuration file.");
140  bpo::variables_map vm;
141  try {
142  bpo::store(bpo::command_line_parser(argc, argv).options(desc).allow_unregistered().run(), vm);
143  bpo::notify(vm);
144  } catch (bpo::error const& e) {
145  std::cerr << "Exception from command line processing in Config::getArtPset: " << e.what() << "\n";
146  throw "cmdline parsing error.";
147  }
148  if (!vm.count("config")) {
149  std::cerr << "Expected \"-- -c <config-file>\" fhicl file specification.\n";
150  throw "cmdline parsing error.";
151  }
152  fhicl::ParameterSet pset;
153  cet::filepath_lookup lookup_policy("FHICL_FILE_PATH");
154  fhicl::make_ParameterSet(vm["config"].as<std::string>(), lookup_policy, pset);
155 
156  return pset;
157 }
158 
160  TLOG(TLVL_INFO) << "Entering MPI_Barrier";
161  MPI_Barrier(MPI_COMM_WORLD);
162  TLOG(TLVL_INFO) << "Done with Barrier";
163  // std::cout << "daq_pset_: " << daq_pset_.to_string() << std::endl << "conf_.makeParameterSet(): " <<
164  // conf_.makeParameterSet().to_string() << std::endl;
165 
166  switch (role_) {
167  case TestRole_t::TABLE_RECEIVER:
168  table_receiver();
169  break;
170  case TestRole_t::ROUTING_MASTER:
171  routing_master();
172  break;
173  case TestRole_t::TOKEN_GEN:
174  generate_tokens();
175  break;
176  default:
177  throw "No such node type";
178  }
179  TLOG(TLVL_INFO) << "Rank " << my_rank << " complete.";
180 }
181 
183  TLOG(TLVL_INFO) << "generate_tokens(): Init";
184  printHost("generate_tokens");
185  sleep(1);
186 
187  int token_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
188  if (token_socket < 0) {
189  TLOG(TLVL_ERROR) << "generate_tokens(): I failed to create the socket for sending Routing Tokens!";
190  exit(1);
191  }
192  struct sockaddr_in token_addr;
193  auto sts = ResolveHost(routing_master_address_.c_str(), token_port_, token_addr);
194  if (sts == -1) {
195  TLOG(TLVL_ERROR) << "generate_tokens(): Could not resolve host name";
196  }
197 
198  connect(token_socket, (struct sockaddr*)&token_addr, sizeof(token_addr));
199 
200  int sent_tokens = 0;
201  std::map<int, int> token_counter;
202  for (auto rank : eb_ranks_) {
203  token_counter[rank] = 0;
204  }
205  while (sent_tokens < token_count_) {
206  int this_rank = eb_ranks_[seedAndRandom() % eb_ranks_.size()];
207  token_counter[this_rank]++;
208  artdaq::detail::RoutingToken token;
209  token.header = TOKEN_MAGIC;
210  token.rank = this_rank;
211  token.new_slots_free = 1;
212  token.run_number = run_number_;
213 
214  TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << this_rank
215  << " to " << routing_master_address_;
216  send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
217  usleep(token_interval_us_);
218  }
219  auto max_rank = 0;
220  for (auto rank : token_counter) {
221  if (rank.second > max_rank) max_rank = rank.second;
222  }
223  for (auto rank : token_counter) {
224  artdaq::detail::RoutingToken token;
225  token.header = TOKEN_MAGIC;
226  token.rank = rank.first;
227  token.new_slots_free = max_rank - rank.second;
228  token.run_number = run_number_;
229 
230  TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << rank.first
231  << " to " << routing_master_address_;
232  send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
233  usleep(token_interval_us_);
234  }
235 
236  TLOG(TLVL_INFO) << "generate_tokens(): Waiting at MPI_Barrier";
237  MPI_Barrier(MPI_COMM_WORLD);
238  TLOG(TLVL_INFO) << "generate_tokens(): Done with MPI_Barrier";
239 }
240 
242  TLOG(TLVL_INFO) << "table_receiver(): Init";
243  printHost("table_receiver");
244 
245  auto table_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
246  if (table_socket < 0) {
247  TLOG(TLVL_ERROR) << "table_receiver(): Error creating socket for receiving data requests!";
248  exit(1);
249  }
250 
251  struct sockaddr_in si_me_request;
252 
253  int yes = 1;
254  if (setsockopt(table_socket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) {
255  TLOG(TLVL_ERROR) << "table_receiver(): Unable to enable port reuse on request socket";
256  exit(1);
257  }
258  memset(&si_me_request, 0, sizeof(si_me_request));
259  si_me_request.sin_family = AF_INET;
260  si_me_request.sin_port = htons(table_port_);
261  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
262  if (bind(table_socket, (struct sockaddr*)&si_me_request, sizeof(si_me_request)) == -1) {
263  TLOG(TLVL_ERROR) << "table_receiver(): Cannot bind request socket to port " << table_port_;
264  exit(1);
265  }
266 
267  struct ip_mreq mreq;
268  long int sts = ResolveHost(multicast_address_.c_str(), mreq.imr_multiaddr);
269  if (sts == -1) {
270  TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve multicast hostname";
271  exit(1);
272  }
273  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
274  if (setsockopt(table_socket, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) {
275  TLOG(TLVL_ERROR) << "table_receiver(): Unable to join multicast group";
276  exit(1);
277  }
278 
279  struct epoll_event ev;
280  int table_epoll_fd = epoll_create1(0);
281  ev.events = EPOLLIN | EPOLLPRI;
282  ev.data.fd = table_socket;
283  if (epoll_ctl(table_epoll_fd, EPOLL_CTL_ADD, table_socket, &ev) == -1) {
284  TLOG(TLVL_ERROR) << "table_receiver(): Could not register listen socket to epoll fd";
285  exit(3);
286  }
287 
288  auto ack_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
289  struct sockaddr_in ack_addr;
290  sts = ResolveHost(routing_master_address_.c_str(), ack_port_, ack_addr);
291  if (sts == -1) {
292  TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve routing master hostname";
293  exit(1);
294  }
295 
296  if (table_socket == -1 || table_epoll_fd == -1 || ack_socket == -1) {
297  TLOG(TLVL_INFO) << "table_receiver(): One of the listen sockets was not opened successfully.";
298  exit(4);
299  }
300  artdaq::Fragment::sequence_id_t max_sequence_id = token_count_;
301  artdaq::Fragment::sequence_id_t current_sequence_id = 0;
302  std::map<artdaq::Fragment::sequence_id_t, int> routing_table;
303  TLOG(TLVL_INFO) << "table_receiver(): Expecting " << max_sequence_id << " as the last Sequence ID in this run";
304  while (current_sequence_id < max_sequence_id) {
305  std::vector<epoll_event> table_events_(4);
306  TLOG(TLVL_INFO) << "table_receiver(): Waiting for event on table socket";
307  auto nfds = epoll_wait(table_epoll_fd, &table_events_[0], table_events_.size(), -1);
308  if (nfds == -1) {
309  perror("epoll_wait");
310  exit(EXIT_FAILURE);
311  }
312 
313  TLOG(TLVL_INFO) << "table_receiver(): Received " << nfds << " table update(s)";
314  for (auto n = 0; n < nfds; ++n) {
315  auto first = artdaq::Fragment::InvalidSequenceID;
316  auto last = artdaq::Fragment::InvalidSequenceID;
317 
318  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
319  artdaq::detail::RoutingPacketHeader hdr;
320  auto stss = recv(table_events_[n].data.fd, &buf[0], MAX_ROUTING_TABLE_SIZE, 0);
321 
322  if (stss > static_cast<ssize_t>(sizeof(hdr))) {
323  memcpy(&hdr, &buf[0], sizeof(hdr));
324  } else {
325  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
326  continue;
327  }
328 
329  TLOG(TLVL_INFO) << "table_receiver(): Checking for valid header";
330  if (hdr.header == ROUTING_MAGIC) {
331  artdaq::detail::RoutingPacket buffer(hdr.nEntries);
332  TLOG(TLVL_INFO) << "table_receiver(): Receiving data buffer";
333  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)],
334  sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
335 
336  first = buffer[0].sequence_id;
337  last = buffer[buffer.size() - 1].sequence_id;
338 
339  for (auto entry : buffer) {
340  if (routing_table.count(entry.sequence_id)) {
341  assert(routing_table[entry.sequence_id] == entry.destination_rank);
342  continue;
343  }
344  routing_table[entry.sequence_id] = entry.destination_rank;
345  TLOG(TLVL_INFO) << "table_receiver(): table_receiver " << my_rank << ": received update: SeqID "
346  << entry.sequence_id << " -> Rank " << entry.destination_rank;
347  }
348 
349  artdaq::detail::RoutingAckPacket ack;
350  ack.rank = my_rank;
351  ack.first_sequence_id = first;
352  ack.last_sequence_id = last;
353 
354  TLOG(TLVL_INFO) << "table_receiver(): Sending RoutingAckPacket with first= " << first << " and last= " << last
355  << " to " << routing_master_address_ << ", port " << ack_port_;
356  sendto(ack_socket, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr*)&ack_addr,
357  sizeof(ack_addr));
358  current_sequence_id = last;
359  }
360  }
361  }
362 
363  TLOG(TLVL_INFO) << "table_receiver(): Waiting at MPI_Barrier";
364  MPI_Barrier(MPI_COMM_WORLD);
365  TLOG(TLVL_INFO) << "table_receiver(): Done with MPI_Barrier";
366 }
367 
369  TLOG(TLVL_INFO) << "routing_master: Init";
370  printHost("routing_master");
371 
372  app_name = "RoutingMaster";
373 
374  auto app = std::make_unique<artdaq::RoutingMasterApp>();
375 
376  auto sts = app->initialize(pset_, 0, 0);
377  if (!sts) {
378  TLOG(TLVL_ERROR) << "routing_master: Failed to initalize!";
379  }
380  app->do_start(art::RunID(run_number_), 0, 0);
381  TLOG(TLVL_INFO) << "routing_master: Waiting at MPI_Barrier";
382  MPI_Barrier(MPI_COMM_WORLD);
383  TLOG(TLVL_INFO) << "routing_master: Done with MPI_Barrier, calling RoutingMasterCore::stop";
384  app->do_stop(0, 0);
385  TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::stop, calling shutdown";
386  app->do_shutdown(0);
387  TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::shutdown";
388 }
389 
390 void RoutingMasterTest::printHost(const std::string& functionName) const {
391  char* doPrint = getenv("PRINT_HOST");
392  if (doPrint == 0) {
393  return;
394  }
395  const int ARRSIZE = 80;
396  char hostname[ARRSIZE];
397  std::string hostString;
398  if (!gethostname(hostname, ARRSIZE)) {
399  hostString = hostname;
400  } else {
401  hostString = "unknown";
402  }
403  TLOG(TLVL_INFO) << "Running " << functionName << " on host " << hostString << " with rank " << my_rank << ".";
404 }
405 
406 void printUsage() {
407  int myid = 0;
408  struct rusage usage;
409  getrusage(RUSAGE_SELF, &usage);
410  std::cout << myid << ":"
411  << " user=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_utime)
412  << " sys=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_stime) << std::endl;
413 }
414 
415 int main(int argc, char* argv[]) {
416  artdaq::configureMessageFacility("routing_master", true);
417  int rc = 1;
418 
419 #if 0
420  std::cerr << "PID: " << getpid() << std::endl;
421  volatile bool attach = true;
422  while (attach)
423  {
424  usleep(100000);
425  }
426 #endif
427 
428  try {
429  RoutingMasterTest p(argc, argv);
430  std::cerr << "Started process " << my_rank << " of " << p.procs_ << ".\n";
431  p.go();
432  rc = 0;
433  } catch (std::string& x) {
434  std::cerr << "Exception (type string) caught in routing_master: " << x << '\n';
435  return 1;
436  } catch (char const* m) {
437  std::cerr << "Exception (type char const*) caught in routing_master: ";
438  if (m) {
439  std::cerr << m;
440  } else {
441  std::cerr << "[the value was a null pointer, so no message is available]";
442  }
443  std::cerr << '\n';
444  }
445  return rc;
446 }
The RoutingMasterTest class runs the routing_master test.
void routing_master()
Load a RoutingMasterCore instance, receive tokens from the token generators, and send table updates to the table receivers.
A wrapper for a MPI program. Similar to MPISentry.
Definition: MPIProg.hh:10
void go()
Start the test, using the role assigned.
RoutingMasterTest(int argc, char *argv[])
RoutingMasterTest Constructor.
void table_receiver()
Receive Routing Tables from the Routing Master and send acknowledgement packets back.
fhicl::ParameterSet getPset(int argc, char *argv[]) const
Parse the command line arguments and load a configuration FHiCL file.
void generate_tokens()
Generate tokens and send them to the Routing Master.