00001 #define TRACE_NAME "routing_master_t"
00002 #include "MPIProg.hh"
00003 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
00004 #include "artdaq/DAQdata/TCPConnect.hh"
00005 #include "artdaq-mpich-plugin/Utilities/quiet_mpi.hh"
00006 #include "cetlib/filepath_maker.h"
00007 #include "fhiclcpp/ParameterSet.h"
00008 #include "fhiclcpp/make_ParameterSet.h"
00009
00010 #include <boost/program_options.hpp>
00011 #include "artdaq/Application/RoutingMasterCore.hh"
00012 #include "artdaq/Application/RoutingMasterApp.hh"
00013 #include <netdb.h>
00014 namespace bpo = boost::program_options;
00015
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <map>
#include <string>
#include <vector>
00019
00020 extern "C"
00021 {
00022 #include <unistd.h>
00023 }
00024
00025 #include <iostream>
00026 #include <memory>
00027 #include <utility>
00028 #include <arpa/inet.h>
00029 #include <netinet/in.h>
00030 #include <sys/types.h>
00031 #include <sys/socket.h>
00032
00033 extern "C"
00034 {
00035 #include <sys/time.h>
00036 #include <sys/resource.h>
00037 }
00038
00042 class RoutingMasterTest : public MPIProg
00043 {
00044 public:
00059 RoutingMasterTest(int argc, char* argv[]);
00060
00064 void go();
00065
00069 void generate_tokens();
00070
00074 void routing_master();
00075
00079 void table_receiver();
00080
00087 fhicl::ParameterSet getPset(int argc, char* argv[]) const;
00088
00089 private:
00090 enum class TestRole_t : int
00091 {
00092 TOKEN_GEN = 0,
00093 ROUTING_MASTER = 1,
00094 TABLE_RECEIVER = 2
00095 };
00096
00097 void printHost(const std::string& functionName) const;
00098
00099 fhicl::ParameterSet const pset_;
00100 fhicl::ParameterSet const daq_pset_;
00101 TestRole_t role_;
00102
00103 std::string routing_master_address_;
00104 std::string multicast_address_;
00105 int token_port_;
00106 int table_port_;
00107 int ack_port_;
00108 std::vector<int> eb_ranks_;
00109 int token_count_;
00110 size_t token_interval_us_;
00111 };
00112
00113 RoutingMasterTest::RoutingMasterTest(int argc, char* argv[]) :
00114 MPIProg(argc, argv)
00115 , pset_(getPset(argc, argv))
00116 , daq_pset_(pset_.get<fhicl::ParameterSet>("daq"))
00117 , routing_master_address_(daq_pset_.get<std::string>("routing_master_hostname", "localhost"))
00118 , multicast_address_(daq_pset_.get<std::string>("table_update_address", "227.128.12.28"))
00119 , token_port_(daq_pset_.get<int>("routing_token_port", 35555))
00120 , table_port_(daq_pset_.get<int>("table_update_port", 35556))
00121 , ack_port_(daq_pset_.get<int>("table_acknowledge_port", 35557))
00122 , token_count_(pset_.get<int>("token_count", 1000))
00123 , token_interval_us_(pset_.get<size_t>("token_interval_us", 5000))
00124 {
00125 assert(!(my_rank < 0));
00126 switch (my_rank)
00127 {
00128 case 0:
00129 role_ = TestRole_t::TOKEN_GEN;
00130 break;
00131 case 1:
00132 role_ = TestRole_t::ROUTING_MASTER;
00133 break;
00134 default:
00135 role_ = TestRole_t::TABLE_RECEIVER;
00136 break;
00137 }
00138 auto policy_pset = daq_pset_.get<fhicl::ParameterSet>("policy");
00139 eb_ranks_ = policy_pset.get<std::vector<int>>("receiver_ranks");
00140 }
00141
00142 fhicl::ParameterSet RoutingMasterTest::getPset(int argc, char* argv[]) const
00143 {
00144 std::ostringstream descstr;
00145 descstr << "-- <-c <config-file>>";
00146 bpo::options_description desc(descstr.str());
00147 desc.add_options()
00148 ("config,c", bpo::value<std::string>(), "Configuration file.");
00149 bpo::variables_map vm;
00150 try
00151 {
00152 bpo::store(bpo::command_line_parser(argc, argv).
00153 options(desc).allow_unregistered().run(), vm);
00154 bpo::notify(vm);
00155 }
00156 catch (bpo::error const& e)
00157 {
00158 std::cerr << "Exception from command line processing in Config::getArtPset: " << e.what() << "\n";
00159 throw "cmdline parsing error.";
00160 }
00161 if (!vm.count("config"))
00162 {
00163 std::cerr << "Expected \"-- -c <config-file>\" fhicl file specification.\n";
00164 throw "cmdline parsing error.";
00165 }
00166 fhicl::ParameterSet pset;
00167 cet::filepath_lookup lookup_policy("FHICL_FILE_PATH");
00168 fhicl::make_ParameterSet(vm["config"].as<std::string>(), lookup_policy, pset);
00169
00170 return pset;
00171 }
00172
00173 void RoutingMasterTest::go()
00174 {
00175 TLOG(TLVL_INFO) << "Entering MPI_Barrier";
00176 MPI_Barrier(MPI_COMM_WORLD);
00177 TLOG(TLVL_INFO) << "Done with Barrier";
00178
00179
00180 switch (role_)
00181 {
00182 case TestRole_t::TABLE_RECEIVER:
00183 table_receiver();
00184 break;
00185 case TestRole_t::ROUTING_MASTER:
00186 routing_master();
00187 break;
00188 case TestRole_t::TOKEN_GEN:
00189 generate_tokens();
00190 break;
00191 default:
00192 throw "No such node type";
00193 }
00194 TLOG(TLVL_INFO) << "Rank " << my_rank << " complete." ;
00195 }
00196
00197 void RoutingMasterTest::generate_tokens()
00198 {
00199 TLOG(TLVL_INFO) << "generate_tokens(): Init" ;
00200 printHost("generate_tokens");
00201 sleep(1);
00202
00203 int token_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
00204 if (token_socket < 0)
00205 {
00206 TLOG(TLVL_ERROR) << "generate_tokens(): I failed to create the socket for sending Routing Tokens!" ;
00207 exit(1);
00208 }
00209 struct sockaddr_in token_addr;
00210 auto sts = ResolveHost(routing_master_address_.c_str(), token_port_, token_addr);
00211 if(sts == -1)
00212 {
00213 TLOG(TLVL_ERROR) << "generate_tokens(): Could not resolve host name" ;
00214 }
00215
00216 connect(token_socket, (struct sockaddr*)&token_addr, sizeof(token_addr));
00217
00218 int sent_tokens = 0;
00219 std::map<int, int> token_counter;
00220 for(auto rank : eb_ranks_)
00221 {
00222 token_counter[rank] = 0;
00223 }
00224 while (sent_tokens < token_count_) {
00225 int this_rank = eb_ranks_[seedAndRandom() % eb_ranks_.size()];
00226 token_counter[this_rank]++;
00227 artdaq::detail::RoutingToken token;
00228 token.header = TOKEN_MAGIC;
00229 token.rank = this_rank;
00230 token.new_slots_free = 1;
00231 token.run_number = 1;
00232
00233 TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << this_rank << " to " << routing_master_address_ ;
00234 send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
00235 usleep(token_interval_us_);
00236 }
00237 auto max_rank = 0;
00238 for(auto rank : token_counter)
00239 {
00240 if (rank.second > max_rank) max_rank = rank.second;
00241 }
00242 for(auto rank : token_counter)
00243 {
00244 artdaq::detail::RoutingToken token;
00245 token.header = TOKEN_MAGIC;
00246 token.rank = rank.first;
00247 token.new_slots_free = max_rank - rank.second;
00248 token.run_number = 1;
00249
00250 TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << rank.first << " to " << routing_master_address_ ;
00251 send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
00252 usleep(token_interval_us_);
00253
00254 }
00255
00256 TLOG(TLVL_INFO) << "generate_tokens(): Waiting at MPI_Barrier" ;
00257 MPI_Barrier(MPI_COMM_WORLD);
00258 TLOG(TLVL_INFO) << "generate_tokens(): Done with MPI_Barrier" ;
00259 }
00260
00261 void RoutingMasterTest::table_receiver()
00262 {
00263 TLOG(TLVL_INFO) << "table_receiver(): Init" ;
00264 printHost("table_receiver");
00265
00266
00267 auto table_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00268 if (table_socket < 0)
00269 {
00270 TLOG(TLVL_ERROR) << "table_receiver(): Error creating socket for receiving data requests!" ;
00271 exit(1);
00272 }
00273
00274 struct sockaddr_in si_me_request;
00275
00276 int yes = 1;
00277 if (setsockopt(table_socket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00278 {
00279 TLOG(TLVL_ERROR) << "table_receiver(): Unable to enable port reuse on request socket" ;
00280 exit(1);
00281 }
00282 memset(&si_me_request, 0, sizeof(si_me_request));
00283 si_me_request.sin_family = AF_INET;
00284 si_me_request.sin_port = htons(table_port_);
00285 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00286 if (bind(table_socket, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00287 {
00288 TLOG(TLVL_ERROR) << "table_receiver(): Cannot bind request socket to port " << table_port_ ;
00289 exit(1);
00290 }
00291
00292 struct ip_mreq mreq;
00293 long int sts = ResolveHost(multicast_address_.c_str(), mreq.imr_multiaddr);
00294 if(sts == -1)
00295 {
00296 TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve multicast hostname" ;
00297 exit(1);
00298 }
00299 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00300 if (setsockopt(table_socket, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00301 {
00302 TLOG(TLVL_ERROR) << "table_receiver(): Unable to join multicast group" ;
00303 exit(1);
00304 }
00305
00306 struct epoll_event ev;
00307 int table_epoll_fd = epoll_create1(0);
00308 ev.events = EPOLLIN | EPOLLPRI;
00309 ev.data.fd = table_socket;
00310 if (epoll_ctl(table_epoll_fd, EPOLL_CTL_ADD, table_socket, &ev) == -1)
00311 {
00312 TLOG(TLVL_ERROR) << "table_receiver(): Could not register listen socket to epoll fd" ;
00313 exit(3);
00314 }
00315
00316 auto ack_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00317 struct sockaddr_in ack_addr;
00318 sts = ResolveHost(routing_master_address_.c_str(), ack_port_, ack_addr);
00319 if(sts == -1)
00320 {
00321 TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve routing master hostname" ;
00322 exit(1);
00323 }
00324
00325 if (table_socket == -1 || table_epoll_fd == -1 || ack_socket == -1)
00326 {
00327 TLOG(TLVL_INFO) << "table_receiver(): One of the listen sockets was not opened successfully." ;
00328 exit(4);
00329 }
00330 artdaq::Fragment::sequence_id_t max_sequence_id = token_count_;
00331 artdaq::Fragment::sequence_id_t current_sequence_id = 0;
00332 std::map<artdaq::Fragment::sequence_id_t, int> routing_table;
00333 TLOG(TLVL_INFO) << "table_receiver(): Expecting " << max_sequence_id << " as the last Sequence ID in this run" ;
00334 while (current_sequence_id < max_sequence_id)
00335 {
00336 std::vector<epoll_event> table_events_(4);
00337 TLOG(TLVL_INFO) << "table_receiver(): Waiting for event on table socket" ;
00338 auto nfds = epoll_wait(table_epoll_fd, &table_events_[0], table_events_.size(), -1);
00339 if (nfds == -1) {
00340 perror("epoll_wait");
00341 exit(EXIT_FAILURE);
00342 }
00343
00344 TLOG(TLVL_INFO) << "table_receiver(): Received " << nfds << " table update(s)" ;
00345 for (auto n = 0; n < nfds; ++n) {
00346 auto first = artdaq::Fragment::InvalidSequenceID;
00347 auto last = artdaq::Fragment::InvalidSequenceID;
00348
00349 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
00350 artdaq::detail::RoutingPacketHeader hdr;
00351 auto stss = recv(table_events_[n].data.fd, &buf[0], MAX_ROUTING_TABLE_SIZE, 0);
00352
00353
00354 if (stss > static_cast<ssize_t>(sizeof(hdr)))
00355 {
00356 memcpy(&hdr, &buf[0], sizeof(hdr));
00357 }
00358 else
00359 {
00360 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
00361 continue;
00362 }
00363
00364 TLOG(TLVL_INFO) << "table_receiver(): Checking for valid header" ;
00365 if (hdr.header == ROUTING_MAGIC) {
00366 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00367 TLOG(TLVL_INFO) << "table_receiver(): Receiving data buffer" ;
00368 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00369
00370 first = buffer[0].sequence_id;
00371 last = buffer[buffer.size() - 1].sequence_id;
00372
00373 for (auto entry : buffer)
00374 {
00375 if (routing_table.count(entry.sequence_id))
00376 {
00377 assert(routing_table[entry.sequence_id] == entry.destination_rank);
00378 continue;
00379 }
00380 routing_table[entry.sequence_id] = entry.destination_rank;
00381 TLOG(TLVL_INFO) << "table_receiver(): table_receiver " << my_rank << ": received update: SeqID " << entry.sequence_id << " -> Rank " << entry.destination_rank ;
00382 }
00383
00384 artdaq::detail::RoutingAckPacket ack;
00385 ack.rank = my_rank;
00386 ack.first_sequence_id = first;
00387 ack.last_sequence_id = last;
00388
00389 TLOG(TLVL_INFO) << "table_receiver(): Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << routing_master_address_ << ", port " << ack_port_ ;
00390 sendto(ack_socket, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr, sizeof(ack_addr));
00391 current_sequence_id = last;
00392 }
00393 }
00394 }
00395
00396 TLOG(TLVL_INFO) << "table_receiver(): Waiting at MPI_Barrier" ;
00397 MPI_Barrier(MPI_COMM_WORLD);
00398 TLOG(TLVL_INFO) << "table_receiver(): Done with MPI_Barrier" ;
00399 }
00400
00401 void RoutingMasterTest::routing_master()
00402 {
00403 TLOG(TLVL_INFO) << "routing_master: Init" ;
00404 printHost("routing_master");
00405
00406 app_name = "RoutingMaster";
00407
00408 auto app = std::make_unique<artdaq::RoutingMasterApp>();
00409
00410 app->initialize(pset_, 0, 0);
00411 app->do_start(art::RunID(1), 0, 0);
00412 TLOG(TLVL_INFO) << "routing_master: Waiting at MPI_Barrier" ;
00413 MPI_Barrier(MPI_COMM_WORLD);
00414 TLOG(TLVL_INFO) << "routing_master: Done with MPI_Barrier, calling RoutingMasterCore::stop" ;
00415 app->do_stop(0, 0);
00416 TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::stop, calling shutdown" ;
00417 app->do_shutdown(0);
00418 TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::shutdown" ;
00419 }
00420
00421 void RoutingMasterTest::printHost(const std::string& functionName) const
00422 {
00423 char* doPrint = getenv("PRINT_HOST");
00424 if (doPrint == 0) { return; }
00425 const int ARRSIZE = 80;
00426 char hostname[ARRSIZE];
00427 std::string hostString;
00428 if (!gethostname(hostname, ARRSIZE))
00429 {
00430 hostString = hostname;
00431 }
00432 else
00433 {
00434 hostString = "unknown";
00435 }
00436 TLOG(TLVL_INFO) << "Running " << functionName
00437 << " on host " << hostString
00438 << " with rank " << my_rank << "."
00439 ;
00440 }
00441
00442 void printUsage()
00443 {
00444 int myid = 0;
00445 struct rusage usage;
00446 getrusage(RUSAGE_SELF, &usage);
00447 std::cout << myid << ":"
00448 << " user=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_utime)
00449 << " sys=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_stime)
00450 << std::endl;
00451 }
00452
00453 int main(int argc, char* argv[])
00454 {
00455 artdaq::configureMessageFacility("routing_master", true);
00456 int rc = 1;
00457 #if 0
00458 std::cerr << "PID: " << getpid() << std::endl;
00459 volatile bool attach = true;
00460 while (attach)
00461 {
00462 usleep(100000);
00463 }
00464 #endif
00465
00466 try
00467 {
00468 RoutingMasterTest p(argc, argv);
00469 std::cerr << "Started process " << my_rank << " of " << p.procs_ << ".\n";
00470 p.go();
00471 rc = 0;
00472 }
00473 catch (std::string& x)
00474 {
00475 std::cerr << "Exception (type string) caught in routing_master: "
00476 << x
00477 << '\n';
00478 return 1;
00479 }
00480 catch (char const* m)
00481 {
00482 std::cerr << "Exception (type char const*) caught in routing_master: ";
00483 if (m)
00484 {
00485 std::cerr << m;
00486 }
00487 else
00488 {
00489 std::cerr << "[the value was a null pointer, so no message is available]";
00490 }
00491 std::cerr << '\n';
00492 }
00493 return rc;
00494 }