00001 #include "MPIProg.hh"
00002 #include "artdaq/Application/Routing/RoutingPacket.hh"
00003 #include "artdaq/DAQdata/TCPConnect.hh"
00004 #include "artdaq/DAQrate/quiet_mpi.hh"
00005 #include "cetlib/filepath_maker.h"
00006 #include "fhiclcpp/ParameterSet.h"
00007 #include "fhiclcpp/make_ParameterSet.h"
00008
00009 #include <boost/program_options.hpp>
00010 #include <boost/filesystem.hpp>
00011 #include "artdaq/Application/RoutingMasterCore.hh"
00012 #include "artdaq/Application/RoutingMasterApp.hh"
00013 #include <netdb.h>
00014 namespace bpo = boost::program_options;
00015
00016 #include <algorithm>
00017 #include <cmath>
00018 #include <cstdio>
00019
00020 extern "C"
00021 {
00022 #include <unistd.h>
00023 }
00024
00025 #include <iostream>
00026 #include <memory>
00027 #include <utility>
00028 #include <arpa/inet.h>
00029 #include <netinet/in.h>
00030 #include <sys/types.h>
00031 #include <sys/socket.h>
00032
00033 extern "C"
00034 {
00035 #include <sys/time.h>
00036 #include <sys/resource.h>
00037 }
00038
00042 class LockFile
00043 {
00044 public:
00049 explicit LockFile(std::string path) : fileName_(path)
00050 {
00051 std::ofstream fstream(fileName_);
00052 fstream << "Locked" << std::endl;
00053 }
00054
00058 ~LockFile()
00059 {
00060 if(IsLocked(fileName_)) remove(fileName_.c_str());
00061 }
00067 static bool IsLocked(std::string path)
00068 {
00069 return boost::filesystem::exists(path);
00070 }
00071
00072 private:
00073 std::string fileName_;
00074 };
00075
00079 class RoutingMasterTest : public MPIProg
00080 {
00081 public:
00096 RoutingMasterTest(int argc, char* argv[]);
00097
00101 void go();
00102
00106 void generate_tokens();
00107
00111 void routing_master();
00112
00116 void table_receiver();
00117
00124 fhicl::ParameterSet getPset(int argc, char* argv[]) const;
00125
00126 private:
00127 enum class TestRole_t : int
00128 {
00129 TOKEN_GEN = 0,
00130 ROUTING_MASTER = 1,
00131 TABLE_RECEIVER = 2
00132 };
00133
00134 void printHost(const std::string& functionName) const;
00135
00136 fhicl::ParameterSet const pset_;
00137 fhicl::ParameterSet const daq_pset_;
00138 MPI_Comm local_group_comm_;
00139 TestRole_t role_;
00140
00141 std::string routing_master_address_;
00142 std::string multicast_address_;
00143 int token_port_;
00144 int table_port_;
00145 int ack_port_;
00146 std::vector<int> eb_ranks_;
00147 int token_count_;
00148 size_t token_interval_us_;
00149 };
00150
00151 RoutingMasterTest::RoutingMasterTest(int argc, char* argv[]) :
00152 MPIProg(argc, argv)
00153 , pset_(getPset(argc, argv))
00154 , daq_pset_(pset_.get<fhicl::ParameterSet>("daq"))
00155 , local_group_comm_()
00156 , routing_master_address_(daq_pset_.get<std::string>("routing_master_hostname", "localhost"))
00157 , multicast_address_(daq_pset_.get<std::string>("table_update_address", "227.128.12.28"))
00158 , token_port_(daq_pset_.get<int>("routing_token_port", 35555))
00159 , table_port_(daq_pset_.get<int>("table_update_port", 35556))
00160 , ack_port_(daq_pset_.get<int>("table_acknowledge_port", 35557))
00161 , token_count_(pset_.get<int>("token_count", 1000))
00162 , token_interval_us_(pset_.get<size_t>("token_interval_us", 5000))
00163 {
00164 assert(!(my_rank < 0));
00165 switch (my_rank)
00166 {
00167 case 0:
00168 role_ = TestRole_t::TOKEN_GEN;
00169 break;
00170 case 1:
00171 role_ = TestRole_t::ROUTING_MASTER;
00172 break;
00173 default:
00174 role_ = TestRole_t::TABLE_RECEIVER;
00175 break;
00176 }
00177 auto policy_pset = daq_pset_.get<fhicl::ParameterSet>("policy");
00178 eb_ranks_ = policy_pset.get<std::vector<int>>("receiver_ranks");
00179
00180 }
00181
00182 fhicl::ParameterSet RoutingMasterTest::getPset(int argc, char* argv[]) const
00183 {
00184 std::ostringstream descstr;
00185 descstr << "-- <-c <config-file>>";
00186 bpo::options_description desc(descstr.str());
00187 desc.add_options()
00188 ("config,c", bpo::value<std::string>(), "Configuration file.");
00189 bpo::variables_map vm;
00190 try
00191 {
00192 bpo::store(bpo::command_line_parser(argc, argv).
00193 options(desc).allow_unregistered().run(), vm);
00194 bpo::notify(vm);
00195 }
00196 catch (bpo::error const& e)
00197 {
00198 std::cerr << "Exception from command line processing in Config::getArtPset: " << e.what() << "\n";
00199 throw "cmdline parsing error.";
00200 }
00201 if (!vm.count("config"))
00202 {
00203 std::cerr << "Expected \"-- -c <config-file>\" fhicl file specification.\n";
00204 throw "cmdline parsing error.";
00205 }
00206 fhicl::ParameterSet pset;
00207 cet::filepath_lookup lookup_policy("FHICL_FILE_PATH");
00208 fhicl::make_ParameterSet(vm["config"].as<std::string>(), lookup_policy, pset);
00209
00210 return pset;
00211 }
00212
00213 void RoutingMasterTest::go()
00214 {
00215 if (LockFile::IsLocked("/tmp/routing_master_t.lock")) return;
00216 MPI_Barrier(MPI_COMM_WORLD);
00217 std::unique_ptr<LockFile> lock;
00218 if (my_rank == 0) {
00219 lock = std::make_unique<LockFile>("/tmp/routing_master_t.lock");
00220 }
00221
00222 MPI_Comm_split(MPI_COMM_WORLD, static_cast<int>(role_), 0, &local_group_comm_);
00223 switch (role_)
00224 {
00225 case TestRole_t::TABLE_RECEIVER:
00226 table_receiver();
00227 break;
00228 case TestRole_t::ROUTING_MASTER:
00229 routing_master();
00230 break;
00231 case TestRole_t::TOKEN_GEN:
00232 generate_tokens();
00233 break;
00234 default:
00235 throw "No such node type";
00236 }
00237 TLOG_DEBUG("routing_master") << "Rank " << my_rank << " complete." << TLOG_ENDL;
00238 }
00239
00240 void RoutingMasterTest::generate_tokens()
00241 {
00242 TLOG_DEBUG("generate_tokens") << "Init" << TLOG_ENDL;
00243 printHost("generate_tokens");
00244 sleep(1);
00245
00246 int token_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
00247 if (!token_socket)
00248 {
00249 TLOG_ERROR("generate_tokens") << "I failed to create the socket for sending Routing Tokens!" << TLOG_ENDL;
00250 exit(1);
00251 }
00252 struct sockaddr_in token_addr;
00253 auto sts = ResolveHost(routing_master_address_.c_str(), token_port_, token_addr);
00254 if(sts == -1)
00255 {
00256 TLOG_ERROR("generate_tokens") << "Could not resolve host name" << TLOG_ENDL;
00257 }
00258
00259 connect(token_socket, (struct sockaddr*)&token_addr, sizeof(token_addr));
00260
00261 int sent_tokens = 0;
00262 std::map<int, int> token_counter;
00263 for(auto rank : eb_ranks_)
00264 {
00265 token_counter[rank] = 0;
00266 }
00267 while (sent_tokens < token_count_) {
00268 int this_rank = eb_ranks_[rand() % eb_ranks_.size()];
00269 token_counter[this_rank]++;
00270 artdaq::detail::RoutingToken token;
00271 token.header = TOKEN_MAGIC;
00272 token.rank = this_rank;
00273 token.new_slots_free = 1;
00274
00275 TLOG_DEBUG("generate_tokens") << "Sending RoutingToken " << std::to_string(++sent_tokens) << " for rank " << this_rank << " to " << routing_master_address_ << TLOG_ENDL;
00276 send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
00277 usleep(token_interval_us_);
00278 }
00279 auto max_rank = 0;
00280 for(auto rank : token_counter)
00281 {
00282 if (rank.second > max_rank) max_rank = rank.second;
00283 }
00284 for(auto rank : token_counter)
00285 {
00286 artdaq::detail::RoutingToken token;
00287 token.header = TOKEN_MAGIC;
00288 token.rank = rank.first;
00289 token.new_slots_free = max_rank - rank.second;
00290
00291 TLOG_DEBUG("generate_tokens") << "Sending RoutingToken " << std::to_string(++sent_tokens) << " for rank " << rank.first << " to " << routing_master_address_ << TLOG_ENDL;
00292 send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0);
00293 usleep(token_interval_us_);
00294
00295 }
00296
00297 MPI_Comm_free(&local_group_comm_);
00298 TLOG_INFO("generate_tokens") << "Waiting at MPI_Barrier" << TLOG_ENDL;
00299 MPI_Barrier(MPI_COMM_WORLD);
00300 TLOG_INFO("generate_tokens") << "Done with MPI_Barrier" << TLOG_ENDL;
00301 }
00302
00303 void RoutingMasterTest::table_receiver()
00304 {
00305 TLOG_DEBUG("table_receiver") << "Init" << TLOG_ENDL;
00306 printHost("table_receiver");
00307
00308
00309 auto table_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00310 if (!table_socket)
00311 {
00312 TLOG_ERROR("table_receiver") << "Error creating socket for receiving data requests!" << TLOG_ENDL;
00313 exit(1);
00314 }
00315
00316 struct sockaddr_in si_me_request;
00317
00318 int yes = 1;
00319 if (setsockopt(table_socket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00320 {
00321 TLOG_ERROR("table_receiver") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
00322 exit(1);
00323 }
00324 memset(&si_me_request, 0, sizeof(si_me_request));
00325 si_me_request.sin_family = AF_INET;
00326 si_me_request.sin_port = htons(table_port_);
00327 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00328 if (bind(table_socket, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00329 {
00330 TLOG_ERROR("table_receiver") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
00331 exit(1);
00332 }
00333
00334 struct ip_mreq mreq;
00335 long int sts = ResolveHost(multicast_address_.c_str(), mreq.imr_multiaddr);
00336 if(sts == -1)
00337 {
00338 TLOG_ERROR("table_Receiver") << "Unable to resolve multicast hostname" << TLOG_ENDL;
00339 exit(1);
00340 }
00341 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00342 if (setsockopt(table_socket, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00343 {
00344 TLOG_ERROR("table_receiver") << "Unable to join multicast group" << TLOG_ENDL;
00345 exit(1);
00346 }
00347
00348 struct epoll_event ev;
00349 int table_epoll_fd = epoll_create1(0);
00350 ev.events = EPOLLIN | EPOLLPRI;
00351 ev.data.fd = table_socket;
00352 if (epoll_ctl(table_epoll_fd, EPOLL_CTL_ADD, table_socket, &ev) == -1)
00353 {
00354 TLOG_ERROR("table_receiver") << "Could not register listen socket to epoll fd" << TLOG_ENDL;
00355 exit(3);
00356 }
00357
00358 auto ack_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00359 struct sockaddr_in ack_addr;
00360 sts = ResolveHost(routing_master_address_.c_str(), ack_port_, ack_addr);
00361 if(sts == -1)
00362 {
00363 TLOG_ERROR("table_Receiver") << "Unable to resolve routing master hostname" << TLOG_ENDL;
00364 exit(1);
00365 }
00366
00367 if (table_socket == -1 || table_epoll_fd == -1 || ack_socket == -1)
00368 {
00369 TLOG_DEBUG("table_receiver") << "One of the listen sockets was not opened successfully." << TLOG_ENDL;
00370 exit(4);
00371 }
00372 artdaq::Fragment::sequence_id_t max_sequence_id = token_count_;
00373 artdaq::Fragment::sequence_id_t current_sequence_id = 0;
00374 std::map<artdaq::Fragment::sequence_id_t, int> routing_table;
00375 TLOG_INFO("table_receiver") << "Expecting " << std::to_string(max_sequence_id) << " as the last Sequence ID in this run" << TLOG_ENDL;
00376 while (current_sequence_id < max_sequence_id)
00377 {
00378 std::vector<epoll_event> table_events_(4);
00379 TLOG_DEBUG("table_receiver") << "Waiting for event on table socket" << TLOG_ENDL;
00380 auto nfds = epoll_wait(table_epoll_fd, &table_events_[0], table_events_.size(), -1);
00381 if (nfds == -1) {
00382 perror("epoll_wait");
00383 exit(EXIT_FAILURE);
00384 }
00385
00386 TLOG_DEBUG("table_receiver") << "Received " << nfds << " table update(s)" << TLOG_ENDL;
00387 for (auto n = 0; n < nfds; ++n) {
00388 auto first = artdaq::Fragment::InvalidSequenceID;
00389 auto last = artdaq::Fragment::InvalidSequenceID;
00390 artdaq::detail::RoutingPacketHeader hdr;
00391 recv(table_events_[n].data.fd, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0);
00392
00393 TLOG_DEBUG("table_receiver") << "Checking for valid header" << TLOG_ENDL;
00394 if (hdr.header == ROUTING_MAGIC) {
00395 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00396 TLOG_DEBUG("table_receiver") << "Receiving data buffer" << TLOG_ENDL;
00397 sts = recv(table_events_[n].data.fd, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
00398 assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00399
00400 first = buffer[0].sequence_id;
00401 last = buffer[buffer.size() - 1].sequence_id;
00402
00403 for (auto entry : buffer)
00404 {
00405 if (routing_table.count(entry.sequence_id))
00406 {
00407 assert(routing_table[entry.sequence_id] == entry.destination_rank);
00408 continue;
00409 }
00410 routing_table[entry.sequence_id] = entry.destination_rank;
00411 TLOG_DEBUG("table_receiver") << "table_receiver " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
00412 }
00413
00414 artdaq::detail::RoutingAckPacket ack;
00415 ack.rank = my_rank;
00416 ack.first_sequence_id = first;
00417 ack.last_sequence_id = last;
00418
00419 TLOG_DEBUG("table_receiver") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << routing_master_address_ << ", port " << ack_port_ << TLOG_ENDL
00420 sendto(ack_socket, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr, sizeof(ack_addr));
00421 current_sequence_id = last;
00422 }
00423 }
00424 }
00425
00426 MPI_Comm_free(&local_group_comm_);
00427 TLOG_INFO("table_receiver") << "Waiting at MPI_Barrier" << TLOG_ENDL;
00428 MPI_Barrier(MPI_COMM_WORLD);
00429 TLOG_INFO("table_receiver") << "Done with MPI_Barrier" << TLOG_ENDL;
00430 }
00431
00432 void RoutingMasterTest::routing_master()
00433 {
00434 TLOG_DEBUG("routing_master") << "Init" << TLOG_ENDL;
00435 printHost("routing_master");
00436
00437 auto app = std::make_unique<artdaq::RoutingMasterApp>(local_group_comm_, "RoutingMaster");
00438
00439 app->initialize(pset_, 0, 0);
00440 app->do_start(art::RunID(1), 0, 0);
00441 TLOG_INFO("routing_master") << "Waiting at MPI_Barrier" << TLOG_ENDL;
00442 MPI_Barrier(MPI_COMM_WORLD);
00443 TLOG_INFO("routing_master") << "Done with MPI_Barrier, calling RoutingMasterCore::stop" << TLOG_ENDL;
00444 app->do_stop(0, 0);
00445 TLOG_INFO("routing_master") << "Done with RoutingMasterCore::stop, calling shutdown" << TLOG_ENDL;
00446 app->do_shutdown(0);
00447 TLOG_INFO("routing_master") << "Done with RoutingMasterCore::shutdown" << TLOG_ENDL;
00448 MPI_Comm_free(&local_group_comm_);
00449 }
00450
00451 void RoutingMasterTest::printHost(const std::string& functionName) const
00452 {
00453 char* doPrint = getenv("PRINT_HOST");
00454 if (doPrint == 0) { return; }
00455 const int ARRSIZE = 80;
00456 char hostname[ARRSIZE];
00457 std::string hostString;
00458 if (!gethostname(hostname, ARRSIZE))
00459 {
00460 hostString = hostname;
00461 }
00462 else
00463 {
00464 hostString = "unknown";
00465 }
00466 TLOG_DEBUG("routing_master") << "Running " << functionName
00467 << " on host " << hostString
00468 << " with rank " << my_rank << "."
00469 << TLOG_ENDL;
00470 }
00471
00472 void printUsage()
00473 {
00474 int myid = 0;
00475 struct rusage usage;
00476 getrusage(RUSAGE_SELF, &usage);
00477 std::cout << myid << ":"
00478 << " user=" << artdaq::Globals::timevalAsDouble(usage.ru_utime)
00479 << " sys=" << artdaq::Globals::timevalAsDouble(usage.ru_stime)
00480 << std::endl;
00481 }
00482
00483 int main(int argc, char* argv[])
00484 {
00485 artdaq::configureMessageFacility("routing_master", false);
00486 int rc = 1;
00487 try
00488 {
00489 RoutingMasterTest p(argc, argv);
00490 std::cerr << "Started process " << my_rank << " of " << p.procs_ << ".\n";
00491 p.go();
00492 rc = 0;
00493 }
00494 catch (std::string& x)
00495 {
00496 std::cerr << "Exception (type string) caught in routing_master: "
00497 << x
00498 << '\n';
00499 return 1;
00500 }
00501 catch (char const* m)
00502 {
00503 std::cerr << "Exception (type char const*) caught in routing_master: ";
00504 if (m)
00505 {
00506 std::cerr << m;
00507 }
00508 else
00509 {
00510 std::cerr << "[the value was a null pointer, so no message is available]";
00511 }
00512 std::cerr << '\n';
00513 }
00514 return rc;
00515 }