$treeview $search $mathjax $extrastylesheet
artdaq_mpich_plugin
v1_00_06a
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #define TRACE_NAME "routing_master_t" 00002 #include "MPIProg.hh" 00003 #include "artdaq/DAQrate/detail/RoutingPacket.hh" 00004 #include "artdaq/DAQdata/TCPConnect.hh" 00005 #include "artdaq-mpich-plugin/Utilities/quiet_mpi.hh" 00006 #include "cetlib/filepath_maker.h" 00007 #include "fhiclcpp/ParameterSet.h" 00008 #include "fhiclcpp/make_ParameterSet.h" 00009 00010 #include <boost/program_options.hpp> 00011 #include "artdaq/Application/RoutingMasterCore.hh" 00012 #include "artdaq/Application/RoutingMasterApp.hh" 00013 #include <netdb.h> 00014 namespace bpo = boost::program_options; 00015 00016 #include <algorithm> 00017 #include <cmath> 00018 #include <cstdio> 00019 00020 extern "C" 00021 { 00022 #include <unistd.h> 00023 } 00024 00025 #include <iostream> 00026 #include <memory> 00027 #include <utility> 00028 #include <arpa/inet.h> 00029 #include <netinet/in.h> 00030 #include <sys/types.h> 00031 #include <sys/socket.h> 00032 00033 extern "C" 00034 { 00035 #include <sys/time.h> 00036 #include <sys/resource.h> 00037 } 00038 00042 class RoutingMasterTest : public MPIProg 00043 { 00044 public: 00059 RoutingMasterTest(int argc, char* argv[]); 00060 00064 void go(); 00065 00069 void generate_tokens(); 00070 00074 void routing_master(); 00075 00079 void table_receiver(); 00080 00087 fhicl::ParameterSet getPset(int argc, char* argv[]) const; 00088 00089 private: 00090 enum class TestRole_t : int 00091 { 00092 TOKEN_GEN = 0, 00093 ROUTING_MASTER = 1, 00094 TABLE_RECEIVER = 2 00095 }; 00096 00097 void printHost(const std::string& functionName) const; 00098 00099 fhicl::ParameterSet const pset_; 00100 fhicl::ParameterSet const daq_pset_; 00101 TestRole_t role_; 00102 00103 std::string routing_master_address_; 00104 std::string multicast_address_; 00105 int token_port_; 00106 int table_port_; 00107 int ack_port_; 00108 std::vector<int> eb_ranks_; 00109 int token_count_; 00110 size_t token_interval_us_; 00111 }; 00112 00113 RoutingMasterTest::RoutingMasterTest(int argc, char* argv[]) : 00114 MPIProg(argc, argv) 00115 , pset_(getPset(argc, argv)) 00116 , daq_pset_(pset_.get<fhicl::ParameterSet>("daq")) 00117 , routing_master_address_(daq_pset_.get<std::string>("routing_master_hostname", "localhost")) 00118 , multicast_address_(daq_pset_.get<std::string>("table_update_address", "227.128.12.28")) 00119 , token_port_(daq_pset_.get<int>("routing_token_port", 35555)) 00120 , table_port_(daq_pset_.get<int>("table_update_port", 35556)) 00121 , ack_port_(daq_pset_.get<int>("table_acknowledge_port", 35557)) 00122 , token_count_(pset_.get<int>("token_count", 1000)) 00123 , token_interval_us_(pset_.get<size_t>("token_interval_us", 5000)) 00124 { 00125 assert(!(my_rank < 0)); 00126 switch (my_rank) 00127 { 00128 case 0: 00129 role_ = TestRole_t::TOKEN_GEN; 00130 break; 00131 case 1: 00132 role_ = TestRole_t::ROUTING_MASTER; 00133 break; 00134 default: 00135 role_ = TestRole_t::TABLE_RECEIVER; 00136 break; 00137 } 00138 auto policy_pset = daq_pset_.get<fhicl::ParameterSet>("policy"); 00139 eb_ranks_ = policy_pset.get<std::vector<int>>("receiver_ranks"); 00140 } 00141 00142 fhicl::ParameterSet RoutingMasterTest::getPset(int argc, char* argv[]) const 00143 { 00144 std::ostringstream descstr; 00145 descstr << "-- <-c <config-file>>"; 00146 bpo::options_description desc(descstr.str()); 00147 desc.add_options() 00148 ("config,c", bpo::value<std::string>(), "Configuration file."); 00149 bpo::variables_map vm; 00150 try 00151 { 00152 bpo::store(bpo::command_line_parser(argc, argv). 00153 options(desc).allow_unregistered().run(), vm); 00154 bpo::notify(vm); 00155 } 00156 catch (bpo::error const& e) 00157 { 00158 std::cerr << "Exception from command line processing in Config::getArtPset: " << e.what() << "\n"; 00159 throw "cmdline parsing error."; 00160 } 00161 if (!vm.count("config")) 00162 { 00163 std::cerr << "Expected \"-- -c <config-file>\" fhicl file specification.\n"; 00164 throw "cmdline parsing error."; 00165 } 00166 fhicl::ParameterSet pset; 00167 cet::filepath_lookup lookup_policy("FHICL_FILE_PATH"); 00168 fhicl::make_ParameterSet(vm["config"].as<std::string>(), lookup_policy, pset); 00169 00170 return pset; 00171 } 00172 00173 void RoutingMasterTest::go() 00174 { 00175 TLOG(TLVL_INFO) << "Entering MPI_Barrier"; 00176 MPI_Barrier(MPI_COMM_WORLD); 00177 TLOG(TLVL_INFO) << "Done with Barrier"; 00178 //std::cout << "daq_pset_: " << daq_pset_.to_string() << std::endl << "conf_.makeParameterSet(): " << conf_.makeParameterSet().to_string() << std::endl; 00179 00180 switch (role_) 00181 { 00182 case TestRole_t::TABLE_RECEIVER: 00183 table_receiver(); 00184 break; 00185 case TestRole_t::ROUTING_MASTER: 00186 routing_master(); 00187 break; 00188 case TestRole_t::TOKEN_GEN: 00189 generate_tokens(); 00190 break; 00191 default: 00192 throw "No such node type"; 00193 } 00194 TLOG(TLVL_INFO) << "Rank " << my_rank << " complete." ; 00195 } 00196 00197 void RoutingMasterTest::generate_tokens() 00198 { 00199 TLOG(TLVL_INFO) << "generate_tokens(): Init" ; 00200 printHost("generate_tokens"); 00201 sleep(1); 00202 00203 int token_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); 00204 if (token_socket < 0) 00205 { 00206 TLOG(TLVL_ERROR) << "generate_tokens(): I failed to create the socket for sending Routing Tokens!" ; 00207 exit(1); 00208 } 00209 struct sockaddr_in token_addr; 00210 auto sts = ResolveHost(routing_master_address_.c_str(), token_port_, token_addr); 00211 if(sts == -1) 00212 { 00213 TLOG(TLVL_ERROR) << "generate_tokens(): Could not resolve host name" ; 00214 } 00215 00216 connect(token_socket, (struct sockaddr*)&token_addr, sizeof(token_addr)); 00217 00218 int sent_tokens = 0; 00219 std::map<int, int> token_counter; 00220 for(auto rank : eb_ranks_) 00221 { 00222 token_counter[rank] = 0; 00223 } 00224 while (sent_tokens < token_count_) { 00225 int this_rank = eb_ranks_[seedAndRandom() % eb_ranks_.size()]; 00226 token_counter[this_rank]++; 00227 artdaq::detail::RoutingToken token; 00228 token.header = TOKEN_MAGIC; 00229 token.rank = this_rank; 00230 token.new_slots_free = 1; 00231 token.run_number = 1; 00232 00233 TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << this_rank << " to " << routing_master_address_ ; 00234 send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0); 00235 usleep(token_interval_us_); 00236 } 00237 auto max_rank = 0; 00238 for(auto rank : token_counter) 00239 { 00240 if (rank.second > max_rank) max_rank = rank.second; 00241 } 00242 for(auto rank : token_counter) 00243 { 00244 artdaq::detail::RoutingToken token; 00245 token.header = TOKEN_MAGIC; 00246 token.rank = rank.first; 00247 token.new_slots_free = max_rank - rank.second; 00248 token.run_number = 1; 00249 00250 TLOG(TLVL_INFO) << "generate_tokens(): Sending RoutingToken " << ++sent_tokens << " for rank " << rank.first << " to " << routing_master_address_ ; 00251 send(token_socket, &token, sizeof(artdaq::detail::RoutingToken), 0); 00252 usleep(token_interval_us_); 00253 00254 } 00255 00256 TLOG(TLVL_INFO) << "generate_tokens(): Waiting at MPI_Barrier" ; 00257 MPI_Barrier(MPI_COMM_WORLD); 00258 TLOG(TLVL_INFO) << "generate_tokens(): Done with MPI_Barrier" ; 00259 } 00260 00261 void RoutingMasterTest::table_receiver() 00262 { 00263 TLOG(TLVL_INFO) << "table_receiver(): Init" ; 00264 printHost("table_receiver"); 00265 00266 00267 auto table_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00268 if (table_socket < 0) 00269 { 00270 TLOG(TLVL_ERROR) << "table_receiver(): Error creating socket for receiving data requests!" ; 00271 exit(1); 00272 } 00273 00274 struct sockaddr_in si_me_request; 00275 00276 int yes = 1; 00277 if (setsockopt(table_socket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00278 { 00279 TLOG(TLVL_ERROR) << "table_receiver(): Unable to enable port reuse on request socket" ; 00280 exit(1); 00281 } 00282 memset(&si_me_request, 0, sizeof(si_me_request)); 00283 si_me_request.sin_family = AF_INET; 00284 si_me_request.sin_port = htons(table_port_); 00285 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY); 00286 if (bind(table_socket, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1) 00287 { 00288 TLOG(TLVL_ERROR) << "table_receiver(): Cannot bind request socket to port " << table_port_ ; 00289 exit(1); 00290 } 00291 00292 struct ip_mreq mreq; 00293 long int sts = ResolveHost(multicast_address_.c_str(), mreq.imr_multiaddr); 00294 if(sts == -1) 00295 { 00296 TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve multicast hostname" ; 00297 exit(1); 00298 } 00299 mreq.imr_interface.s_addr = htonl(INADDR_ANY); 00300 if (setsockopt(table_socket, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) 00301 { 00302 TLOG(TLVL_ERROR) << "table_receiver(): Unable to join multicast group" ; 00303 exit(1); 00304 } 00305 00306 struct epoll_event ev; 00307 int table_epoll_fd = epoll_create1(0); 00308 ev.events = EPOLLIN | EPOLLPRI; 00309 ev.data.fd = table_socket; 00310 if (epoll_ctl(table_epoll_fd, EPOLL_CTL_ADD, table_socket, &ev) == -1) 00311 { 00312 TLOG(TLVL_ERROR) << "table_receiver(): Could not register listen socket to epoll fd" ; 00313 exit(3); 00314 } 00315 00316 auto ack_socket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00317 struct sockaddr_in ack_addr; 00318 sts = ResolveHost(routing_master_address_.c_str(), ack_port_, ack_addr); 00319 if(sts == -1) 00320 { 00321 TLOG(TLVL_ERROR) << "table_receiver(): Unable to resolve routing master hostname" ; 00322 exit(1); 00323 } 00324 00325 if (table_socket == -1 || table_epoll_fd == -1 || ack_socket == -1) 00326 { 00327 TLOG(TLVL_INFO) << "table_receiver(): One of the listen sockets was not opened successfully." ; 00328 exit(4); 00329 } 00330 artdaq::Fragment::sequence_id_t max_sequence_id = token_count_; 00331 artdaq::Fragment::sequence_id_t current_sequence_id = 0; 00332 std::map<artdaq::Fragment::sequence_id_t, int> routing_table; 00333 TLOG(TLVL_INFO) << "table_receiver(): Expecting " << max_sequence_id << " as the last Sequence ID in this run" ; 00334 while (current_sequence_id < max_sequence_id) 00335 { 00336 std::vector<epoll_event> table_events_(4); 00337 TLOG(TLVL_INFO) << "table_receiver(): Waiting for event on table socket" ; 00338 auto nfds = epoll_wait(table_epoll_fd, &table_events_[0], table_events_.size(), -1); 00339 if (nfds == -1) { 00340 perror("epoll_wait"); 00341 exit(EXIT_FAILURE); 00342 } 00343 00344 TLOG(TLVL_INFO) << "table_receiver(): Received " << nfds << " table update(s)" ; 00345 for (auto n = 0; n < nfds; ++n) { 00346 auto first = artdaq::Fragment::InvalidSequenceID; 00347 auto last = artdaq::Fragment::InvalidSequenceID; 00348 00349 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE); 00350 artdaq::detail::RoutingPacketHeader hdr; 00351 auto stss = recv(table_events_[n].data.fd, &buf[0], MAX_ROUTING_TABLE_SIZE, 0); 00352 00353 00354 if (stss > static_cast<ssize_t>(sizeof(hdr))) 00355 { 00356 memcpy(&hdr, &buf[0], sizeof(hdr)); 00357 } 00358 else 00359 { 00360 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding."; 00361 continue; 00362 } 00363 00364 TLOG(TLVL_INFO) << "table_receiver(): Checking for valid header" ; 00365 if (hdr.header == ROUTING_MAGIC) { 00366 artdaq::detail::RoutingPacket buffer(hdr.nEntries); 00367 TLOG(TLVL_INFO) << "table_receiver(): Receiving data buffer" ; 00368 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00369 00370 first = buffer[0].sequence_id; 00371 last = buffer[buffer.size() - 1].sequence_id; 00372 00373 for (auto entry : buffer) 00374 { 00375 if (routing_table.count(entry.sequence_id)) 00376 { 00377 assert(routing_table[entry.sequence_id] == entry.destination_rank); 00378 continue; 00379 } 00380 routing_table[entry.sequence_id] = entry.destination_rank; 00381 TLOG(TLVL_INFO) << "table_receiver(): table_receiver " << my_rank << ": received update: SeqID " << entry.sequence_id << " -> Rank " << entry.destination_rank ; 00382 } 00383 00384 artdaq::detail::RoutingAckPacket ack; 00385 ack.rank = my_rank; 00386 ack.first_sequence_id = first; 00387 ack.last_sequence_id = last; 00388 00389 TLOG(TLVL_INFO) << "table_receiver(): Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << routing_master_address_ << ", port " << ack_port_ ; 00390 sendto(ack_socket, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr, sizeof(ack_addr)); 00391 current_sequence_id = last; 00392 } 00393 } 00394 } 00395 00396 TLOG(TLVL_INFO) << "table_receiver(): Waiting at MPI_Barrier" ; 00397 MPI_Barrier(MPI_COMM_WORLD); 00398 TLOG(TLVL_INFO) << "table_receiver(): Done with MPI_Barrier" ; 00399 } 00400 00401 void RoutingMasterTest::routing_master() 00402 { 00403 TLOG(TLVL_INFO) << "routing_master: Init" ; 00404 printHost("routing_master"); 00405 00406 app_name = "RoutingMaster"; 00407 00408 auto app = std::make_unique<artdaq::RoutingMasterApp>(); 00409 00410 app->initialize(pset_, 0, 0); 00411 app->do_start(art::RunID(1), 0, 0); 00412 TLOG(TLVL_INFO) << "routing_master: Waiting at MPI_Barrier" ; 00413 MPI_Barrier(MPI_COMM_WORLD); 00414 TLOG(TLVL_INFO) << "routing_master: Done with MPI_Barrier, calling RoutingMasterCore::stop" ; 00415 app->do_stop(0, 0); 00416 TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::stop, calling shutdown" ; 00417 app->do_shutdown(0); 00418 TLOG(TLVL_INFO) << "routing_master: Done with RoutingMasterCore::shutdown" ; 00419 } 00420 00421 void RoutingMasterTest::printHost(const std::string& functionName) const 00422 { 00423 char* doPrint = getenv("PRINT_HOST"); 00424 if (doPrint == 0) { return; } 00425 const int ARRSIZE = 80; 00426 char hostname[ARRSIZE]; 00427 std::string hostString; 00428 if (!gethostname(hostname, ARRSIZE)) 00429 { 00430 hostString = hostname; 00431 } 00432 else 00433 { 00434 hostString = "unknown"; 00435 } 00436 TLOG(TLVL_INFO) << "Running " << functionName 00437 << " on host " << hostString 00438 << " with rank " << my_rank << "." 00439 ; 00440 } 00441 00442 void printUsage() 00443 { 00444 int myid = 0; 00445 struct rusage usage; 00446 getrusage(RUSAGE_SELF, &usage); 00447 std::cout << myid << ":" 00448 << " user=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_utime) 00449 << " sys=" << artdaq::TimeUtils::convertUnixTimeToSeconds(usage.ru_stime) 00450 << std::endl; 00451 } 00452 00453 int main(int argc, char* argv[]) 00454 { 00455 artdaq::configureMessageFacility("routing_master", true); 00456 int rc = 1; 00457 #if 0 00458 std::cerr << "PID: " << getpid() << std::endl; 00459 volatile bool attach = true; 00460 while (attach) 00461 { 00462 usleep(100000); 00463 } 00464 #endif 00465 00466 try 00467 { 00468 RoutingMasterTest p(argc, argv); 00469 std::cerr << "Started process " << my_rank << " of " << p.procs_ << ".\n"; 00470 p.go(); 00471 rc = 0; 00472 } 00473 catch (std::string& x) 00474 { 00475 std::cerr << "Exception (type string) caught in routing_master: " 00476 << x 00477 << '\n'; 00478 return 1; 00479 } 00480 catch (char const* m) 00481 { 00482 std::cerr << "Exception (type char const*) caught in routing_master: "; 00483 if (m) 00484 { 00485 std::cerr << m; 00486 } 00487 else 00488 { 00489 std::cerr << "[the value was a null pointer, so no message is available]"; 00490 } 00491 std::cerr << '\n'; 00492 } 00493 return rc; 00494 }