$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_01
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #define TRACE_NAME "RoutingReceiver" 00002 #include "artdaq/DAQdata/Globals.hh" 00003 00004 #include <chrono> 00005 #include "canvas/Utilities/Exception.h" 00006 #include <arpa/inet.h> 00007 #include <netinet/in.h> 00008 #include <sys/types.h> 00009 #include <poll.h> 00010 #include <signal.h> 00011 #include <thread> 00012 #include <sys/socket.h> 00013 #include "artdaq/DAQdata/TCPConnect.hh" 00014 #include "artdaq/Application/LoadParameterSet.hh" 00015 #include "artdaq/DAQrate/detail/RoutingPacket.hh" 00016 #include "artdaq/TransferPlugins/detail/HostMap.hh" 00017 #include "proto/artdaqapp.hh" 00018 #include "fhiclcpp/types/Atom.h" 00019 #include "fhiclcpp/types/OptionalTable.h" 00020 #include "fhiclcpp/types/TableFragment.h" 00021 00022 namespace artdaq { 00026 class RoutingReceiver 00027 { 00028 public: 00030 struct Config 00031 { 00033 fhicl::Atom<size_t> collection_time_ms{ fhicl::Name{ "collection_time_ms" }, fhicl::Comment{ "Time to collect routing table updates between printing summaries" }, 1000 }; 00035 fhicl::Atom<bool> print_verbose_info{ fhicl::Name{ "print_verbose_info" }, fhicl::Comment{ "Print verbose information about each receiver detected in routing tables" }, true }; 00037 fhicl::Atom<size_t> graph_width{ fhicl::Name{ "graph_width" }, fhicl::Comment{ "Width of the summary graph" }, 40 }; 00038 fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig; 00039 }; 00041 using Parameters = fhicl::WrappedTable<Config>; 00042 00047 explicit RoutingReceiver(fhicl::ParameterSet const& pset) 00048 : should_stop_(false) 00049 , table_socket_(-1) 00050 , routing_table_last_(0) 00051 { 00052 TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string(); 00053 00054 // Validate parameters 00055 00056 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet()); 00057 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false); 00058 table_port_ = rmConfig.get<int>("table_update_port", 35556); 00059 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28"); 00060 00061 host_map_ = MakeHostMap(pset); 00062 00063 if (use_routing_master_) startTableReceiverThread_(); 00064 } 00065 00069 ~RoutingReceiver() 00070 { 00071 TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN"; 00072 should_stop_ = true; 00073 if (routing_thread_.joinable()) routing_thread_.join(); 00074 TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END."; 00075 } 00076 00081 std::map<Fragment::sequence_id_t, int> GetRoutingTable() 00082 { 00083 std::unique_lock<std::mutex> lk(routing_mutex_); 00084 std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_); 00085 return routing_table_copy; 00086 } 00087 00092 std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable() 00093 { 00094 std::unique_lock<std::mutex> lk(routing_mutex_); 00095 std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_); 00096 routing_table_.clear(); 00097 return routing_table_copy; 00098 } 00099 00104 hostMap_t GetHostMap() { return host_map_; } 00105 00106 private: 00107 void setupTableListener_() 00108 { 00109 int sts; 00110 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00111 if (table_socket_ < 0) 00112 { 00113 TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!"; 00114 exit(1); 00115 } 00116 00117 struct sockaddr_in si_me_request; 00118 00119 int yes = 1; 00120 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00121 { 00122 TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket"; 00123 exit(1); 00124 } 00125 memset(&si_me_request, 0, sizeof(si_me_request)); 00126 si_me_request.sin_family = AF_INET; 00127 si_me_request.sin_port = htons(table_port_); 00128 //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY); 00129 struct in_addr in_addr_s; 00130 sts = inet_aton(table_address_.c_str(), &in_addr_s); 00131 if (sts == 0) 00132 { 00133 TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid"; 00134 } 00135 si_me_request.sin_addr.s_addr = in_addr_s.s_addr; 00136 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1) 00137 { 00138 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_; 00139 exit(1); 00140 } 00141 00142 struct ip_mreq mreq; 00143 sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr); 00144 if (sts == -1) 00145 { 00146 TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates"; 00147 exit(1); 00148 } 00149 mreq.imr_interface.s_addr = htonl(INADDR_ANY); 00150 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) 00151 { 00152 TLOG(TLVL_ERROR) << "Unable to join multicast group"; 00153 exit(1); 00154 } 00155 } 00156 void startTableReceiverThread_() 00157 { 00158 if (routing_thread_.joinable()) routing_thread_.join(); 00159 TLOG(TLVL_INFO) << "Starting Routing Thread"; 00160 try { 00161 routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this); 00162 } 00163 catch (const boost::exception& e) 00164 { 00165 TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno; 00166 std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00167 exit(5); 00168 } 00169 } 00170 void receiveTableUpdatesLoop_() 00171 { 00172 while (true) 00173 { 00174 if (should_stop_) 00175 { 00176 TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping"; 00177 return; 00178 } 00179 00180 TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes"; 00181 if (table_socket_ == -1) 00182 { 00183 TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket"; 00184 setupTableListener_(); 00185 } 00186 if (table_socket_ == -1) 00187 { 00188 TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully."; 00189 return; 00190 } 00191 00192 struct pollfd fd; 00193 fd.fd = table_socket_; 00194 fd.events = POLLIN | POLLPRI; 00195 00196 auto res = poll(&fd, 1, 1000); 00197 if (res > 0) 00198 { 00199 auto first = artdaq::Fragment::InvalidSequenceID; 00200 auto last = artdaq::Fragment::InvalidSequenceID; 00201 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE); 00202 artdaq::detail::RoutingPacketHeader hdr; 00203 00204 TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader"; 00205 struct sockaddr_in from; 00206 socklen_t len = sizeof(from); 00207 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len); 00208 TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port; 00209 00210 if (stss > static_cast<ssize_t>(sizeof(hdr))) 00211 { 00212 memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader)); 00213 } 00214 else 00215 { 00216 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding."; 00217 continue; 00218 } 00219 00220 TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]); 00221 if (hdr.header != ROUTING_MAGIC) 00222 { 00223 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss; 00224 } 00225 else 00226 { 00227 artdaq::detail::RoutingPacket buffer(hdr.nEntries); 00228 assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00229 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00230 TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]); 00231 00232 first = buffer[0].sequence_id; 00233 last = buffer[buffer.size() - 1].sequence_id; 00234 00235 if (first + hdr.nEntries - 1 != last) 00236 { 00237 TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!"; 00238 continue; 00239 } 00240 auto thisSeqID = first; 00241 00242 { 00243 std::unique_lock<std::mutex> lck(routing_mutex_); 00244 if (routing_table_.count(last) == 0) 00245 { 00246 for (auto entry : buffer) 00247 { 00248 if (thisSeqID != entry.sequence_id) 00249 { 00250 TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!"; 00251 last = thisSeqID - 1; 00252 break; 00253 } 00254 thisSeqID++; 00255 if (routing_table_.count(entry.sequence_id)) 00256 { 00257 if (routing_table_[entry.sequence_id] != entry.destination_rank) 00258 { 00259 TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id 00260 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!" 00261 << " I will use the original value!"; 00262 } 00263 continue; 00264 } 00265 if (entry.sequence_id < routing_table_last_) continue; 00266 routing_table_[entry.sequence_id] = entry.destination_rank; 00267 TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id 00268 << " -> Rank " << entry.destination_rank; 00269 } 00270 } 00271 00272 TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table"; 00273 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first; 00274 00275 auto counter = 0; 00276 for (auto& entry : routing_table_) 00277 { 00278 TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second; 00279 counter++; 00280 } 00281 } 00282 00283 if (last > routing_table_last_) routing_table_last_ = last; 00284 } 00285 } 00286 } 00287 } 00288 00289 private: 00290 bool use_routing_master_; 00291 std::atomic<bool> should_stop_; 00292 int table_port_; 00293 std::string table_address_; 00294 int table_socket_; 00295 std::map<Fragment::sequence_id_t, int> routing_table_; 00296 Fragment::sequence_id_t routing_table_last_; 00297 mutable std::mutex routing_mutex_; 00298 boost::thread routing_thread_; 00299 hostMap_t host_map_; 00300 }; 00301 } 00302 00303 00304 static bool sighandler_init = false; 00305 static bool should_stop = false; 00306 static void signal_handler(int signum) 00307 { 00308 // Messagefacility may already be gone at this point, TRACE ONLY! 00309 TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!"; 00310 00311 should_stop = true; 00312 00313 sigset_t set; 00314 pthread_sigmask(SIG_UNBLOCK, NULL, &set); 00315 pthread_sigmask(SIG_UNBLOCK, &set, NULL); 00316 00317 } 00318 00319 int main(int argc, char* argv[]) 00320 { 00321 artdaq::configureMessageFacility("RoutingReceiver", false, false); 00322 static std::mutex sighandler_mutex; 00323 std::unique_lock<std::mutex> lk(sighandler_mutex); 00324 00325 if (!sighandler_init)//&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection 00326 { 00327 sighandler_init = true; 00328 std::vector<int> signals = { SIGINT, SIGTERM, SIGUSR1, SIGUSR2 }; // SIGQUIT is used by art in normal operation 00329 for (auto signal : signals) 00330 { 00331 struct sigaction old_action; 00332 sigaction(signal, NULL, &old_action); 00333 00334 //If the old handler wasn't SIG_IGN (it's a handler that just 00335 // "ignore" the signal) 00336 if (old_action.sa_handler != SIG_IGN) 00337 { 00338 struct sigaction action; 00339 action.sa_handler = signal_handler; 00340 sigemptyset(&action.sa_mask); 00341 for (auto sigblk : signals) 00342 { 00343 sigaddset(&action.sa_mask, sigblk); 00344 } 00345 action.sa_flags = 0; 00346 00347 //Replace the signal handler of SIGINT with the one described by new_action 00348 sigaction(signal, &action, NULL); 00349 } 00350 } 00351 } 00352 00353 fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers"); 00354 fhicl::ParameterSet config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps); 00355 fhicl::ParameterSet metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps); 00356 fhicl::ParameterSet fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps); 00357 00358 artdaq::RoutingReceiver rr(fr_ps); 00359 00360 auto host_map = rr.GetHostMap(); 00361 00362 size_t collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000); 00363 size_t max_graph_width = init_ps.get<size_t>("max_graph_width", 100); 00364 bool print_verbose = init_ps.get<bool>("print_verbose_info", true); 00365 bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true); 00366 00367 auto blue = "\033[34m"; 00368 auto cyan = "\033[36m"; 00369 auto green = "\033[32m"; 00370 auto yellow = "\033[93m"; 00371 auto red = "\033[31m"; 00372 00373 metricMan->initialize(metric_ps, "RoutingReceiver"); 00374 metricMan->do_start(); 00375 if (print_verbose && verbose_clear_screen) std::cout << "\033[2J"; 00376 00377 std::map<int, int> receiver_table = std::map<int,int>(); 00378 00379 while (!should_stop) 00380 { 00381 auto start_time = std::chrono::steady_clock::now(); 00382 00383 auto this_table = rr.GetAndClearRoutingTable(); 00384 00385 if (this_table.size() > 0) 00386 { 00387 auto graph_width = this_table.size(); 00388 auto n = 1; // n becomes entries per graph character 00389 auto graph_width_orig = graph_width; 00390 while (graph_width > max_graph_width) 00391 { 00392 n++; 00393 graph_width = graph_width_orig / n; 00394 } 00395 00396 for (auto& entry : this_table) 00397 { 00398 receiver_table[entry.second]++; 00399 } 00400 00401 auto average_entries_per_receiver = this_table.size() / receiver_table.size(); 00402 auto offset = 2 * n; // Offset is 2 characters, in entries 00403 00404 auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n; 00405 auto green_threshold = (average_entries_per_receiver - offset) / n; 00406 auto yellow_threshold = (average_entries_per_receiver + offset) / n; 00407 auto red_threshold = (2 * average_entries_per_receiver) / n; 00408 00409 TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold; 00410 00411 std::ostringstream report; 00412 std::ostringstream verbose_report; 00413 00414 if (print_verbose && verbose_clear_screen) std::cout << "\033[;H\033[J"; 00415 00416 report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, "; 00417 for (auto& receiver : receiver_table) 00418 { 00419 auto percent = static_cast<int>(receiver.second * 100 / this_table.size()); 00420 report << receiver.first << ": " << receiver.second << " (" << percent << "%), "; 00421 if (print_verbose) 00422 { 00423 verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t["; 00424 00425 size_t graph_characters = receiver.second / n; 00426 00427 for (size_t ii = 0; ii < graph_characters; ++ii) 00428 { 00429 if (ii < cyan_threshold) 00430 { 00431 verbose_report << blue; 00432 } 00433 else if (ii < green_threshold) 00434 { 00435 verbose_report << cyan; 00436 } 00437 else if (ii < yellow_threshold) 00438 { 00439 verbose_report << green; 00440 } 00441 else if (ii < red_threshold) 00442 { 00443 verbose_report << yellow; 00444 } 00445 else 00446 { 00447 verbose_report << red; 00448 } 00449 verbose_report << "|"; 00450 } 00451 std::string spaces = std::string(graph_width - graph_characters, ' '); 00452 verbose_report << "\033[0m" << spaces << "]" << std::endl; 00453 } 00454 receiver.second = 0; 00455 } 00456 TLOG(TLVL_INFO) << report.str(); 00457 std::cout << report.str() << std::endl; 00458 if(print_verbose) std::cout << verbose_report.str() << std::endl; 00459 } 00460 std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms)); 00461 } 00462 00463 metricMan->do_stop(); 00464 artdaq::Globals::CleanUpGlobals(); 00465 }