$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_00
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #define TRACE_NAME "RoutingReceiver" 00002 #include "artdaq/DAQdata/Globals.hh" 00003 00004 #include <chrono> 00005 #include "canvas/Utilities/Exception.h" 00006 #include <arpa/inet.h> 00007 #include <netinet/in.h> 00008 #include <sys/types.h> 00009 #include <poll.h> 00010 #include <signal.h> 00011 #include <thread> 00012 #include <sys/socket.h> 00013 #include "artdaq/DAQdata/TCPConnect.hh" 00014 #include "artdaq/Application/LoadParameterSet.hh" 00015 #include "artdaq/DAQrate/detail/RoutingPacket.hh" 00016 #include "artdaq/TransferPlugins/detail/HostMap.hh" 00017 #include "proto/artdaqapp.hh" 00018 #include "fhiclcpp/types/Atom.h" 00019 #include "fhiclcpp/types/OptionalTable.h" 00020 #include "fhiclcpp/types/TableFragment.h" 00021 00022 namespace artdaq 00023 { 00024 class RoutingReceiver 00025 { 00026 public: 00027 struct Config 00028 { 00030 fhicl::Atom<size_t> collection_time_ms{ fhicl::Name{ "collection_time_ms" }, fhicl::Comment{ "Time to collect routing table updates between printing summaries" }, 1000 }; 00032 fhicl::Atom<bool> print_verbose_info{ fhicl::Name{ "print_verbose_info" }, fhicl::Comment{ "Print verbose information about each receiver detected in routing tables" }, true }; 00034 fhicl::Atom<size_t> graph_width{ fhicl::Name{ "graph_width" }, fhicl::Comment{ "Width of the summary graph" }, 40 }; 00035 fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig; 00036 }; 00037 using Parameters = fhicl::WrappedTable<Config>; 00038 00039 explicit RoutingReceiver(fhicl::ParameterSet const& pset) 00040 : should_stop_(false) 00041 , table_socket_(-1) 00042 , routing_table_last_(0) 00043 { 00044 TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string(); 00045 00046 // Validate parameters 00047 00048 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet()); 00049 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false); 00050 table_port_ = rmConfig.get<int>("table_update_port", 35556); 00051 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28"); 00052 00053 host_map_ = MakeHostMap(pset); 00054 00055 if (use_routing_master_) startTableReceiverThread_(); 00056 } 00057 00058 ~RoutingReceiver() 00059 { 00060 TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN"; 00061 should_stop_ = true; 00062 if (routing_thread_.joinable()) routing_thread_.join(); 00063 TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END."; 00064 } 00065 00066 std::map<Fragment::sequence_id_t, int> GetRoutingTable() 00067 { 00068 std::unique_lock<std::mutex> lk(routing_mutex_); 00069 std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_); 00070 return routing_table_copy; 00071 } 00072 00073 std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable() 00074 { 00075 std::unique_lock<std::mutex> lk(routing_mutex_); 00076 std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_); 00077 routing_table_.clear(); 00078 return routing_table_copy; 00079 } 00080 00081 hostMap_t GetHostMap() { return host_map_; } 00082 00083 private: 00084 void setupTableListener_() 00085 { 00086 int sts; 00087 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00088 if (table_socket_ < 0) 00089 { 00090 TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!"; 00091 exit(1); 00092 } 00093 00094 struct sockaddr_in si_me_request; 00095 00096 int yes = 1; 00097 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00098 { 00099 TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket"; 00100 exit(1); 00101 } 00102 memset(&si_me_request, 0, sizeof(si_me_request)); 00103 si_me_request.sin_family = AF_INET; 00104 si_me_request.sin_port = htons(table_port_); 00105 //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY); 00106 struct in_addr in_addr_s; 00107 sts = inet_aton(table_address_.c_str(), &in_addr_s); 00108 if (sts == 0) 00109 { 00110 TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid"; 00111 } 00112 si_me_request.sin_addr.s_addr = in_addr_s.s_addr; 00113 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1) 00114 { 00115 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_; 00116 exit(1); 00117 } 00118 00119 struct ip_mreq mreq; 00120 sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr); 00121 if (sts == -1) 00122 { 00123 TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates"; 00124 exit(1); 00125 } 00126 mreq.imr_interface.s_addr = htonl(INADDR_ANY); 00127 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) 00128 { 00129 TLOG(TLVL_ERROR) << "Unable to join multicast group"; 00130 exit(1); 00131 } 00132 } 00133 void startTableReceiverThread_() 00134 { 00135 if (routing_thread_.joinable()) routing_thread_.join(); 00136 TLOG(TLVL_INFO) << "Starting Routing Thread"; 00137 try { 00138 routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this); 00139 } 00140 catch (const boost::exception& e) 00141 { 00142 TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno; 00143 std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00144 exit(5); 00145 } 00146 } 00147 void receiveTableUpdatesLoop_() 00148 { 00149 while (true) 00150 { 00151 if (should_stop_) 00152 { 00153 TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping"; 00154 return; 00155 } 00156 00157 TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes"; 00158 if (table_socket_ == -1) 00159 { 00160 TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket"; 00161 setupTableListener_(); 00162 } 00163 if (table_socket_ == -1) 00164 { 00165 TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully."; 00166 return; 00167 } 00168 00169 struct pollfd fd; 00170 fd.fd = table_socket_; 00171 fd.events = POLLIN | POLLPRI; 00172 00173 auto res = poll(&fd, 1, 1000); 00174 if (res > 0) 00175 { 00176 auto first = artdaq::Fragment::InvalidSequenceID; 00177 auto last = artdaq::Fragment::InvalidSequenceID; 00178 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE); 00179 artdaq::detail::RoutingPacketHeader hdr; 00180 00181 TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader"; 00182 struct sockaddr_in from; 00183 socklen_t len = sizeof(from); 00184 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len); 00185 TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port; 00186 00187 if (stss > static_cast<ssize_t>(sizeof(hdr))) 00188 { 00189 memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader)); 00190 } 00191 else 00192 { 00193 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding."; 00194 continue; 00195 } 00196 00197 TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]); 00198 if (hdr.header != ROUTING_MAGIC) 00199 { 00200 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss; 00201 } 00202 else 00203 { 00204 artdaq::detail::RoutingPacket buffer(hdr.nEntries); 00205 assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00206 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00207 TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]); 00208 00209 first = buffer[0].sequence_id; 00210 last = buffer[buffer.size() - 1].sequence_id; 00211 00212 if (first + hdr.nEntries - 1 != last) 00213 { 00214 TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!"; 00215 continue; 00216 } 00217 auto thisSeqID = first; 00218 00219 { 00220 std::unique_lock<std::mutex> lck(routing_mutex_); 00221 if (routing_table_.count(last) == 0) 00222 { 00223 for (auto entry : buffer) 00224 { 00225 if (thisSeqID != entry.sequence_id) 00226 { 00227 TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!"; 00228 last = thisSeqID - 1; 00229 break; 00230 } 00231 thisSeqID++; 00232 if (routing_table_.count(entry.sequence_id)) 00233 { 00234 if (routing_table_[entry.sequence_id] != entry.destination_rank) 00235 { 00236 TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id 00237 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!" 00238 << " I will use the original value!"; 00239 } 00240 continue; 00241 } 00242 if (entry.sequence_id < routing_table_last_) continue; 00243 routing_table_[entry.sequence_id] = entry.destination_rank; 00244 TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id 00245 << " -> Rank " << entry.destination_rank; 00246 } 00247 } 00248 00249 TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table"; 00250 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first; 00251 00252 auto counter = 0; 00253 for (auto& entry : routing_table_) 00254 { 00255 TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second; 00256 counter++; 00257 } 00258 } 00259 00260 if (last > routing_table_last_) routing_table_last_ = last; 00261 } 00262 } 00263 } 00264 } 00265 00266 private: 00267 bool use_routing_master_; 00268 std::atomic<bool> should_stop_; 00269 int table_port_; 00270 std::string table_address_; 00271 int table_socket_; 00272 std::map<Fragment::sequence_id_t, int> routing_table_; 00273 Fragment::sequence_id_t routing_table_last_; 00274 mutable std::mutex routing_mutex_; 00275 boost::thread routing_thread_; 00276 hostMap_t host_map_; 00277 }; 00278 } 00279 00280 00281 static bool sighandler_init = false; 00282 static bool should_stop = false; 00283 static void signal_handler(int signum) 00284 { 00285 // Messagefacility may already be gone at this point, TRACE ONLY! 00286 TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!"; 00287 00288 should_stop = true; 00289 00290 sigset_t set; 00291 pthread_sigmask(SIG_UNBLOCK, NULL, &set); 00292 pthread_sigmask(SIG_UNBLOCK, &set, NULL); 00293 00294 } 00295 00296 int main(int argc, char* argv[]) 00297 { 00298 artdaq::configureMessageFacility("RoutingReceiver", false, false); 00299 static std::mutex sighandler_mutex; 00300 std::unique_lock<std::mutex> lk(sighandler_mutex); 00301 00302 if (!sighandler_init)//&& manager_id_ == 0) // ELF 3/22/18: Taking out manager_id_==0 requirement as I think kill(getpid()) is enough protection 00303 { 00304 sighandler_init = true; 00305 std::vector<int> signals = { SIGINT, SIGTERM, SIGUSR1, SIGUSR2 }; // SIGQUIT is used by art in normal operation 00306 for (auto signal : signals) 00307 { 00308 struct sigaction old_action; 00309 sigaction(signal, NULL, &old_action); 00310 00311 //If the old handler wasn't SIG_IGN (it's a handler that just 00312 // "ignore" the signal) 00313 if (old_action.sa_handler != SIG_IGN) 00314 { 00315 struct sigaction action; 00316 action.sa_handler = signal_handler; 00317 sigemptyset(&action.sa_mask); 00318 for (auto sigblk : signals) 00319 { 00320 sigaddset(&action.sa_mask, sigblk); 00321 } 00322 action.sa_flags = 0; 00323 00324 //Replace the signal handler of SIGINT with the one described by new_action 00325 sigaction(signal, &action, NULL); 00326 } 00327 } 00328 } 00329 00330 fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers"); 00331 fhicl::ParameterSet config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps); 00332 fhicl::ParameterSet metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps); 00333 fhicl::ParameterSet fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps); 00334 00335 artdaq::RoutingReceiver rr(fr_ps); 00336 00337 auto host_map = rr.GetHostMap(); 00338 00339 size_t collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000); 00340 size_t max_graph_width = init_ps.get<size_t>("max_graph_width", 100); 00341 bool print_verbose = init_ps.get<bool>("print_verbose_info", true); 00342 bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true); 00343 00344 auto blue = "\033[34m"; 00345 auto cyan = "\033[36m"; 00346 auto green = "\033[32m"; 00347 auto yellow = "\033[93m"; 00348 auto red = "\033[31m"; 00349 00350 metricMan->initialize(metric_ps, "RoutingReceiver"); 00351 metricMan->do_start(); 00352 if (print_verbose && verbose_clear_screen) std::cout << "\033[2J"; 00353 00354 std::map<int, int> receiver_table = std::map<int,int>(); 00355 00356 while (!should_stop) 00357 { 00358 auto start_time = std::chrono::steady_clock::now(); 00359 00360 auto this_table = rr.GetAndClearRoutingTable(); 00361 00362 if (this_table.size() > 0) 00363 { 00364 auto graph_width = this_table.size(); 00365 auto n = 1; // n becomes entries per graph character 00366 auto graph_width_orig = graph_width; 00367 while (graph_width > max_graph_width) 00368 { 00369 n++; 00370 graph_width = graph_width_orig / n; 00371 } 00372 00373 for (auto& entry : this_table) 00374 { 00375 receiver_table[entry.second]++; 00376 } 00377 00378 auto average_entries_per_receiver = this_table.size() / receiver_table.size(); 00379 auto offset = 2 * n; // Offset is 2 characters, in entries 00380 00381 auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n; 00382 auto green_threshold = (average_entries_per_receiver - offset) / n; 00383 auto yellow_threshold = (average_entries_per_receiver + offset) / n; 00384 auto red_threshold = (2 * average_entries_per_receiver) / n; 00385 00386 TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold; 00387 00388 std::ostringstream report; 00389 std::ostringstream verbose_report; 00390 00391 if (print_verbose && verbose_clear_screen) std::cout << "\033[;H\033[J"; 00392 00393 report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, "; 00394 for (auto& receiver : receiver_table) 00395 { 00396 auto percent = static_cast<int>(receiver.second * 100 / this_table.size()); 00397 report << receiver.first << ": " << receiver.second << " (" << percent << "%), "; 00398 if (print_verbose) 00399 { 00400 verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t["; 00401 00402 size_t graph_characters = receiver.second / n; 00403 00404 for (size_t ii = 0; ii < graph_characters; ++ii) 00405 { 00406 if (ii < cyan_threshold) 00407 { 00408 verbose_report << blue; 00409 } 00410 else if (ii < green_threshold) 00411 { 00412 verbose_report << cyan; 00413 } 00414 else if (ii < yellow_threshold) 00415 { 00416 verbose_report << green; 00417 } 00418 else if (ii < red_threshold) 00419 { 00420 verbose_report << yellow; 00421 } 00422 else 00423 { 00424 verbose_report << red; 00425 } 00426 verbose_report << "|"; 00427 } 00428 std::string spaces = std::string(graph_width - graph_characters, ' '); 00429 verbose_report << "\033[0m" << spaces << "]" << std::endl; 00430 } 00431 receiver.second = 0; 00432 } 00433 TLOG(TLVL_INFO) << report.str(); 00434 std::cout << report.str() << std::endl; 00435 if(print_verbose) std::cout << verbose_report.str() << std::endl; 00436 } 00437 std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms)); 00438 } 00439 00440 metricMan->do_stop(); 00441 artdaq::Globals::CleanUpGlobals(); 00442 }