00001 #define TRACE_NAME "RoutingReceiver"
#include "artdaq/DAQdata/Globals.hh"

#include <sys/types.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <signal.h>
#include <sys/socket.h>
#include <unistd.h>

#include <atomic>
#include <chrono>
#include <thread>

#include "canvas/Utilities/Exception.h"
#include "artdaq/DAQdata/TCPConnect.hh"
#include "artdaq/Application/LoadParameterSet.hh"
#include "artdaq/DAQrate/detail/RoutingPacket.hh"
#include "artdaq/TransferPlugins/detail/HostMap.hh"
#include "proto/artdaqapp.hh"
#include "fhiclcpp/types/Atom.h"
#include "fhiclcpp/types/OptionalTable.h"
#include "fhiclcpp/types/TableFragment.h"
00021
00022 namespace artdaq
00023 {
00024 class RoutingReceiver
00025 {
00026 public:
00027 struct Config
00028 {
00030 fhicl::Atom<size_t> collection_time_ms{ fhicl::Name{ "collection_time_ms" }, fhicl::Comment{ "Time to collect routing table updates between printing summaries" }, 1000 };
00032 fhicl::Atom<bool> print_verbose_info{ fhicl::Name{ "print_verbose_info" }, fhicl::Comment{ "Print verbose information about each receiver detected in routing tables" }, true };
00034 fhicl::Atom<size_t> graph_width{ fhicl::Name{ "graph_width" }, fhicl::Comment{ "Width of the summary graph" }, 40 };
00035 fhicl::TableFragment<artdaq::artdaqapp::Config> artdaqAppConfig;
00036 };
00037 using Parameters = fhicl::WrappedTable<Config>;
00038
00039 explicit RoutingReceiver(fhicl::ParameterSet const& pset)
00040 : should_stop_(false)
00041 , table_socket_(-1)
00042 , routing_table_last_(0)
00043 {
00044 TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
00045
00046
00047
00048 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
00049 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
00050 table_port_ = rmConfig.get<int>("table_update_port", 35556);
00051 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
00052
00053 host_map_ = MakeHostMap(pset);
00054
00055 if (use_routing_master_) startTableReceiverThread_();
00056 }
00057
00058 ~RoutingReceiver()
00059 {
00060 TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver BEGIN";
00061 should_stop_ = true;
00062 if (routing_thread_.joinable()) routing_thread_.join();
00063 TLOG(TLVL_DEBUG) << "Shutting down RoutingReceiver END.";
00064 }
00065
00066 std::map<Fragment::sequence_id_t, int> GetRoutingTable()
00067 {
00068 std::unique_lock<std::mutex> lk(routing_mutex_);
00069 std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
00070 return routing_table_copy;
00071 }
00072
00073 std::map<Fragment::sequence_id_t, int> GetAndClearRoutingTable()
00074 {
00075 std::unique_lock<std::mutex> lk(routing_mutex_);
00076 std::map<Fragment::sequence_id_t, int> routing_table_copy(routing_table_);
00077 routing_table_.clear();
00078 return routing_table_copy;
00079 }
00080
00081 hostMap_t GetHostMap() { return host_map_; }
00082
00083 private:
00084 void setupTableListener_()
00085 {
00086 int sts;
00087 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00088 if (table_socket_ < 0)
00089 {
00090 TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
00091 exit(1);
00092 }
00093
00094 struct sockaddr_in si_me_request;
00095
00096 int yes = 1;
00097 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00098 {
00099 TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
00100 exit(1);
00101 }
00102 memset(&si_me_request, 0, sizeof(si_me_request));
00103 si_me_request.sin_family = AF_INET;
00104 si_me_request.sin_port = htons(table_port_);
00105
00106 struct in_addr in_addr_s;
00107 sts = inet_aton(table_address_.c_str(), &in_addr_s);
00108 if (sts == 0)
00109 {
00110 TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
00111 }
00112 si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
00113 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00114 {
00115 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
00116 exit(1);
00117 }
00118
00119 struct ip_mreq mreq;
00120 sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
00121 if (sts == -1)
00122 {
00123 TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
00124 exit(1);
00125 }
00126 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00127 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00128 {
00129 TLOG(TLVL_ERROR) << "Unable to join multicast group";
00130 exit(1);
00131 }
00132 }
00133 void startTableReceiverThread_()
00134 {
00135 if (routing_thread_.joinable()) routing_thread_.join();
00136 TLOG(TLVL_INFO) << "Starting Routing Thread";
00137 try {
00138 routing_thread_ = boost::thread(&RoutingReceiver::receiveTableUpdatesLoop_, this);
00139 }
00140 catch (const boost::exception& e)
00141 {
00142 TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00143 std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00144 exit(5);
00145 }
00146 }
00147 void receiveTableUpdatesLoop_()
00148 {
00149 while (true)
00150 {
00151 if (should_stop_)
00152 {
00153 TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
00154 return;
00155 }
00156
00157 TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
00158 if (table_socket_ == -1)
00159 {
00160 TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
00161 setupTableListener_();
00162 }
00163 if (table_socket_ == -1)
00164 {
00165 TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
00166 return;
00167 }
00168
00169 struct pollfd fd;
00170 fd.fd = table_socket_;
00171 fd.events = POLLIN | POLLPRI;
00172
00173 auto res = poll(&fd, 1, 1000);
00174 if (res > 0)
00175 {
00176 auto first = artdaq::Fragment::InvalidSequenceID;
00177 auto last = artdaq::Fragment::InvalidSequenceID;
00178 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
00179 artdaq::detail::RoutingPacketHeader hdr;
00180
00181 TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
00182 struct sockaddr_in from;
00183 socklen_t len = sizeof(from);
00184 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len);
00185 TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
00186
00187 if (stss > static_cast<ssize_t>(sizeof(hdr)))
00188 {
00189 memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
00190 }
00191 else
00192 {
00193 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
00194 continue;
00195 }
00196
00197 TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]);
00198 if (hdr.header != ROUTING_MAGIC)
00199 {
00200 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
00201 }
00202 else
00203 {
00204 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00205 assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00206 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00207 TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]);
00208
00209 first = buffer[0].sequence_id;
00210 last = buffer[buffer.size() - 1].sequence_id;
00211
00212 if (first + hdr.nEntries - 1 != last)
00213 {
00214 TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
00215 continue;
00216 }
00217 auto thisSeqID = first;
00218
00219 {
00220 std::unique_lock<std::mutex> lck(routing_mutex_);
00221 if (routing_table_.count(last) == 0)
00222 {
00223 for (auto entry : buffer)
00224 {
00225 if (thisSeqID != entry.sequence_id)
00226 {
00227 TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
00228 last = thisSeqID - 1;
00229 break;
00230 }
00231 thisSeqID++;
00232 if (routing_table_.count(entry.sequence_id))
00233 {
00234 if (routing_table_[entry.sequence_id] != entry.destination_rank)
00235 {
00236 TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
00237 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
00238 << " I will use the original value!";
00239 }
00240 continue;
00241 }
00242 if (entry.sequence_id < routing_table_last_) continue;
00243 routing_table_[entry.sequence_id] = entry.destination_rank;
00244 TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
00245 << " -> Rank " << entry.destination_rank;
00246 }
00247 }
00248
00249 TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
00250 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
00251
00252 auto counter = 0;
00253 for (auto& entry : routing_table_)
00254 {
00255 TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
00256 counter++;
00257 }
00258 }
00259
00260 if (last > routing_table_last_) routing_table_last_ = last;
00261 }
00262 }
00263 }
00264 }
00265
00266 private:
00267 bool use_routing_master_;
00268 std::atomic<bool> should_stop_;
00269 int table_port_;
00270 std::string table_address_;
00271 int table_socket_;
00272 std::map<Fragment::sequence_id_t, int> routing_table_;
00273 Fragment::sequence_id_t routing_table_last_;
00274 mutable std::mutex routing_mutex_;
00275 boost::thread routing_thread_;
00276 hostMap_t host_map_;
00277 };
00278 }
00279
00280
00281 static bool sighandler_init = false;
00282 static bool should_stop = false;
00283 static void signal_handler(int signum)
00284 {
00285
00286 TRACE_STREAMER(TLVL_ERROR, &("routingReceiver")[0], 0, 0, 0) << "A signal of type " << signum << " was caught by routingReceiver. Stopping receive loop!";
00287
00288 should_stop = true;
00289
00290 sigset_t set;
00291 pthread_sigmask(SIG_UNBLOCK, NULL, &set);
00292 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
00293
00294 }
00295
00296 int main(int argc, char* argv[])
00297 {
00298 artdaq::configureMessageFacility("RoutingReceiver", false, false);
00299 static std::mutex sighandler_mutex;
00300 std::unique_lock<std::mutex> lk(sighandler_mutex);
00301
00302 if (!sighandler_init)
00303 {
00304 sighandler_init = true;
00305 std::vector<int> signals = { SIGINT, SIGTERM, SIGUSR1, SIGUSR2 };
00306 for (auto signal : signals)
00307 {
00308 struct sigaction old_action;
00309 sigaction(signal, NULL, &old_action);
00310
00311
00312
00313 if (old_action.sa_handler != SIG_IGN)
00314 {
00315 struct sigaction action;
00316 action.sa_handler = signal_handler;
00317 sigemptyset(&action.sa_mask);
00318 for (auto sigblk : signals)
00319 {
00320 sigaddset(&action.sa_mask, sigblk);
00321 }
00322 action.sa_flags = 0;
00323
00324
00325 sigaction(signal, &action, NULL);
00326 }
00327 }
00328 }
00329
00330 fhicl::ParameterSet init_ps = LoadParameterSet<artdaq::RoutingReceiver::Config>(argc, argv, "routingReceiver", "This application receives Routing Tables, and calculates statistics about the usage of the receivers");
00331 fhicl::ParameterSet config_ps = init_ps.get<fhicl::ParameterSet>("daq", init_ps);
00332 fhicl::ParameterSet metric_ps = config_ps.get<fhicl::ParameterSet>("metrics", config_ps);
00333 fhicl::ParameterSet fr_ps = config_ps.get<fhicl::ParameterSet>("fragment_receiver", config_ps);
00334
00335 artdaq::RoutingReceiver rr(fr_ps);
00336
00337 auto host_map = rr.GetHostMap();
00338
00339 size_t collection_time_ms = init_ps.get<size_t>("collection_time_ms", 1000);
00340 size_t max_graph_width = init_ps.get<size_t>("max_graph_width", 100);
00341 bool print_verbose = init_ps.get<bool>("print_verbose_info", true);
00342 bool verbose_clear_screen = init_ps.get<bool>("clear_screen", true);
00343
00344 auto blue = "\033[34m";
00345 auto cyan = "\033[36m";
00346 auto green = "\033[32m";
00347 auto yellow = "\033[93m";
00348 auto red = "\033[31m";
00349
00350 metricMan->initialize(metric_ps, "RoutingReceiver");
00351 metricMan->do_start();
00352 if (print_verbose && verbose_clear_screen) std::cout << "\033[2J";
00353
00354 std::map<int, int> receiver_table = std::map<int,int>();
00355
00356 while (!should_stop)
00357 {
00358 auto start_time = std::chrono::steady_clock::now();
00359
00360 auto this_table = rr.GetAndClearRoutingTable();
00361
00362 if (this_table.size() > 0)
00363 {
00364 auto graph_width = this_table.size();
00365 auto n = 1;
00366 auto graph_width_orig = graph_width;
00367 while (graph_width > max_graph_width)
00368 {
00369 n++;
00370 graph_width = graph_width_orig / n;
00371 }
00372
00373 for (auto& entry : this_table)
00374 {
00375 receiver_table[entry.second]++;
00376 }
00377
00378 auto average_entries_per_receiver = this_table.size() / receiver_table.size();
00379 auto offset = 2 * n;
00380
00381 auto cyan_threshold = ((average_entries_per_receiver - offset) / 2) / n;
00382 auto green_threshold = (average_entries_per_receiver - offset) / n;
00383 auto yellow_threshold = (average_entries_per_receiver + offset) / n;
00384 auto red_threshold = (2 * average_entries_per_receiver) / n;
00385
00386 TLOG(TLVL_TRACE) << "CT: " << cyan_threshold << ", GT: " << green_threshold << ", YT: " << yellow_threshold << ", RT: " << red_threshold;
00387
00388 std::ostringstream report;
00389 std::ostringstream verbose_report;
00390
00391 if (print_verbose && verbose_clear_screen) std::cout << "\033[;H\033[J";
00392
00393 report << artdaq::TimeUtils::gettimeofday_us() << ": " << this_table.size() << " Entries, ";
00394 for (auto& receiver : receiver_table)
00395 {
00396 auto percent = static_cast<int>(receiver.second * 100 / this_table.size());
00397 report << receiver.first << ": " << receiver.second << " (" << percent << "%), ";
00398 if (print_verbose)
00399 {
00400 verbose_report << receiver.first << ": " << receiver.second << " (" << percent << "%)\t[";
00401
00402 size_t graph_characters = receiver.second / n;
00403
00404 for (size_t ii = 0; ii < graph_characters; ++ii)
00405 {
00406 if (ii < cyan_threshold)
00407 {
00408 verbose_report << blue;
00409 }
00410 else if (ii < green_threshold)
00411 {
00412 verbose_report << cyan;
00413 }
00414 else if (ii < yellow_threshold)
00415 {
00416 verbose_report << green;
00417 }
00418 else if (ii < red_threshold)
00419 {
00420 verbose_report << yellow;
00421 }
00422 else
00423 {
00424 verbose_report << red;
00425 }
00426 verbose_report << "|";
00427 }
00428 std::string spaces = std::string(graph_width - graph_characters, ' ');
00429 verbose_report << "\033[0m" << spaces << "]" << std::endl;
00430 }
00431 receiver.second = 0;
00432 }
00433 TLOG(TLVL_INFO) << report.str();
00434 std::cout << report.str() << std::endl;
00435 if(print_verbose) std::cout << verbose_report.str() << std::endl;
00436 }
00437 std::this_thread::sleep_until(start_time + std::chrono::milliseconds(collection_time_ms));
00438 }
00439
00440 metricMan->do_stop();
00441 artdaq::Globals::CleanUpGlobals();
00442 }