#include "artdaq/DAQrate/DataSenderManager.hh"
#include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
#include "artdaq/DAQdata/Globals.hh"

#include <chrono>
#include <iterator>
#include "canvas/Utilities/Exception.h"
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include "artdaq/Application/Routing/RoutingPacket.hh"
#include "artdaq/DAQdata/TCPConnect.hh"
00014
00015 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
00016 : destinations_()
00017 , enabled_destinations_()
00018 , sent_frag_count_()
00019 , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
00020 , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
00021 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
00022 , should_stop_(false)
00023 , ack_socket_(-1)
00024 , table_socket_(-1)
00025 {
00026 TLOG_DEBUG("DataSenderManager") << "Received pset: " << pset.to_string() << TLOG_ENDL;
00027 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
00028 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
00029 table_port_ = rmConfig.get<int>("table_update_port", 35556);
00030 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
00031 ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
00032 ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
00033 routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
00034
00035
00036 auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
00037 for (auto& d : dests.get_pset_names())
00038 {
00039 try
00040 {
00041 auto transfer = MakeTransferPlugin(dests, d, TransferInterface::Role::kSend);
00042 auto destination_rank = transfer->destination_rank();
00043 destinations_.emplace( destination_rank, std::move(transfer));
00044 }
00045 catch (std::invalid_argument)
00046 {
00047 TRACE(3, "Invalid destination specification: " + d);
00048 }
00049 catch (cet::exception ex)
00050 {
00051 TLOG_WARNING("DataSenderManager") << "Caught cet::exception: " << ex.what() << TLOG_ENDL;
00052 }
00053 catch (...)
00054 {
00055 TLOG_WARNING("DataSenderManager") << "Non-cet exception while setting up TransferPlugin: " << d << "." << TLOG_ENDL;
00056 }
00057 }
00058 if (destinations_.size() == 0)
00059 {
00060 TLOG_ERROR("DataSenderManager") << "No destinations specified!" << TLOG_ENDL;
00061 }
00062 else
00063 {
00064 auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
00065 if (enabled_dests.size() == 0)
00066 {
00067 TLOG_INFO("DataSenderManager") << "enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
00068 for (auto& d : destinations_)
00069 {
00070 enabled_destinations_.insert(d.first);
00071 }
00072 }
00073 else
00074 {
00075 for (auto& d : enabled_dests)
00076 {
00077 enabled_destinations_.insert(d);
00078 }
00079 }
00080 }
00081 if (use_routing_master_) startTableReceiverThread_();
00082 }
00083
00084 artdaq::DataSenderManager::~DataSenderManager()
00085 {
00086 TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
00087 should_stop_ = true;
00088 for (auto& dest : enabled_destinations_)
00089 {
00090 if (destinations_.count(dest))
00091 {
00092 destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
00093
00094 }
00095 }
00096 if (routing_thread_.joinable()) routing_thread_.join();
00097 TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager END. Sent " << count() << " fragments." << TLOG_ENDL;
00098 }
00099
00100
00101 void artdaq::DataSenderManager::setupTableListener_()
00102 {
00103 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00104 if (!table_socket_)
00105 {
00106 TLOG_ERROR("DataSenderManager") << "Error creating socket for receiving table updates!" << TLOG_ENDL;
00107 exit(1);
00108 }
00109
00110 struct sockaddr_in si_me_request;
00111
00112 int yes = 1;
00113 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00114 {
00115 TLOG_ERROR("DataSenderManager") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
00116 exit(1);
00117 }
00118 memset(&si_me_request, 0, sizeof(si_me_request));
00119 si_me_request.sin_family = AF_INET;
00120 si_me_request.sin_port = htons(table_port_);
00121 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00122 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00123 {
00124 TLOG_ERROR("DataSenderManager") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
00125 exit(1);
00126 }
00127
00128 struct ip_mreq mreq;
00129 int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
00130 if (sts == -1)
00131 {
00132 TLOG_ERROR("DataSenderManager") << "Unable to resolve multicast address for table updates" << TLOG_ENDL;
00133 exit(1);
00134 }
00135 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00136 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00137 {
00138 TLOG_ERROR("DataSenderManager") << "Unable to join multicast group" << TLOG_ENDL;
00139 exit(1);
00140 }
00141 }
00142 void artdaq::DataSenderManager::startTableReceiverThread_()
00143 {
00144 if (routing_thread_.joinable()) routing_thread_.join();
00145 TLOG_INFO("DataSenderManager") << "Starting Routing Thread" << TLOG_ENDL;
00146 routing_thread_ = std::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
00147 }
00148 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
00149 {
00150 while (true)
00151 {
00152 if (should_stop_)
00153 {
00154 TLOG_DEBUG("DataSenderManager") << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping" << TLOG_ENDL;
00155 return;
00156 }
00157
00158 TRACE(4, "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
00159 if (table_socket_ == -1)
00160 {
00161 TLOG_DEBUG("DataSenderManager") << "Opening table listener socket" << TLOG_ENDL;
00162 setupTableListener_();
00163 }
00164 if (table_socket_ == -1)
00165 {
00166 TLOG_DEBUG("DataSenderManager") << "The listen socket was not opened successfully." << TLOG_ENDL;
00167 return;
00168 }
00169 if (ack_socket_ == -1)
00170 {
00171 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00172 auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
00173 if (sts == -1)
00174 {
00175 TLOG_ERROR("DataSenderManager") << "Unable to resolve routing_master_address" << TLOG_ENDL;
00176 exit(1);
00177 }
00178 TLOG_DEBUG("DataSenderManager") << "Ack socket is fd " << ack_socket_ << TLOG_ENDL;
00179 }
00180
00181 struct pollfd fd;
00182 fd.fd = table_socket_;
00183 fd.events = POLLIN | POLLPRI;
00184
00185 auto res = poll(&fd, 1, 1000);
00186 if (res > 0) {
00187 auto first = artdaq::Fragment::InvalidSequenceID;
00188 auto last = artdaq::Fragment::InvalidSequenceID;
00189 artdaq::detail::RoutingPacketHeader hdr;
00190
00191 TLOG_DEBUG("DataSenderManager") << "Going to receive RoutingPacketHeader" << TLOG_ENDL;
00192 auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
00193 TLOG_DEBUG("DataSenderManager") << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
00194
00195 TLOG_DEBUG("DataSenderManager") << "Checking for valid header" << TLOG_ENDL;
00196 if (hdr.header == ROUTING_MAGIC) {
00197 if(routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
00198 {
00199 TLOG_ERROR("DataSenderManager") << "Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
00200 exit(1);
00201 }
00202 routing_master_mode_ = hdr.mode;
00203
00204 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00205 TLOG_DEBUG("DataSenderManager") << "Receiving data buffer" << TLOG_ENDL;
00206 auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
00207 assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00208 TRACE(6, "Received a packet of %zu bytes", sts);
00209
00210 first = buffer[0].sequence_id;
00211 last = buffer[buffer.size() - 1].sequence_id;
00212
00213 if (first + hdr.nEntries - 1 != last)
00214 {
00215 TLOG_ERROR("DataSenderManager") << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!" << TLOG_ENDL;
00216 continue;
00217 }
00218 auto thisSeqID = first;
00219
00220 if (routing_table_.count(last) == 0) {
00221 for (auto entry : buffer)
00222 {
00223 if (thisSeqID != entry.sequence_id)
00224 {
00225 TLOG_ERROR("DataSenderManager") << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!" << TLOG_ENDL;
00226 last = thisSeqID - 1;
00227 break;
00228 }
00229 thisSeqID++;
00230 if (routing_table_.count(entry.sequence_id))
00231 {
00232 if (routing_table_[entry.sequence_id] != entry.destination_rank)
00233 {
00234 TLOG_ERROR("DataSenderManager") << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
00235 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
00236 << " I will use the original value!" << TLOG_ENDL;
00237 }
00238 continue;
00239 }
00240 routing_table_[entry.sequence_id] = entry.destination_rank;
00241 TLOG_DEBUG("DataSenderManager") << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
00242 }
00243 }
00244
00245 artdaq::detail::RoutingAckPacket ack;
00246 ack.rank = my_rank;
00247 ack.first_sequence_id = first;
00248 ack.last_sequence_id = last;
00249
00250 TLOG_DEBUG("DataSenderManager") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")"<< TLOG_ENDL;
00251 TLOG_DEBUG("DataSenderManager") << "There are now " << routing_table_.size() << " entries in the Routing Table" << TLOG_ENDL;
00252 sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
00253 }
00254 }
00255 }
00256 }
00257
00258 size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const
00259 {
00260 std::unique_lock<std::mutex> lck(routing_mutex_);
00261 return routing_table_.size();
00262 }
00263
00264 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
00265 {
00266 if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT;
00267 if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin();
00268
00269 if (use_routing_master_)
00270 {
00271 auto start = std::chrono::steady_clock::now();
00272 while (routing_timeout_ms_ <= 0 || std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count() < routing_timeout_ms_)
00273 {
00274 std::unique_lock<std::mutex> lck(routing_mutex_);
00275 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
00276 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
00277 return routing_table_.at(sequence_id);
00278 }
00279 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
00280 {
00281 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
00282 return routing_table_.at(sent_frag_count_.count());
00283 }
00284 usleep(routing_timeout_ms_ * 10);
00285 }
00286 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
00287 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
00288 TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
00289 << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
00290 }
00291 else
00292 {
00293 TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
00294 << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
00295 }
00296 }
00297 else {
00298 auto index = sequence_id % enabled_destinations_.size();
00299 auto it = enabled_destinations_.begin();
00300 for (; index > 0; --index)
00301 {
00302 ++it;
00303 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
00304 }
00305 return *it;
00306 }
00307 return TransferInterface::RECV_TIMEOUT;
00308 }
00309
00310 int
00311 artdaq::DataSenderManager::
00312 sendFragment(Fragment&& frag)
00313 {
00314
00315
00316 auto start_time = std::chrono::steady_clock::now();
00317 if (frag.type() == Fragment::EndOfDataFragmentType)
00318 {
00319 throw cet::exception("LogicError")
00320 << "EOD fragments should not be sent on as received: "
00321 << "use sendEODFrag() instead.";
00322 }
00323 size_t seqID = frag.sequenceID();
00324 size_t fragSize = frag.sizeBytes();
00325 TLOG_ARB(13, "DataSenderManager") << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize) << ", seqID=" << std::to_string(seqID) << TLOG_ENDL;
00326 int dest = TransferInterface::RECV_TIMEOUT;
00327 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
00328 {
00329 for (auto& bdest : enabled_destinations_)
00330 {
00331 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d (broadcast)", seqID, bdest);
00332
00333 Fragment fragCopy(frag);
00334 auto sts = destinations_[bdest]->copyFragment(fragCopy);
00335 while (sts == TransferInterface::CopyStatus::kTimeout)
00336 {
00337 sts = destinations_[bdest]->copyFragment(fragCopy);
00338 }
00339 sent_frag_count_.incSlot(bdest);
00340 }
00341 }
00342 else if (non_blocking_mode_)
00343 {
00344 while (dest == TransferInterface::RECV_TIMEOUT) {
00345 dest = calcDest_(seqID);
00346 if (dest == TransferInterface::RECV_TIMEOUT)
00347 {
00348 TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", retrying." << TLOG_ENDL;
00349 }
00350 }
00351 if (destinations_.count(dest) && enabled_destinations_.count(dest))
00352 {
00353 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
00354 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00355 auto lastWarnTime = std::chrono::steady_clock::now();
00356 while (sts != TransferInterface::CopyStatus::kSuccess)
00357 {
00358 sts = destinations_[dest]->copyFragment(frag);
00359 if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
00360 {
00361 TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
00362 lastWarnTime = std::chrono::steady_clock::now();
00363 }
00364 }
00365
00366 sent_frag_count_.incSlot(dest);
00367 }
00368 else
00369 {
00370 TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
00371 }
00372 }
00373 else
00374 {
00375 while (dest == TransferInterface::RECV_TIMEOUT) {
00376 dest = calcDest_(seqID);
00377 if (dest == TransferInterface::RECV_TIMEOUT)
00378 {
00379 TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", send number " << sent_frag_count_.count() << ", retrying." << TLOG_ENDL;
00380 }
00381 }
00382 if (destinations_.count(dest) && enabled_destinations_.count(dest))
00383 {
00384 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
00385 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00386 auto lastWarnTime = std::chrono::steady_clock::now();
00387 while (sts != TransferInterface::CopyStatus::kSuccess)
00388 {
00389 sts = destinations_[dest]->moveFragment(std::move(frag));
00390 if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
00391 {
00392 TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
00393 lastWarnTime = std::chrono::steady_clock::now();
00394 }
00395 }
00396
00397 sent_frag_count_.incSlot(dest);
00398 }
00399 else
00400 {
00401 TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
00402 }
00403 }
00404 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID - 1) != routing_table_.end())
00405 {
00406 std::unique_lock<std::mutex> lck(routing_mutex_);
00407 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
00408 }
00409 else if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00410 {
00411 std::unique_lock<std::mutex> lck(routing_mutex_);
00412 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
00413 }
00414 if (metricMan)
00415 {
00416 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
00417 metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 1);
00418 metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 1);
00419 metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 1);
00420 if (use_routing_master_) {
00421 metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 1);
00422 if (routing_wait_time_ > 0)
00423 {
00424 size_t wttemp = routing_wait_time_;
00425 routing_wait_time_ = 0;
00426 metricMan->sendMetric("Routing Wait Time", wttemp / 1000000000, "s", 1);
00427 }
00428 }
00429 }
00430 TRACE(5, "DataSenderManager::sendFragment: Done sending fragment %zu", seqID);
00431 return dest;
00432 }