#include "artdaq/DAQrate/DataSenderManager.hh"
#include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
#include "artdaq/DAQdata/Globals.hh"

#include <chrono>
#include <canvas/Utilities/Exception.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include "artdaq/Application/Routing/RoutingPacket.hh"
#include <artdaq/DAQdata/TCPConnect.hh>
00014
00015 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
00016 : destinations_()
00017 , enabled_destinations_()
00018 , sent_frag_count_()
00019 , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
00020 , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
00021 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
00022 , should_stop_(false)
00023 , ack_socket_(-1)
00024 , table_socket_(-1)
00025 {
00026 TLOG_DEBUG("DataSenderManager") << "Received pset: " << pset.to_string() << TLOG_ENDL;
00027 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
00028 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
00029 table_port_ = rmConfig.get<int>("table_update_port", 35556);
00030 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
00031 ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
00032 ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
00033 routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
00034
00035
00036 auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
00037 for (auto& d : dests.get_pset_names())
00038 {
00039 try
00040 {
00041 auto dd = dests.get<fhicl::ParameterSet>(d).get<int>("destination_rank");
00042 destinations_.emplace(dd, MakeTransferPlugin(dests, d, TransferInterface::Role::kSend));
00043 }
00044 catch (std::invalid_argument)
00045 {
00046 TRACE(3, "Invalid destination specification: " + d);
00047 }
00048 catch (cet::exception ex)
00049 {
00050 TLOG_WARNING("DataSenderManager") << "Caught cet::exception: " << ex.what() << TLOG_ENDL;
00051 }
00052 catch (...)
00053 {
00054 TLOG_WARNING("DataSenderManager") << "Non-cet exception while setting up TransferPlugin: " << d << "." << TLOG_ENDL;
00055 }
00056 }
00057 if (destinations_.size() == 0)
00058 {
00059 TLOG_ERROR("DataSenderManager") << "No destinations specified!" << TLOG_ENDL;
00060 }
00061 else
00062 {
00063 auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
00064 if (enabled_dests.size() == 0)
00065 {
00066 TLOG_INFO("DataSenderManager") << "enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
00067 for (auto& d : destinations_)
00068 {
00069 enabled_destinations_.insert(d.first);
00070 }
00071 }
00072 else
00073 {
00074 for (auto& d : enabled_dests)
00075 {
00076 enabled_destinations_.insert(d);
00077 }
00078 }
00079 }
00080 if (use_routing_master_) startTableReceiverThread_();
00081 }
00082
00083 artdaq::DataSenderManager::~DataSenderManager()
00084 {
00085 TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
00086 should_stop_ = true;
00087 for (auto& dest : enabled_destinations_)
00088 {
00089 if (destinations_.count(dest))
00090 {
00091 destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
00092
00093 }
00094 }
00095 if (routing_thread_.joinable()) routing_thread_.join();
00096 TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager END. Sent " << count() << " fragments." << TLOG_ENDL;
00097 }
00098
00099
00100 void artdaq::DataSenderManager::setupTableListener_()
00101 {
00102 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00103 if (!table_socket_)
00104 {
00105 TLOG_ERROR("DataSenderManager") << "Error creating socket for receiving table updates!" << TLOG_ENDL;
00106 exit(1);
00107 }
00108
00109 struct sockaddr_in si_me_request;
00110
00111 int yes = 1;
00112 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00113 {
00114 TLOG_ERROR("DataSenderManager") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
00115 exit(1);
00116 }
00117 memset(&si_me_request, 0, sizeof(si_me_request));
00118 si_me_request.sin_family = AF_INET;
00119 si_me_request.sin_port = htons(table_port_);
00120 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00121 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00122 {
00123 TLOG_ERROR("DataSenderManager") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
00124 exit(1);
00125 }
00126
00127 struct ip_mreq mreq;
00128 int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
00129 if (sts == -1)
00130 {
00131 TLOG_ERROR("DataSenderManager") << "Unable to resolve multicast address for table updates" << TLOG_ENDL;
00132 exit(1);
00133 }
00134 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00135 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00136 {
00137 TLOG_ERROR("DataSenderManager") << "Unable to join multicast group" << TLOG_ENDL;
00138 exit(1);
00139 }
00140 }
00141 void artdaq::DataSenderManager::startTableReceiverThread_()
00142 {
00143 if (routing_thread_.joinable()) routing_thread_.join();
00144 TLOG_INFO("DataSenderManager") << "Starting Routing Thread" << TLOG_ENDL;
00145 routing_thread_ = std::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
00146 }
00147 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
00148 {
00149 while (true)
00150 {
00151 if (should_stop_)
00152 {
00153 TLOG_DEBUG("DataSenderManager") << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping" << TLOG_ENDL;
00154 return;
00155 }
00156
00157 TRACE(4, "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
00158 if (table_socket_ == -1)
00159 {
00160 TLOG_DEBUG("DataSenderManager") << "Opening table listener socket" << TLOG_ENDL;
00161 setupTableListener_();
00162 }
00163 if (table_socket_ == -1)
00164 {
00165 TLOG_DEBUG("DataSenderManager") << "The listen socket was not opened successfully." << TLOG_ENDL;
00166 return;
00167 }
00168 if (ack_socket_ == -1)
00169 {
00170 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00171 auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
00172 if (sts == -1)
00173 {
00174 TLOG_ERROR("DataSenderManager") << "Unable to resolve routing_master_address" << TLOG_ENDL;
00175 exit(1);
00176 }
00177 TLOG_DEBUG("DataSenderManager") << "Ack socket is fd " << ack_socket_ << TLOG_ENDL;
00178 }
00179
00180 struct pollfd fd;
00181 fd.fd = table_socket_;
00182 fd.events = POLLIN | POLLPRI;
00183
00184 auto res = poll(&fd, 1, 1000);
00185 if (res > 0) {
00186 auto first = artdaq::Fragment::InvalidSequenceID;
00187 auto last = artdaq::Fragment::InvalidSequenceID;
00188 artdaq::detail::RoutingPacketHeader hdr;
00189
00190 TLOG_DEBUG("DataSenderManager") << "Going to receive RoutingPacketHeader" << TLOG_ENDL;
00191 auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
00192 TLOG_DEBUG("DataSenderManager") << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
00193
00194 TLOG_DEBUG("DataSenderManager") << "Checking for valid header" << TLOG_ENDL;
00195 if (hdr.header == ROUTING_MAGIC) {
00196 if(routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
00197 {
00198 TLOG_ERROR("DataSenderManager") << "Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
00199 exit(1);
00200 }
00201 routing_master_mode_ = hdr.mode;
00202
00203 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00204 TLOG_DEBUG("DataSenderManager") << "Receiving data buffer" << TLOG_ENDL;
00205 auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
00206 assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00207 TRACE(6, "Received a packet of %zu bytes", sts);
00208
00209 first = buffer[0].sequence_id;
00210 last = buffer[buffer.size() - 1].sequence_id;
00211
00212 if (first + hdr.nEntries - 1 != last)
00213 {
00214 TLOG_ERROR("DataSenderManager") << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!" << TLOG_ENDL;
00215 continue;
00216 }
00217 auto thisSeqID = first;
00218
00219 if (routing_table_.count(last) == 0) {
00220 for (auto entry : buffer)
00221 {
00222 if (thisSeqID != entry.sequence_id)
00223 {
00224 TLOG_ERROR("DataSenderManager") << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!" << TLOG_ENDL;
00225 last = thisSeqID - 1;
00226 break;
00227 }
00228 thisSeqID++;
00229 if (routing_table_.count(entry.sequence_id))
00230 {
00231 if (routing_table_[entry.sequence_id] != entry.destination_rank)
00232 {
00233 TLOG_ERROR("DataSenderManager") << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
00234 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
00235 << " I will use the original value!" << TLOG_ENDL;
00236 }
00237 continue;
00238 }
00239 routing_table_[entry.sequence_id] = entry.destination_rank;
00240 TLOG_DEBUG("DataSenderManager") << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
00241 }
00242 }
00243
00244 artdaq::detail::RoutingAckPacket ack;
00245 ack.rank = my_rank;
00246 ack.first_sequence_id = first;
00247 ack.last_sequence_id = last;
00248
00249 TLOG_DEBUG("DataSenderManager") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")"<< TLOG_ENDL;
00250 TLOG_DEBUG("DataSenderManager") << "There are now " << routing_table_.size() << " entries in the Routing Table" << TLOG_ENDL;
00251 sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
00252 }
00253 }
00254 }
00255 }
00256
00257 size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const
00258 {
00259 std::unique_lock<std::mutex> lck(routing_mutex_);
00260 return routing_table_.size();
00261 }
00262
00263 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
00264 {
00265 if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT;
00266 if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin();
00267
00268 if (use_routing_master_)
00269 {
00270 auto start = std::chrono::steady_clock::now();
00271 while (routing_timeout_ms_ <= 0 || std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count() < routing_timeout_ms_)
00272 {
00273 std::unique_lock<std::mutex> lck(routing_mutex_);
00274 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
00275 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
00276 return routing_table_.at(sequence_id);
00277 }
00278 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
00279 {
00280 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
00281 return routing_table_.at(sent_frag_count_.count());
00282 }
00283 usleep(routing_timeout_ms_ * 10);
00284 }
00285 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
00286 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
00287 TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
00288 << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
00289 }
00290 else
00291 {
00292 TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
00293 << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
00294 }
00295 }
00296 else {
00297 auto index = sequence_id % enabled_destinations_.size();
00298 auto it = enabled_destinations_.begin();
00299 for (; index > 0; --index)
00300 {
00301 ++it;
00302 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
00303 }
00304 return *it;
00305 }
00306 return TransferInterface::RECV_TIMEOUT;
00307 }
00308
00309 int
00310 artdaq::DataSenderManager::
00311 sendFragment(Fragment&& frag)
00312 {
00313
00314
00315 auto start_time = std::chrono::steady_clock::now();
00316 if (frag.type() == Fragment::EndOfDataFragmentType)
00317 {
00318 throw cet::exception("LogicError")
00319 << "EOD fragments should not be sent on as received: "
00320 << "use sendEODFrag() instead.";
00321 }
00322 size_t seqID = frag.sequenceID();
00323 size_t fragSize = frag.sizeBytes();
00324 TLOG_ARB(13, "DataSenderManager") << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize) << ", seqID=" << std::to_string(seqID) << TLOG_ENDL;
00325 int dest = TransferInterface::RECV_TIMEOUT;
00326 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
00327 {
00328 for (auto& bdest : enabled_destinations_)
00329 {
00330 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d (broadcast)", seqID, bdest);
00331
00332 Fragment fragCopy(frag);
00333 auto sts = destinations_[bdest]->copyFragment(fragCopy);
00334 while (sts == TransferInterface::CopyStatus::kTimeout)
00335 {
00336 sts = destinations_[bdest]->copyFragment(fragCopy);
00337 }
00338 sent_frag_count_.incSlot(bdest);
00339 }
00340 }
00341 else if (non_blocking_mode_)
00342 {
00343 while (dest == TransferInterface::RECV_TIMEOUT) {
00344 dest = calcDest_(seqID);
00345 if (dest == TransferInterface::RECV_TIMEOUT)
00346 {
00347 TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", retrying." << TLOG_ENDL;
00348 }
00349 }
00350 if (destinations_.count(dest) && enabled_destinations_.count(dest))
00351 {
00352 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
00353 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00354 auto lastWarnTime = std::chrono::steady_clock::now();
00355 while (sts != TransferInterface::CopyStatus::kSuccess)
00356 {
00357 sts = destinations_[dest]->copyFragment(frag);
00358 if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
00359 {
00360 TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
00361 lastWarnTime = std::chrono::steady_clock::now();
00362 }
00363 }
00364
00365 sent_frag_count_.incSlot(dest);
00366 }
00367 else
00368 {
00369 TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
00370 }
00371 }
00372 else
00373 {
00374 while (dest == TransferInterface::RECV_TIMEOUT) {
00375 dest = calcDest_(seqID);
00376 if (dest == TransferInterface::RECV_TIMEOUT)
00377 {
00378 TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", send number " << sent_frag_count_.count() << ", retrying." << TLOG_ENDL;
00379 }
00380 }
00381 if (destinations_.count(dest) && enabled_destinations_.count(dest))
00382 {
00383 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
00384 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00385 auto lastWarnTime = std::chrono::steady_clock::now();
00386 while (sts != TransferInterface::CopyStatus::kSuccess)
00387 {
00388 sts = destinations_[dest]->moveFragment(std::move(frag));
00389 if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
00390 {
00391 TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
00392 lastWarnTime = std::chrono::steady_clock::now();
00393 }
00394 }
00395
00396 sent_frag_count_.incSlot(dest);
00397 }
00398 else
00399 {
00400 TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
00401 }
00402 }
00403 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID - 1) != routing_table_.end())
00404 {
00405 std::unique_lock<std::mutex> lck(routing_mutex_);
00406 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
00407 }
00408 else if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
00409 {
00410 std::unique_lock<std::mutex> lck(routing_mutex_);
00411 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
00412 }
00413 if (metricMan)
00414 {
00415 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
00416 metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 1);
00417 metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 1);
00418 metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 1);
00419 if (use_routing_master_) {
00420 metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 1);
00421 if (routing_wait_time_ > 0)
00422 {
00423 size_t wttemp = routing_wait_time_;
00424 routing_wait_time_ = 0;
00425 metricMan->sendMetric("Routing Wait Time", wttemp / 1000000000, "s", 1);
00426 }
00427 }
00428 }
00429 TRACE(5, "DataSenderManager::sendFragment: Done sending fragment %zu", seqID);
00430 return dest;
00431 }