00001 #define TRACE_NAME "DataSenderManager"
00002 #include "artdaq/DAQdata/Globals.hh"
00003 #include "artdaq/DAQrate/DataSenderManager.hh"
00004 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
00005
#include <chrono>
#include "canvas/Utilities/Exception.h"
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include "artdaq/DAQdata/TCPConnect.hh"
00014
00015 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
00016 : destinations_()
00017 , enabled_destinations_()
00018 , sent_frag_count_()
00019 , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
00020 , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
00021 , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 0))
00022 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
00023 , should_stop_(false)
00024 , ack_socket_(-1)
00025 , table_socket_(-1)
00026 {
00027 TLOG_DEBUG("DataSenderManager") << "Received pset: " << pset.to_string() << TLOG_ENDL;
00028 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
00029 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
00030 table_port_ = rmConfig.get<int>("table_update_port", 35556);
00031 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
00032 ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
00033 ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
00034 routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
00035 routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
00036
00037
00038 auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
00039 for (auto& d : dests.get_pset_names())
00040 {
00041 try
00042 {
00043 auto transfer = MakeTransferPlugin(dests, d, TransferInterface::Role::kSend);
00044 auto destination_rank = transfer->destination_rank();
00045 destinations_.emplace(destination_rank, std::move(transfer));
00046 }
00047 catch (std::invalid_argument)
00048 {
00049 TRACE(3, "Invalid destination specification: " + d);
00050 }
00051 catch (cet::exception ex)
00052 {
00053 TLOG_WARNING("DataSenderManager") << "Caught cet::exception: " << ex.what() << TLOG_ENDL;
00054 }
00055 catch (...)
00056 {
00057 TLOG_WARNING("DataSenderManager") << "Non-cet exception while setting up TransferPlugin: " << d << "." << TLOG_ENDL;
00058 }
00059 }
00060 if (destinations_.size() == 0)
00061 {
00062 TLOG_ERROR("DataSenderManager") << "No destinations specified!" << TLOG_ENDL;
00063 }
00064 else
00065 {
00066 auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
00067 if (enabled_dests.size() == 0)
00068 {
00069 TLOG_INFO("DataSenderManager") << "enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
00070 for (auto& d : destinations_)
00071 {
00072 enabled_destinations_.insert(d.first);
00073 }
00074 }
00075 else
00076 {
00077 for (auto& d : enabled_dests)
00078 {
00079 enabled_destinations_.insert(d);
00080 }
00081 }
00082 }
00083 if (use_routing_master_) startTableReceiverThread_();
00084 }
00085
00086 artdaq::DataSenderManager::~DataSenderManager()
00087 {
00088 TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
00089 should_stop_ = true;
00090 for (auto& dest : enabled_destinations_)
00091 {
00092 if (destinations_.count(dest))
00093 {
00094 auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
00095 if (sts != TransferInterface::CopyStatus::kSuccess) TLOG_ERROR("DataSenderManager") << "Error sending EOD Fragment to sender rank " << dest << TLOG_ENDL;
00096
00097 }
00098 }
00099 if (routing_thread_.joinable()) routing_thread_.join();
00100 TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager END. Sent " << count() << " fragments." << TLOG_ENDL;
00101 }
00102
00103
00104 void artdaq::DataSenderManager::setupTableListener_()
00105 {
00106 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00107 if (table_socket_ < 0)
00108 {
00109 TLOG_ERROR("DataSenderManager") << "Error creating socket for receiving table updates!" << TLOG_ENDL;
00110 exit(1);
00111 }
00112
00113 struct sockaddr_in si_me_request;
00114
00115 int yes = 1;
00116 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00117 {
00118 TLOG_ERROR("DataSenderManager") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
00119 exit(1);
00120 }
00121 memset(&si_me_request, 0, sizeof(si_me_request));
00122 si_me_request.sin_family = AF_INET;
00123 si_me_request.sin_port = htons(table_port_);
00124 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00125 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00126 {
00127 TLOG_ERROR("DataSenderManager") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
00128 exit(1);
00129 }
00130
00131 struct ip_mreq mreq;
00132 int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
00133 if (sts == -1)
00134 {
00135 TLOG_ERROR("DataSenderManager") << "Unable to resolve multicast address for table updates" << TLOG_ENDL;
00136 exit(1);
00137 }
00138 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00139 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00140 {
00141 TLOG_ERROR("DataSenderManager") << "Unable to join multicast group" << TLOG_ENDL;
00142 exit(1);
00143 }
00144 }
00145 void artdaq::DataSenderManager::startTableReceiverThread_()
00146 {
00147 if (routing_thread_.joinable()) routing_thread_.join();
00148 TLOG_INFO("DataSenderManager") << "Starting Routing Thread" << TLOG_ENDL;
00149 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
00150 }
00151 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
00152 {
00153 while (true)
00154 {
00155 if (should_stop_)
00156 {
00157 TLOG_DEBUG("DataSenderManager") << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping" << TLOG_ENDL;
00158 return;
00159 }
00160
00161 TRACE(4, "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
00162 if (table_socket_ == -1)
00163 {
00164 TLOG_DEBUG("DataSenderManager") << "Opening table listener socket" << TLOG_ENDL;
00165 setupTableListener_();
00166 }
00167 if (table_socket_ == -1)
00168 {
00169 TLOG_DEBUG("DataSenderManager") << "The listen socket was not opened successfully." << TLOG_ENDL;
00170 return;
00171 }
00172 if (ack_socket_ == -1)
00173 {
00174 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00175 auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
00176 if (sts == -1)
00177 {
00178 TLOG_ERROR("DataSenderManager") << "Unable to resolve routing_master_address" << TLOG_ENDL;
00179 exit(1);
00180 }
00181 TLOG_DEBUG("DataSenderManager") << "Ack socket is fd " << ack_socket_ << TLOG_ENDL;
00182 }
00183
00184 struct pollfd fd;
00185 fd.fd = table_socket_;
00186 fd.events = POLLIN | POLLPRI;
00187
00188 auto res = poll(&fd, 1, 1000);
00189 if (res > 0) {
00190 auto first = artdaq::Fragment::InvalidSequenceID;
00191 auto last = artdaq::Fragment::InvalidSequenceID;
00192 artdaq::detail::RoutingPacketHeader hdr;
00193
00194 TLOG_DEBUG("DataSenderManager") << "Going to receive RoutingPacketHeader" << TLOG_ENDL;
00195 auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
00196 TLOG_DEBUG("DataSenderManager") << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
00197
00198 TLOG_DEBUG("DataSenderManager") << "Checking for valid header" << TLOG_ENDL;
00199 if (hdr.header == ROUTING_MAGIC) {
00200 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
00201 {
00202 TLOG_ERROR("DataSenderManager") << "Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
00203 exit(1);
00204 }
00205 routing_master_mode_ = hdr.mode;
00206
00207 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00208 TLOG_DEBUG("DataSenderManager") << "Receiving data buffer" << TLOG_ENDL;
00209 auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
00210 assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00211 TRACE(6, "Received a packet of %zu bytes", sts);
00212
00213 first = buffer[0].sequence_id;
00214 last = buffer[buffer.size() - 1].sequence_id;
00215
00216 if (first + hdr.nEntries - 1 != last)
00217 {
00218 TLOG_ERROR("DataSenderManager") << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!" << TLOG_ENDL;
00219 continue;
00220 }
00221 auto thisSeqID = first;
00222
00223 if (routing_table_.count(last) == 0) {
00224 for (auto entry : buffer)
00225 {
00226 if (thisSeqID != entry.sequence_id)
00227 {
00228 TLOG_ERROR("DataSenderManager") << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!" << TLOG_ENDL;
00229 last = thisSeqID - 1;
00230 break;
00231 }
00232 thisSeqID++;
00233 if (routing_table_.count(entry.sequence_id))
00234 {
00235 if (routing_table_[entry.sequence_id] != entry.destination_rank)
00236 {
00237 TLOG_ERROR("DataSenderManager") << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
00238 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
00239 << " I will use the original value!" << TLOG_ENDL;
00240 }
00241 continue;
00242 }
00243 routing_table_[entry.sequence_id] = entry.destination_rank;
00244 TLOG_DEBUG("DataSenderManager") << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
00245 }
00246 }
00247
00248 artdaq::detail::RoutingAckPacket ack;
00249 ack.rank = my_rank;
00250 ack.first_sequence_id = first;
00251 ack.last_sequence_id = last;
00252
00253 TLOG_DEBUG("DataSenderManager") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")" << TLOG_ENDL;
00254 TLOG_DEBUG("DataSenderManager") << "There are now " << routing_table_.size() << " entries in the Routing Table" << TLOG_ENDL;
00255 sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
00256 }
00257 }
00258 }
00259 }
00260
00261 size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const
00262 {
00263 std::unique_lock<std::mutex> lck(routing_mutex_);
00264 return routing_table_.size();
00265 }
00266
00267 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
00268 {
00269 if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT;
00270 if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin();
00271
00272 if (use_routing_master_)
00273 {
00274 auto start = std::chrono::steady_clock::now();
00275 while (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))
00276 {
00277 std::unique_lock<std::mutex> lck(routing_mutex_);
00278 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
00279 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00280 return routing_table_.at(sequence_id);
00281 }
00282 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
00283 {
00284 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00285 return routing_table_.at(sent_frag_count_.count());
00286 }
00287 usleep(routing_timeout_ms_ * 10);
00288 }
00289 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00290 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
00291 TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
00292 << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
00293 }
00294 else
00295 {
00296 TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
00297 << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
00298 }
00299 }
00300 else {
00301 auto index = sequence_id % enabled_destinations_.size();
00302 auto it = enabled_destinations_.begin();
00303 for (; index > 0; --index)
00304 {
00305 ++it;
00306 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
00307 }
00308 return *it;
00309 }
00310 return TransferInterface::RECV_TIMEOUT;
00311 }
00312
00313 int
00314 artdaq::DataSenderManager::
00315 sendFragment(Fragment&& frag)
00316 {
00317
00318
00319 auto start_time = std::chrono::steady_clock::now();
00320 if (frag.type() == Fragment::EndOfDataFragmentType)
00321 {
00322 throw cet::exception("LogicError")
00323 << "EOD fragments should not be sent on as received: "
00324 << "use sendEODFrag() instead.";
00325 }
00326 size_t seqID = frag.sequenceID();
00327 size_t fragSize = frag.sizeBytes();
00328 TLOG_ARB(13, "DataSenderManager") << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize)
00329 << ", seqID=" << std::to_string(seqID) << ", type=" << frag.typeString() << TLOG_ENDL;
00330 int dest = TransferInterface::RECV_TIMEOUT;
00331 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
00332 {
00333 for (auto& bdest : enabled_destinations_)
00334 {
00335 TLOG_TRACE("DataSenderManager") << "sendFragment: Sending fragment with seqId " << std::to_string(seqID) << " to destination " << bdest << " (broadcast)" << TLOG_ENDL;
00336
00337 Fragment fragCopy(frag);
00338 auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
00339 while (sts == TransferInterface::CopyStatus::kTimeout)
00340 {
00341 sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
00342 }
00343 sent_frag_count_.incSlot(bdest);
00344 }
00345 }
00346 else if (non_blocking_mode_)
00347 {
00348 auto count = routing_retry_count_;
00349 while (dest == TransferInterface::RECV_TIMEOUT && count > 0) {
00350 dest = calcDest_(seqID);
00351 if (dest == TransferInterface::RECV_TIMEOUT)
00352 {
00353 count--;
00354 TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << (count > 0 ? ", retrying." : ".") << TLOG_ENDL;
00355 }
00356 }
00357 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
00358 {
00359 TLOG_TRACE("DataSenderManager") << "sendFragment: Sending fragment with seqId " << std::to_string(seqID) << " to destination " << dest << TLOG_ENDL;
00360 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00361 auto lastWarnTime = std::chrono::steady_clock::now();
00362 while (sts != TransferInterface::CopyStatus::kSuccess)
00363 {
00364 sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
00365 if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
00366 {
00367 TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
00368 lastWarnTime = std::chrono::steady_clock::now();
00369 }
00370 }
00371
00372 sent_frag_count_.incSlot(dest);
00373 }
00374 else
00375 {
00376 TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
00377 }
00378 }
00379 else {
00380 auto count = routing_retry_count_;
00381 while (dest == TransferInterface::RECV_TIMEOUT && count > 0) {
00382 dest = calcDest_(seqID);
00383 if (dest == TransferInterface::RECV_TIMEOUT) {
00384 count--;
00385 TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID "
00386 << std::to_string(seqID) << ", send number " << sent_frag_count_.count()
00387 << (count > 0 ? ", retrying." : ".") << TLOG_ENDL;
00388 }
00389 }
00390 if(dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest)) {
00391 TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
00392 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00393
00394 sts = destinations_[dest]->moveFragment(std::move(frag), send_timeout_us_);
00395 if (sts != TransferInterface::CopyStatus::kSuccess)
00396 TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination "
00397 << dest << " failed! Data has been lost!" << TLOG_ENDL;
00398
00399
00400 sent_frag_count_.incSlot(dest);
00401 }
00402 else
00403 TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest
00404 << "! This event has been lost: " << seqID << TLOG_ENDL;
00405 }
00406 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID
00407 && routing_table_.find(seqID - 1) != routing_table_.end()) {
00408 std::unique_lock<std::mutex> lck(routing_mutex_);
00409 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
00410 }
00411 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount) {
00412 std::unique_lock<std::mutex> lck(routing_mutex_);
00413 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
00414 }
00415 if (metricMan) {
00416 TRACE(5, "sendFragment: sending metrics");
00417 auto delta_t = TimeUtils::GetElapsedTime(start_time);
00418 metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 3, MetricMode::Accumulate);
00419 metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 3, MetricMode::Accumulate);
00420 metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 3, MetricMode::Average);
00421 metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
00422 "fragments", 3, MetricMode::LastPoint);
00423 if (use_routing_master_) {
00424 metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 1, MetricMode::LastPoint);
00425 if (routing_wait_time_ > 0)
00426 metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 1,
00427 MetricMode::Average);
00428 }
00429 }
00430 TRACE(5, "sendFragment: Done sending fragment %zu", seqID);
00431 return dest;
00432 }