00001 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
#include "artdaq/DAQdata/Globals.hh"
#include "artdaq/DAQrate/DataSenderManager.hh"
#include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
#include "artdaq/TransferPlugins/detail/HostMap.hh"

#include <chrono>
#include "canvas/Utilities/Exception.h"
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include "artdaq/DAQdata/TCPConnect.hh"
00015
00016 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
00017 : destinations_()
00018 , destination_metric_data_()
00019 , destination_metric_send_time_()
00020 , enabled_destinations_()
00021 , sent_frag_count_()
00022 , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
00023 , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
00024 , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
00025 , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
00026 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
00027 , should_stop_(false)
00028 , ack_socket_(-1)
00029 , table_socket_(-1)
00030 , routing_table_last_(0)
00031 , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
00032 , highest_sequence_id_routed_(0)
00033 {
00034 TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
00035
00036
00037 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
00038
00039 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
00040 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
00041 table_port_ = rmConfig.get<int>("table_update_port", 35556);
00042 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
00043 ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
00044 ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
00045 routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
00046 routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
00047
00048 hostMap_t host_map = MakeHostMap(pset);
00049 size_t tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
00050 size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
00051
00052 auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
00053 for (auto& d : dests.get_pset_names())
00054 {
00055 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
00056 host_map = MakeHostMap(dest_pset, host_map);
00057 }
00058 auto host_map_pset = MakeHostMapPset(host_map);
00059 fhicl::ParameterSet dests_mod;
00060 for (auto& d : dests.get_pset_names())
00061 {
00062 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
00063 dest_pset.erase("host_map");
00064 dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
00065
00066 if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
00067 {
00068 dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
00069 }
00070 if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
00071 {
00072 dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
00073 }
00074
00075 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
00076 }
00077
00078 for (auto& d : dests_mod.get_pset_names())
00079 {
00080 try
00081 {
00082 auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
00083 auto destination_rank = transfer->destination_rank();
00084 destinations_.emplace(destination_rank, std::move(transfer));
00085 destination_metric_data_[destination_rank] = std::pair<size_t, double>();
00086 destination_metric_send_time_[destination_rank] = std::chrono::steady_clock::now();
00087 }
00088 catch (std::invalid_argument)
00089 {
00090 TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
00091 }
00092 catch (cet::exception ex)
00093 {
00094 TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
00095 }
00096 catch (...)
00097 {
00098 TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
00099 }
00100 }
00101 if (destinations_.size() == 0)
00102 {
00103 TLOG(TLVL_ERROR) << "No destinations specified!";
00104 }
00105 else
00106 {
00107 auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
00108 if (enabled_dests.size() == 0)
00109 {
00110 TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
00111 for (auto& d : destinations_)
00112 {
00113 enabled_destinations_.insert(d.first);
00114 }
00115 }
00116 else
00117 {
00118 for (auto& d : enabled_dests)
00119 {
00120 enabled_destinations_.insert(d);
00121 }
00122 }
00123 }
00124 if (use_routing_master_) startTableReceiverThread_();
00125 }
00126
00127 artdaq::DataSenderManager::~DataSenderManager()
00128 {
00129 TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
00130 should_stop_ = true;
00131 for (auto& dest : enabled_destinations_)
00132 {
00133 if (destinations_.count(dest))
00134 {
00135 auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
00136 if (sts != TransferInterface::CopyStatus::kSuccess) TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
00137
00138 }
00139 }
00140 if (routing_thread_.joinable()) routing_thread_.join();
00141 TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
00142 }
00143
00144
00145 void artdaq::DataSenderManager::setupTableListener_()
00146 {
00147 int sts;
00148 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00149 if (table_socket_ < 0)
00150 {
00151 TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
00152 exit(1);
00153 }
00154
00155 struct sockaddr_in si_me_request;
00156
00157 int yes = 1;
00158 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00159 {
00160 TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
00161 exit(1);
00162 }
00163 memset(&si_me_request, 0, sizeof(si_me_request));
00164 si_me_request.sin_family = AF_INET;
00165 si_me_request.sin_port = htons(table_port_);
00166
00167 struct in_addr in_addr_s;
00168 sts = inet_aton(table_address_.c_str(), &in_addr_s );
00169 if (sts == 0)
00170 {
00171 TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
00172 }
00173 si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
00174 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00175 {
00176 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
00177 exit(1);
00178 }
00179
00180 struct ip_mreq mreq;
00181 sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
00182 if (sts == -1)
00183 {
00184 TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
00185 exit(1);
00186 }
00187 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00188 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00189 {
00190 TLOG(TLVL_ERROR) << "Unable to join multicast group";
00191 exit(1);
00192 }
00193 }
00194 void artdaq::DataSenderManager::startTableReceiverThread_()
00195 {
00196 if (routing_thread_.joinable()) routing_thread_.join();
00197 TLOG(TLVL_INFO) << "Starting Routing Thread";
00198 try {
00199 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
00200 }
00201 catch (const boost::exception& e)
00202 {
00203 TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00204 std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00205 exit(5);
00206 }
00207 }
00208 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
00209 {
00210 while (true)
00211 {
00212 if (should_stop_)
00213 {
00214 TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
00215 return;
00216 }
00217
00218 TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
00219 if (table_socket_ == -1)
00220 {
00221 TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
00222 setupTableListener_();
00223 }
00224 if (table_socket_ == -1)
00225 {
00226 TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
00227 return;
00228 }
00229 if (ack_socket_ == -1)
00230 {
00231 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00232 auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
00233 if (sts == -1)
00234 {
00235 TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_master_address";
00236 exit(1);
00237 }
00238 TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
00239 }
00240
00241 struct pollfd fd;
00242 fd.fd = table_socket_;
00243 fd.events = POLLIN | POLLPRI;
00244
00245 auto res = poll(&fd, 1, 1000);
00246 if (res > 0)
00247 {
00248 auto first = artdaq::Fragment::InvalidSequenceID;
00249 auto last = artdaq::Fragment::InvalidSequenceID;
00250 artdaq::detail::RoutingPacketHeader hdr;
00251
00252 TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
00253 struct sockaddr_in from;
00254 socklen_t len=sizeof(from);
00255 auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, (struct sockaddr*)&from, &len );
00256 TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " hdr bytes. (sizeof(RoutingPacketHeader) == " << sizeof(detail::RoutingPacketHeader)
00257 << " from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
00258
00259 TRACE(TLVL_DEBUG,"receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx",hdr.nEntries,((unsigned long*)&hdr)[0],((unsigned long*)&hdr)[1]);
00260 if (hdr.header != ROUTING_MAGIC)
00261 {
00262 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)="<<stss;
00263 }
00264 else if (stss != sizeof(artdaq::detail::RoutingPacketHeader))
00265 {
00266 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. size(bytes)="<<stss;
00267 }
00268 else
00269 {
00270 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
00271 {
00272 TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingMasterMode than expected!";
00273 exit(1);
00274 }
00275 routing_master_mode_ = hdr.mode;
00276
00277 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00278 TLOG(TLVL_DEBUG) << __func__ << ": Receiving data buffer";
00279 auto sts = recvfrom(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0, (struct sockaddr*)&from, &len );
00280 assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00281 TLOG(TLVL_DEBUG) << __func__ << ": Received " << sts << " pkt bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
00282 TRACE(6,"receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx",sts,((unsigned long*)&buffer[0])[0],((unsigned long*)&buffer[0])[1]);
00283
00284 first = buffer[0].sequence_id;
00285 last = buffer[buffer.size() - 1].sequence_id;
00286
00287 if (first + hdr.nEntries - 1 != last)
00288 {
00289 TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
00290 continue;
00291 }
00292 auto thisSeqID = first;
00293
00294 {
00295 std::unique_lock<std::mutex> lck(routing_mutex_);
00296 if (routing_table_.count(last) == 0)
00297 {
00298 for (auto entry : buffer)
00299 {
00300 if (thisSeqID != entry.sequence_id)
00301 {
00302 TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
00303 last = thisSeqID - 1;
00304 break;
00305 }
00306 thisSeqID++;
00307 if (routing_table_.count(entry.sequence_id))
00308 {
00309 if (routing_table_[entry.sequence_id] != entry.destination_rank)
00310 {
00311 TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
00312 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
00313 << " I will use the original value!";
00314 }
00315 continue;
00316 }
00317 if (entry.sequence_id < routing_table_last_) continue;
00318 routing_table_[entry.sequence_id] = entry.destination_rank;
00319 TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
00320 << " -> Rank " << entry.destination_rank;
00321 }
00322 }
00323
00324 TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
00325 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
00326
00327 auto counter = 0;
00328 for (auto& entry : routing_table_)
00329 {
00330 TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
00331 counter++;
00332 }
00333 }
00334
00335 artdaq::detail::RoutingAckPacket ack;
00336 ack.rank = my_rank;
00337 ack.first_sequence_id = first;
00338 ack.last_sequence_id = last;
00339
00340 if (last > routing_table_last_) routing_table_last_ = last;
00341
00342 TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
00343 sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
00344 }
00345 }
00346 }
00347 }
00348
00349 size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const
00350 {
00351 std::unique_lock<std::mutex> lck(routing_mutex_);
00352 return routing_table_.size();
00353 }
00354
00355 size_t artdaq::DataSenderManager::GetRemainingRoutingTableEntries() const
00356 {
00357 std::unique_lock<std::mutex> lck(routing_mutex_);
00358
00359 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
00360 return dist;
00361 }
00362
00363 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
00364 {
00365 if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT;
00366 if (!use_routing_master_ && enabled_destinations_.size() == 1) return *enabled_destinations_.begin();
00367
00368 if (use_routing_master_)
00369 {
00370 auto start = std::chrono::steady_clock::now();
00371 TLOG(15) << "calcDest_ use_routing_master check for routing info for seqID="<<sequence_id<<" routing_timeout_ms="<<routing_timeout_ms_<<" should_stop_="<<should_stop_;
00372 while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
00373 {
00374 {
00375 std::unique_lock<std::mutex> lck(routing_mutex_);
00376 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id))
00377 {
00378 if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
00379 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00380 return routing_table_.at(sequence_id);
00381 }
00382 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count() + 1))
00383 {
00384 if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
00385 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00386 return routing_table_.at(sent_frag_count_.count() + 1);
00387 }
00388 }
00389 usleep(routing_timeout_ms_ * 10);
00390 }
00391 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00392 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00393 {
00394 TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
00395 << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
00396 }
00397 else
00398 {
00399 TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
00400 << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
00401 }
00402 }
00403 else
00404 {
00405 auto index = sequence_id % enabled_destinations_.size();
00406 auto it = enabled_destinations_.begin();
00407 for (; index > 0; --index)
00408 {
00409 ++it;
00410 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
00411 }
00412 return *it;
00413 }
00414 return TransferInterface::RECV_TIMEOUT;
00415 }
00416
00417 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
00418 {
00419
00420
00421 auto start_time = std::chrono::steady_clock::now();
00422 if (frag.type() == Fragment::EndOfDataFragmentType)
00423 {
00424 throw cet::exception("LogicError")
00425 << "EOD fragments should not be sent on as received: "
00426 << "use sendEODFrag() instead.";
00427 }
00428 size_t seqID = frag.sequenceID();
00429 size_t fragSize = frag.sizeBytes();
00430 TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
00431 << ", seqID=" << seqID << ", type=" << frag.typeString();
00432 int dest = TransferInterface::RECV_TIMEOUT;
00433 auto outsts = TransferInterface::CopyStatus::kSuccess;
00434 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
00435 {
00436 for (auto& bdest : enabled_destinations_)
00437 {
00438 TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
00439
00440 Fragment fragCopy(frag);
00441 auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
00442 size_t retries = 0;
00443 while (sts == TransferInterface::CopyStatus::kTimeout && retries < send_retry_count_)
00444 {
00445 sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
00446 retries++;
00447 }
00448 if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
00449 sent_frag_count_.incSlot(bdest);
00450 }
00451 }
00452 else if (non_blocking_mode_)
00453 {
00454 auto count = routing_retry_count_;
00455 while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
00456 {
00457 dest = calcDest_(seqID);
00458 if (dest == TransferInterface::RECV_TIMEOUT)
00459 {
00460 count--;
00461 TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
00462 }
00463 }
00464 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
00465 {
00466 TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
00467 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00468 auto lastWarnTime = std::chrono::steady_clock::now();
00469 size_t retries = 0;
00470 while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
00471 {
00472 sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
00473 if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
00474 {
00475 TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
00476 lastWarnTime = std::chrono::steady_clock::now();
00477 }
00478 }
00479 if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
00480
00481 sent_frag_count_.incSlot(dest);
00482 }
00483 else if (!should_stop_)
00484 TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
00485 << ". enabled_destinantions_.size()="<<enabled_destinations_.size();
00486 }
00487 else
00488 {
00489 auto start = std::chrono::steady_clock::now();
00490 while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
00491 {
00492 dest = calcDest_(seqID);
00493 if (dest == TransferInterface::RECV_TIMEOUT)
00494 {
00495 TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
00496 usleep(10000);
00497 }
00498 }
00499 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
00500 {
00501 TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
00502 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00503
00504 sts = destinations_[dest]->moveFragment(std::move(frag));
00505 if (sts != TransferInterface::CopyStatus::kSuccess)
00506 TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
00507 << dest << " failed! Data has been lost!";
00508
00509
00510 sent_frag_count_.incSlot(dest);
00511 outsts = sts;
00512 }
00513 else if (!should_stop_)
00514 TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
00515 << ". enabled_destinantions_.size()="<<enabled_destinations_.size();
00516 }
00517
00518 {
00519 std::unique_lock<std::mutex> lck(routing_mutex_);
00520
00521
00522
00523
00524 if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID) != routing_table_.end())
00525 routing_table_.erase(routing_table_.find(seqID));
00526 else if(routing_table_.find(sent_frag_count_.count()) != routing_table_.end())
00527 routing_table_.erase(routing_table_.find(sent_frag_count_.count()));
00528 }
00529
00530
00531 auto delta_t = TimeUtils::GetElapsedTime(start_time);
00532 destination_metric_data_[dest].first += fragSize;
00533 destination_metric_data_[dest].second += delta_t;
00534
00535 if (metricMan && TimeUtils::GetElapsedTime(destination_metric_send_time_[dest]) > 1)
00536 {
00537 TLOG(5) << "sendFragment: sending metrics";
00538 metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), destination_metric_data_[dest].second, "s", 5, MetricMode::Accumulate);
00539 metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), destination_metric_data_[dest].first, "B", 5, MetricMode::Accumulate);
00540 metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), destination_metric_data_[dest].first / destination_metric_data_[dest].second, "B/s", 5, MetricMode::Average);
00541 metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
00542
00543 destination_metric_send_time_[dest] = std::chrono::steady_clock::now();
00544 destination_metric_data_[dest].first = 0;
00545 destination_metric_data_[dest].second = 0.0;
00546
00547 if (use_routing_master_)
00548 {
00549 metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
00550 if (routing_wait_time_ > 0)
00551 {
00552 metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
00553 routing_wait_time_ = 0;
00554 }
00555 }
00556 }
00557 TLOG(5) << "sendFragment: Done sending fragment " << seqID;
00558 return std::make_pair(dest, outsts);
00559 }