00001 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
#include "artdaq/DAQdata/Globals.hh"
#include "artdaq/DAQrate/DataSenderManager.hh"
#include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
#include "artdaq/TransferPlugins/detail/HostMap.hh"

#include <chrono>
#include "canvas/Utilities/Exception.h"
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include "artdaq/DAQdata/TCPConnect.hh"
00015
00016 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
00017 : destinations_()
00018 , destination_metric_data_()
00019 , destination_metric_send_time_()
00020 , enabled_destinations_()
00021 , sent_frag_count_()
00022 , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
00023 , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
00024 , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
00025 , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
00026 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
00027 , should_stop_(false)
00028 , ack_socket_(-1)
00029 , table_socket_(-1)
00030 , routing_table_last_(0)
00031 , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
00032 , highest_sequence_id_routed_(0)
00033 {
00034 TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
00035
00036
00037 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
00038
00039 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
00040 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
00041 table_port_ = rmConfig.get<int>("table_update_port", 35556);
00042 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
00043 ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
00044 ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
00045 routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
00046 routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
00047
00048 hostMap_t host_map = MakeHostMap(pset);
00049 size_t tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
00050 size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
00051
00052 auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
00053 for (auto& d : dests.get_pset_names())
00054 {
00055 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
00056 host_map = MakeHostMap(dest_pset, host_map);
00057 }
00058 auto host_map_pset = MakeHostMapPset(host_map);
00059 fhicl::ParameterSet dests_mod;
00060 for (auto& d : dests.get_pset_names())
00061 {
00062 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
00063 dest_pset.erase("host_map");
00064 dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
00065
00066 if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
00067 {
00068 dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
00069 }
00070 if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
00071 {
00072 dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
00073 }
00074
00075 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
00076 }
00077
00078 for (auto& d : dests_mod.get_pset_names())
00079 {
00080 try
00081 {
00082 auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
00083 auto destination_rank = transfer->destination_rank();
00084 destinations_.emplace(destination_rank, std::move(transfer));
00085 destination_metric_data_[destination_rank] = std::pair<size_t, double>();
00086 destination_metric_send_time_[destination_rank] = std::chrono::steady_clock::now();
00087 }
00088 catch (std::invalid_argument)
00089 {
00090 TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
00091 }
00092 catch (cet::exception ex)
00093 {
00094 TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
00095 }
00096 catch (...)
00097 {
00098 TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
00099 }
00100 }
00101 if (destinations_.size() == 0)
00102 {
00103 TLOG(TLVL_ERROR) << "No destinations specified!";
00104 }
00105 else
00106 {
00107 auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
00108 if (enabled_dests.size() == 0)
00109 {
00110 TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
00111 for (auto& d : destinations_)
00112 {
00113 enabled_destinations_.insert(d.first);
00114 }
00115 }
00116 else
00117 {
00118 for (auto& d : enabled_dests)
00119 {
00120 enabled_destinations_.insert(d);
00121 }
00122 }
00123 }
00124 if (use_routing_master_) startTableReceiverThread_();
00125 }
00126
00127 artdaq::DataSenderManager::~DataSenderManager()
00128 {
00129 TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
00130 should_stop_ = true;
00131 for (auto& dest : enabled_destinations_)
00132 {
00133 if (destinations_.count(dest))
00134 {
00135 auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
00136 if (sts != TransferInterface::CopyStatus::kSuccess) TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
00137
00138 }
00139 }
00140 if (routing_thread_.joinable()) routing_thread_.join();
00141 TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
00142 }
00143
00144
00145 void artdaq::DataSenderManager::setupTableListener_()
00146 {
00147 int sts;
00148 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00149 if (table_socket_ < 0)
00150 {
00151 TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
00152 exit(1);
00153 }
00154
00155 struct sockaddr_in si_me_request;
00156
00157 int yes = 1;
00158 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00159 {
00160 TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
00161 exit(1);
00162 }
00163 memset(&si_me_request, 0, sizeof(si_me_request));
00164 si_me_request.sin_family = AF_INET;
00165 si_me_request.sin_port = htons(table_port_);
00166
00167 struct in_addr in_addr_s;
00168 sts = inet_aton(table_address_.c_str(), &in_addr_s );
00169 if (sts == 0)
00170 {
00171 TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
00172 }
00173 si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
00174 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00175 {
00176 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
00177 exit(1);
00178 }
00179
00180 struct ip_mreq mreq;
00181 sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
00182 if (sts == -1)
00183 {
00184 TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
00185 exit(1);
00186 }
00187 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
00188 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00189 {
00190 TLOG(TLVL_ERROR) << "Unable to join multicast group";
00191 exit(1);
00192 }
00193 }
00194 void artdaq::DataSenderManager::startTableReceiverThread_()
00195 {
00196 if (routing_thread_.joinable()) routing_thread_.join();
00197 TLOG(TLVL_INFO) << "Starting Routing Thread";
00198 try {
00199 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
00200 }
00201 catch (const boost::exception& e)
00202 {
00203 TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00204 std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00205 exit(5);
00206 }
00207 }
00208 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
00209 {
00210 while (true)
00211 {
00212 if (should_stop_)
00213 {
00214 TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
00215 return;
00216 }
00217
00218 TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes";
00219 if (table_socket_ == -1)
00220 {
00221 TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
00222 setupTableListener_();
00223 }
00224 if (table_socket_ == -1)
00225 {
00226 TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
00227 return;
00228 }
00229 if (ack_socket_ == -1)
00230 {
00231 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00232 auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
00233 if (sts == -1)
00234 {
00235 TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_master_address";
00236 exit(1);
00237 }
00238 TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
00239 }
00240
00241 struct pollfd fd;
00242 fd.fd = table_socket_;
00243 fd.events = POLLIN | POLLPRI;
00244
00245 auto res = poll(&fd, 1, 1000);
00246 if (res > 0)
00247 {
00248 auto first = artdaq::Fragment::InvalidSequenceID;
00249 auto last = artdaq::Fragment::InvalidSequenceID;
00250 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
00251 artdaq::detail::RoutingPacketHeader hdr;
00252
00253 TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
00254 struct sockaddr_in from;
00255 socklen_t len=sizeof(from);
00256 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len );
00257 TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
00258
00259 if (stss > static_cast<ssize_t>(sizeof(hdr)))
00260 {
00261 memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
00262 }
00263 else
00264 {
00265 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
00266 continue;
00267 }
00268
00269 TRACE(TLVL_DEBUG,"receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx",hdr.nEntries,((unsigned long*)&hdr)[0],((unsigned long*)&hdr)[1]);
00270 if (hdr.header != ROUTING_MAGIC)
00271 {
00272 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)="<<stss;
00273 }
00274 else
00275 {
00276 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
00277 {
00278 TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingMasterMode than expected!";
00279 exit(1);
00280 }
00281 routing_master_mode_ = hdr.mode;
00282
00283 artdaq::detail::RoutingPacket buffer(hdr.nEntries);
00284 assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00285 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
00286 TRACE(6,"receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx",stss,((unsigned long*)&buffer[0])[0],((unsigned long*)&buffer[0])[1]);
00287
00288 first = buffer[0].sequence_id;
00289 last = buffer[buffer.size() - 1].sequence_id;
00290
00291 if (first + hdr.nEntries - 1 != last)
00292 {
00293 TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
00294 continue;
00295 }
00296 auto thisSeqID = first;
00297
00298 {
00299 std::unique_lock<std::mutex> lck(routing_mutex_);
00300 if (routing_table_.count(last) == 0)
00301 {
00302 for (auto entry : buffer)
00303 {
00304 if (thisSeqID != entry.sequence_id)
00305 {
00306 TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
00307 last = thisSeqID - 1;
00308 break;
00309 }
00310 thisSeqID++;
00311 if (routing_table_.count(entry.sequence_id))
00312 {
00313 if (routing_table_[entry.sequence_id] != entry.destination_rank)
00314 {
00315 TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
00316 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
00317 << " I will use the original value!";
00318 }
00319 continue;
00320 }
00321 if (entry.sequence_id < routing_table_last_) continue;
00322 routing_table_[entry.sequence_id] = entry.destination_rank;
00323 TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
00324 << " -> Rank " << entry.destination_rank;
00325 }
00326 }
00327
00328 TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
00329 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
00330
00331 auto counter = 0;
00332 for (auto& entry : routing_table_)
00333 {
00334 TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
00335 counter++;
00336 }
00337 }
00338
00339 artdaq::detail::RoutingAckPacket ack;
00340 ack.rank = my_rank;
00341 ack.first_sequence_id = first;
00342 ack.last_sequence_id = last;
00343
00344 if (last > routing_table_last_) routing_table_last_ = last;
00345
00346 TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
00347 sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
00348 }
00349 }
00350 }
00351 }
00352
00353 size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const
00354 {
00355 std::unique_lock<std::mutex> lck(routing_mutex_);
00356 return routing_table_.size();
00357 }
00358
00359 size_t artdaq::DataSenderManager::GetRemainingRoutingTableEntries() const
00360 {
00361 std::unique_lock<std::mutex> lck(routing_mutex_);
00362
00363 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
00364 return dist;
00365 }
00366
00367 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
00368 {
00369 if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT;
00370 if (!use_routing_master_ && enabled_destinations_.size() == 1) return *enabled_destinations_.begin();
00371
00372 if (use_routing_master_)
00373 {
00374 auto start = std::chrono::steady_clock::now();
00375 TLOG(15) << "calcDest_ use_routing_master check for routing info for seqID="<<sequence_id<<" routing_timeout_ms="<<routing_timeout_ms_<<" should_stop_="<<should_stop_;
00376 while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
00377 {
00378 {
00379 std::unique_lock<std::mutex> lck(routing_mutex_);
00380 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id))
00381 {
00382 if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
00383 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00384 return routing_table_.at(sequence_id);
00385 }
00386 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count() + 1))
00387 {
00388 if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
00389 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00390 return routing_table_.at(sent_frag_count_.count() + 1);
00391 }
00392 }
00393 usleep(routing_timeout_ms_ * 10);
00394 }
00395 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
00396 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
00397 {
00398 TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
00399 << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
00400 }
00401 else
00402 {
00403 TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
00404 << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
00405 }
00406 }
00407 else
00408 {
00409 auto index = sequence_id % enabled_destinations_.size();
00410 auto it = enabled_destinations_.begin();
00411 for (; index > 0; --index)
00412 {
00413 ++it;
00414 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
00415 }
00416 return *it;
00417 }
00418 return TransferInterface::RECV_TIMEOUT;
00419 }
00420
00421 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
00422 {
00423
00424
00425 auto start_time = std::chrono::steady_clock::now();
00426 if (frag.type() == Fragment::EndOfDataFragmentType)
00427 {
00428 throw cet::exception("LogicError")
00429 << "EOD fragments should not be sent on as received: "
00430 << "use sendEODFrag() instead.";
00431 }
00432 size_t seqID = frag.sequenceID();
00433 size_t fragSize = frag.sizeBytes();
00434 TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
00435 << ", seqID=" << seqID << ", type=" << frag.typeString();
00436 int dest = TransferInterface::RECV_TIMEOUT;
00437 auto outsts = TransferInterface::CopyStatus::kSuccess;
00438 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
00439 {
00440 for (auto& bdest : enabled_destinations_)
00441 {
00442 TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
00443
00444 auto sts = TransferInterface::CopyStatus::kTimeout;
00445 size_t retries = 0;
00446 while (sts == TransferInterface::CopyStatus::kTimeout && retries < send_retry_count_)
00447 {
00448 if (!non_blocking_mode_)
00449 {
00450 sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
00451 }
00452 else
00453 {
00454 sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
00455 }
00456 retries++;
00457 }
00458 if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
00459 sent_frag_count_.incSlot(bdest);
00460 }
00461 }
00462 else if (non_blocking_mode_)
00463 {
00464 auto count = routing_retry_count_;
00465 while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
00466 {
00467 dest = calcDest_(seqID);
00468 if (dest == TransferInterface::RECV_TIMEOUT)
00469 {
00470 count--;
00471 TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
00472 }
00473 }
00474 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
00475 {
00476 TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
00477 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00478 auto lastWarnTime = std::chrono::steady_clock::now();
00479 size_t retries = 0;
00480 while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
00481 {
00482 sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
00483 if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
00484 {
00485 TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
00486 lastWarnTime = std::chrono::steady_clock::now();
00487 }
00488 }
00489 if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
00490
00491 sent_frag_count_.incSlot(dest);
00492 }
00493 else if (!should_stop_)
00494 TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
00495 << ". enabled_destinantions_.size()="<<enabled_destinations_.size();
00496 }
00497 else
00498 {
00499 auto start = std::chrono::steady_clock::now();
00500 while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
00501 {
00502 dest = calcDest_(seqID);
00503 if (dest == TransferInterface::RECV_TIMEOUT)
00504 {
00505 TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
00506 usleep(10000);
00507 }
00508 }
00509 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
00510 {
00511 TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
00512 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException;
00513
00514 sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
00515 if (sts != TransferInterface::CopyStatus::kSuccess)
00516 TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
00517 << dest << " failed! Data has been lost!";
00518
00519
00520 sent_frag_count_.incSlot(dest);
00521 outsts = sts;
00522 }
00523 else if (!should_stop_)
00524 TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
00525 << ". enabled_destinantions_.size()="<<enabled_destinations_.size();
00526 }
00527
00528 {
00529 std::unique_lock<std::mutex> lck(routing_mutex_);
00530
00531
00532
00533
00534 if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID) != routing_table_.end())
00535 routing_table_.erase(routing_table_.find(seqID));
00536 else if(routing_table_.find(sent_frag_count_.count()) != routing_table_.end())
00537 routing_table_.erase(routing_table_.find(sent_frag_count_.count()));
00538 }
00539
00540
00541 auto delta_t = TimeUtils::GetElapsedTime(start_time);
00542 destination_metric_data_[dest].first += fragSize;
00543 destination_metric_data_[dest].second += delta_t;
00544
00545 if (metricMan && TimeUtils::GetElapsedTime(destination_metric_send_time_[dest]) > 1)
00546 {
00547 TLOG(5) << "sendFragment: sending metrics";
00548 metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), destination_metric_data_[dest].second, "s", 5, MetricMode::Accumulate);
00549 metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), destination_metric_data_[dest].first, "B", 5, MetricMode::Accumulate);
00550 metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), destination_metric_data_[dest].first / destination_metric_data_[dest].second, "B/s", 5, MetricMode::Average);
00551 metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
00552
00553 destination_metric_send_time_[dest] = std::chrono::steady_clock::now();
00554 destination_metric_data_[dest].first = 0;
00555 destination_metric_data_[dest].second = 0.0;
00556
00557 if (use_routing_master_)
00558 {
00559 metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
00560 if (routing_wait_time_ > 0)
00561 {
00562 metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
00563 routing_wait_time_ = 0;
00564 }
00565 }
00566 }
00567 TLOG(5) << "sendFragment: Done sending fragment " << seqID << " to dest="<<dest;
00568 return std::make_pair(dest, outsts);
00569 }