$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_00
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #define TRACE_NAME (app_name + "_DataSenderManager").c_str() 00002 #include "artdaq/DAQdata/Globals.hh" 00003 #include "artdaq/DAQrate/DataSenderManager.hh" 00004 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh" 00005 #include "artdaq/TransferPlugins/detail/HostMap.hh" 00006 00007 #include <chrono> 00008 #include "canvas/Utilities/Exception.h" 00009 #include <arpa/inet.h> 00010 #include <netinet/in.h> 00011 #include <sys/types.h> 00012 #include <poll.h> 00013 #include <sys/socket.h> 00014 #include "artdaq/DAQdata/TCPConnect.hh" 00015 00016 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset) 00017 : destinations_() 00018 , destination_metric_data_() 00019 , destination_metric_send_time_() 00020 , enabled_destinations_() 00021 , sent_frag_count_() 00022 , broadcast_sends_(pset.get<bool>("broadcast_sends", false)) 00023 , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false)) 00024 , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000)) 00025 , send_retry_count_(pset.get<size_t>("send_retry_count", 2)) 00026 , routing_master_mode_(detail::RoutingMasterMode::INVALID) 00027 , should_stop_(false) 00028 , ack_socket_(-1) 00029 , table_socket_(-1) 00030 , routing_table_last_(0) 00031 , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000)) 00032 , highest_sequence_id_routed_(0) 00033 { 00034 TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string(); 00035 00036 // Validate parameters 00037 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max(); 00038 00039 auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet()); 00040 use_routing_master_ = rmConfig.get<bool>("use_routing_master", false); 00041 table_port_ = rmConfig.get<int>("table_update_port", 35556); 00042 table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28"); 00043 ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557); 00044 ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost"); 00045 routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000)); 00046 routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5); 00047 00048 hostMap_t host_map = MakeHostMap(pset); 00049 size_t tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0); 00050 size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0); 00051 00052 auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet()); 00053 for (auto& d : dests.get_pset_names()) 00054 { 00055 auto dest_pset = dests.get<fhicl::ParameterSet>(d); 00056 host_map = MakeHostMap(dest_pset, host_map); 00057 } 00058 auto host_map_pset = MakeHostMapPset(host_map); 00059 fhicl::ParameterSet dests_mod; 00060 for (auto& d : dests.get_pset_names()) 00061 { 00062 auto dest_pset = dests.get<fhicl::ParameterSet>(d); 00063 dest_pset.erase("host_map"); 00064 dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset); 00065 00066 if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size")) 00067 { 00068 dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size); 00069 } 00070 if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words")) 00071 { 00072 dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words); 00073 } 00074 00075 dests_mod.put<fhicl::ParameterSet>(d, dest_pset); 00076 } 00077 00078 for (auto& d : dests_mod.get_pset_names()) 00079 { 00080 try 00081 { 00082 auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend); 00083 auto destination_rank = transfer->destination_rank(); 00084 destinations_.emplace(destination_rank, std::move(transfer)); 00085 destination_metric_data_[destination_rank] = std::pair<size_t, double>(); 00086 destination_metric_send_time_[destination_rank] = std::chrono::steady_clock::now(); 00087 } 00088 catch (std::invalid_argument) 00089 { 00090 TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d; 00091 } 00092 catch (cet::exception ex) 00093 { 00094 TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what(); 00095 } 00096 catch (...) 00097 { 00098 TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << "."; 00099 } 00100 } 00101 if (destinations_.size() == 0) 00102 { 00103 TLOG(TLVL_ERROR) << "No destinations specified!"; 00104 } 00105 else 00106 { 00107 auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>()); 00108 if (enabled_dests.size() == 0) 00109 { 00110 TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled."; 00111 for (auto& d : destinations_) 00112 { 00113 enabled_destinations_.insert(d.first); 00114 } 00115 } 00116 else 00117 { 00118 for (auto& d : enabled_dests) 00119 { 00120 enabled_destinations_.insert(d); 00121 } 00122 } 00123 } 00124 if (use_routing_master_) startTableReceiverThread_(); 00125 } 00126 00127 artdaq::DataSenderManager::~DataSenderManager() 00128 { 00129 TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN"; 00130 should_stop_ = true; 00131 for (auto& dest : enabled_destinations_) 00132 { 00133 if (destinations_.count(dest)) 00134 { 00135 auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest)))); 00136 if (sts != TransferInterface::CopyStatus::kSuccess) TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest; 00137 // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true); 00138 } 00139 } 00140 if (routing_thread_.joinable()) routing_thread_.join(); 00141 TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments."; 00142 } 00143 00144 00145 void artdaq::DataSenderManager::setupTableListener_() 00146 { 00147 int sts; 00148 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00149 if (table_socket_ < 0) 00150 { 00151 TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!"; 00152 exit(1); 00153 } 00154 00155 struct sockaddr_in si_me_request; 00156 00157 int yes = 1; 00158 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00159 { 00160 TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket"; 00161 exit(1); 00162 } 00163 memset(&si_me_request, 0, sizeof(si_me_request)); 00164 si_me_request.sin_family = AF_INET; 00165 si_me_request.sin_port = htons(table_port_); 00166 //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY); 00167 struct in_addr in_addr_s; 00168 sts = inet_aton(table_address_.c_str(), &in_addr_s ); 00169 if (sts == 0) 00170 { 00171 TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid"; 00172 } 00173 si_me_request.sin_addr.s_addr = in_addr_s.s_addr; 00174 if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1) 00175 { 00176 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_; 00177 exit(1); 00178 } 00179 00180 struct ip_mreq mreq; 00181 sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr); 00182 if (sts == -1) 00183 { 00184 TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates"; 00185 exit(1); 00186 } 00187 mreq.imr_interface.s_addr = htonl(INADDR_ANY); 00188 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) 00189 { 00190 TLOG(TLVL_ERROR) << "Unable to join multicast group"; 00191 exit(1); 00192 } 00193 } 00194 void artdaq::DataSenderManager::startTableReceiverThread_() 00195 { 00196 if (routing_thread_.joinable()) routing_thread_.join(); 00197 TLOG(TLVL_INFO) << "Starting Routing Thread"; 00198 try { 00199 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this); 00200 } 00201 catch (const boost::exception& e) 00202 { 00203 TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno; 00204 std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00205 exit(5); 00206 } 00207 } 00208 void artdaq::DataSenderManager::receiveTableUpdatesLoop_() 00209 { 00210 while (true) 00211 { 00212 if (should_stop_) 00213 { 00214 TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping"; 00215 return; 00216 } 00217 00218 TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes"; 00219 if (table_socket_ == -1) 00220 { 00221 TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket"; 00222 setupTableListener_(); 00223 } 00224 if (table_socket_ == -1) 00225 { 00226 TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully."; 00227 return; 00228 } 00229 if (ack_socket_ == -1) 00230 { 00231 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00232 auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_); 00233 if (sts == -1) 00234 { 00235 TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_master_address"; 00236 exit(1); 00237 } 00238 TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_; 00239 } 00240 00241 struct pollfd fd; 00242 fd.fd = table_socket_; 00243 fd.events = POLLIN | POLLPRI; 00244 00245 auto res = poll(&fd, 1, 1000); 00246 if (res > 0) 00247 { 00248 auto first = artdaq::Fragment::InvalidSequenceID; 00249 auto last = artdaq::Fragment::InvalidSequenceID; 00250 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE); 00251 artdaq::detail::RoutingPacketHeader hdr; 00252 00253 TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader"; 00254 struct sockaddr_in from; 00255 socklen_t len=sizeof(from); 00256 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len ); 00257 TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port; 00258 00259 if (stss > static_cast<ssize_t>(sizeof(hdr))) 00260 { 00261 memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader)); 00262 } 00263 else 00264 { 00265 TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding."; 00266 continue; 00267 } 00268 00269 TRACE(TLVL_DEBUG,"receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx",hdr.nEntries,((unsigned long*)&hdr)[0],((unsigned long*)&hdr)[1]); 00270 if (hdr.header != ROUTING_MAGIC) 00271 { 00272 TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)="<<stss; 00273 } 00274 else 00275 { 00276 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode) 00277 { 00278 TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingMasterMode than expected!"; 00279 exit(1); 00280 } 00281 routing_master_mode_ = hdr.mode; 00282 00283 artdaq::detail::RoutingPacket buffer(hdr.nEntries); 00284 assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00285 memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries); 00286 TRACE(6,"receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx",stss,((unsigned long*)&buffer[0])[0],((unsigned long*)&buffer[0])[1]); 00287 00288 first = buffer[0].sequence_id; 00289 last = buffer[buffer.size() - 1].sequence_id; 00290 00291 if (first + hdr.nEntries - 1 != last) 00292 { 00293 TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!"; 00294 continue; 00295 } 00296 auto thisSeqID = first; 00297 00298 { 00299 std::unique_lock<std::mutex> lck(routing_mutex_); 00300 if (routing_table_.count(last) == 0) 00301 { 00302 for (auto entry : buffer) 00303 { 00304 if (thisSeqID != entry.sequence_id) 00305 { 00306 TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!"; 00307 last = thisSeqID - 1; 00308 break; 00309 } 00310 thisSeqID++; 00311 if (routing_table_.count(entry.sequence_id)) 00312 { 00313 if (routing_table_[entry.sequence_id] != entry.destination_rank) 00314 { 00315 TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id 00316 << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!" 00317 << " I will use the original value!"; 00318 } 00319 continue; 00320 } 00321 if (entry.sequence_id < routing_table_last_) continue; 00322 routing_table_[entry.sequence_id] = entry.destination_rank; 00323 TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id 00324 << " -> Rank " << entry.destination_rank; 00325 } 00326 } 00327 00328 TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table"; 00329 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first; 00330 00331 auto counter = 0; 00332 for (auto& entry : routing_table_) 00333 { 00334 TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second; 00335 counter++; 00336 } 00337 } 00338 00339 artdaq::detail::RoutingAckPacket ack; 00340 ack.rank = my_rank; 00341 ack.first_sequence_id = first; 00342 ack.last_sequence_id = last; 00343 00344 if (last > routing_table_last_) routing_table_last_ = last; 00345 00346 TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")"; 00347 sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_)); 00348 } 00349 } 00350 } 00351 } 00352 00353 size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const 00354 { 00355 std::unique_lock<std::mutex> lck(routing_mutex_); 00356 return routing_table_.size(); 00357 } 00358 00359 size_t artdaq::DataSenderManager::GetRemainingRoutingTableEntries() const 00360 { 00361 std::unique_lock<std::mutex> lck(routing_mutex_); 00362 // Find the distance from the next highest sequence ID to the end of the list 00363 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end()); 00364 return dist; // If dist == 1, there is one entry left. 00365 } 00366 00367 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const 00368 { 00369 if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured. 00370 if (!use_routing_master_ && enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case 00371 00372 if (use_routing_master_) 00373 { 00374 auto start = std::chrono::steady_clock::now(); 00375 TLOG(15) << "calcDest_ use_routing_master check for routing info for seqID="<<sequence_id<<" routing_timeout_ms="<<routing_timeout_ms_<<" should_stop_="<<should_stop_; 00376 while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))) 00377 { 00378 { 00379 std::unique_lock<std::mutex> lck(routing_mutex_); 00380 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) 00381 { 00382 if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id; 00383 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start)); 00384 return routing_table_.at(sequence_id); 00385 } 00386 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count() + 1)) 00387 { 00388 if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1; 00389 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start)); 00390 return routing_table_.at(sent_frag_count_.count() + 1); 00391 } 00392 } 00393 usleep(routing_timeout_ms_ * 10); 00394 } 00395 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start)); 00396 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) 00397 { 00398 TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id 00399 << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!"; 00400 } 00401 else 00402 { 00403 TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count() 00404 << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!"; 00405 } 00406 } 00407 else 00408 { 00409 auto index = sequence_id % enabled_destinations_.size(); 00410 auto it = enabled_destinations_.begin(); 00411 for (; index > 0; --index) 00412 { 00413 ++it; 00414 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin(); 00415 } 00416 return *it; 00417 } 00418 return TransferInterface::RECV_TIMEOUT; 00419 } 00420 00421 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag) 00422 { 00423 // Precondition: Fragment must be complete and consistent (including 00424 // header information). 00425 auto start_time = std::chrono::steady_clock::now(); 00426 if (frag.type() == Fragment::EndOfDataFragmentType) 00427 { 00428 throw cet::exception("LogicError") 00429 << "EOD fragments should not be sent on as received: " 00430 << "use sendEODFrag() instead."; 00431 } 00432 size_t seqID = frag.sequenceID(); 00433 size_t fragSize = frag.sizeBytes(); 00434 TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize 00435 << ", seqID=" << seqID << ", type=" << frag.typeString(); 00436 int dest = TransferInterface::RECV_TIMEOUT; 00437 auto outsts = TransferInterface::CopyStatus::kSuccess; 00438 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType) 00439 { 00440 for (auto& bdest : enabled_destinations_) 00441 { 00442 TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)"; 00443 // Gross, we have to copy. 00444 auto sts = TransferInterface::CopyStatus::kTimeout; 00445 size_t retries = 0; // Tried once, so retries < send_retry_count_ will have it retry send_retry_count_ times 00446 while (sts == TransferInterface::CopyStatus::kTimeout && retries < send_retry_count_) 00447 { 00448 if (!non_blocking_mode_) 00449 { 00450 sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag)); 00451 } 00452 else 00453 { 00454 sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_); 00455 } 00456 retries++; 00457 } 00458 if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts; 00459 sent_frag_count_.incSlot(bdest); 00460 } 00461 } 00462 else if (non_blocking_mode_) 00463 { 00464 auto count = routing_retry_count_; 00465 while (dest == TransferInterface::RECV_TIMEOUT && count > 0) 00466 { 00467 dest = calcDest_(seqID); 00468 if (dest == TransferInterface::RECV_TIMEOUT) 00469 { 00470 count--; 00471 TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : "."); 00472 } 00473 } 00474 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest)) 00475 { 00476 TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest; 00477 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException; 00478 auto lastWarnTime = std::chrono::steady_clock::now(); 00479 size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times 00480 while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_) 00481 { 00482 sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_); 00483 if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1) 00484 { 00485 TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..."; 00486 lastWarnTime = std::chrono::steady_clock::now(); 00487 } 00488 } 00489 if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts; 00490 //sendFragTo(std::move(frag), dest); 00491 sent_frag_count_.incSlot(dest); 00492 } 00493 else if (!should_stop_) 00494 TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID 00495 << ". enabled_destinantions_.size()="<<enabled_destinations_.size(); 00496 } 00497 else 00498 { 00499 auto start = std::chrono::steady_clock::now(); 00500 while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT) 00501 { 00502 dest = calcDest_(seqID); 00503 if (dest == TransferInterface::RECV_TIMEOUT) 00504 { 00505 TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information."; 00506 usleep(10000); 00507 } 00508 } 00509 if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest)) 00510 { 00511 TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest; 00512 TransferInterface::CopyStatus sts = TransferInterface::CopyStatus::kErrorNotRequiringException; 00513 00514 sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag)); 00515 if (sts != TransferInterface::CopyStatus::kSuccess) 00516 TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination " 00517 << dest << " failed! Data has been lost!"; 00518 00519 //sendFragTo(std::move(frag), dest); 00520 sent_frag_count_.incSlot(dest); 00521 outsts = sts; 00522 } 00523 else if (!should_stop_) 00524 TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID 00525 << ". enabled_destinantions_.size()="<<enabled_destinations_.size(); 00526 } 00527 00528 { 00529 std::unique_lock<std::mutex> lck(routing_mutex_); 00530 // while (routing_table_.size() > routing_table_max_size_) 00531 // { 00532 // routing_table_.erase(routing_table_.begin()); 00533 // } 00534 if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID) != routing_table_.end()) 00535 routing_table_.erase(routing_table_.find(seqID)); 00536 else if(routing_table_.find(sent_frag_count_.count()) != routing_table_.end()) 00537 routing_table_.erase(routing_table_.find(sent_frag_count_.count())); 00538 } 00539 00540 00541 auto delta_t = TimeUtils::GetElapsedTime(start_time); 00542 destination_metric_data_[dest].first += fragSize; 00543 destination_metric_data_[dest].second += delta_t; 00544 00545 if (metricMan && TimeUtils::GetElapsedTime(destination_metric_send_time_[dest]) > 1) 00546 { 00547 TLOG(5) << "sendFragment: sending metrics"; 00548 metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), destination_metric_data_[dest].second, "s", 5, MetricMode::Accumulate); 00549 metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), destination_metric_data_[dest].first, "B", 5, MetricMode::Accumulate); 00550 metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), destination_metric_data_[dest].first / destination_metric_data_[dest].second, "B/s", 5, MetricMode::Average); 00551 metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint); 00552 00553 destination_metric_send_time_[dest] = std::chrono::steady_clock::now(); 00554 destination_metric_data_[dest].first = 0; 00555 destination_metric_data_[dest].second = 0.0; 00556 00557 if (use_routing_master_) 00558 { 00559 metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint); 00560 if (routing_wait_time_ > 0) 00561 { 00562 metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average); 00563 routing_wait_time_ = 0; 00564 } 00565 } 00566 } 00567 TLOG(5) << "sendFragment: Done sending fragment " << seqID << " to dest="<<dest; 00568 return std::make_pair(dest, outsts); 00569 } // artdaq::DataSenderManager::sendFragment