artdaq  v3_08_00
DataSenderManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
6 
7 #include <arpa/inet.h>
8 #include <netinet/in.h>
9 #include <poll.h>
10 #include <sys/socket.h>
11 #include <sys/types.h>
12 #include <chrono>
14 #include "canvas/Utilities/Exception.h"
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : destinations_()
18  , enabled_destinations_()
19  , sent_frag_count_()
20  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
21  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
22  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
23  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
24  , routing_master_mode_(detail::RoutingMasterMode::INVALID)
25  , should_stop_(false)
26  , ack_socket_(-1)
27  , table_socket_(-1)
28  , routing_table_last_(0)
29  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
30  , highest_sequence_id_routed_(0)
31 {
32  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
33 
34  // Validate parameters
35  if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
36 
37  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
38  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
39  table_port_ = rmConfig.get<int>("table_update_port", 35556);
40  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
41  table_multicast_interface_ = rmConfig.get<std::string>("table_update_multicast_interface", "localhost");
42  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
43  ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
44  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
45  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
46 
47  hostMap_t host_map = MakeHostMap(pset);
48  size_t tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
49  size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
50 
51  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
52  for (auto& d : dests.get_pset_names())
53  {
54  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
55  host_map = MakeHostMap(dest_pset, host_map);
56  }
57  auto host_map_pset = MakeHostMapPset(host_map);
58  fhicl::ParameterSet dests_mod;
59  for (auto& d : dests.get_pset_names())
60  {
61  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
62  dest_pset.erase("host_map");
63  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
64 
65  if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
66  {
67  dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
68  }
69  if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
70  {
71  dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
72  }
73 
74  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
75  }
76 
77  for (auto& d : dests_mod.get_pset_names())
78  {
79  try
80  {
81  auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
82  auto destination_rank = transfer->destination_rank();
83  destinations_.emplace(destination_rank, std::move(transfer));
84  }
85  catch (const std::invalid_argument&)
86  {
87  TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
88  }
89  catch (const cet::exception& ex)
90  {
91  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
92  }
93  catch (...)
94  {
95  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
96  }
97  }
98  if (destinations_.size() == 0)
99  {
100  TLOG(TLVL_ERROR) << "No destinations specified!";
101  }
102  else
103  {
104  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
105  if (enabled_dests.size() == 0)
106  {
107  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
108  for (auto& d : destinations_)
109  {
110  enabled_destinations_.insert(d.first);
111  }
112  }
113  else
114  {
115  for (auto& d : enabled_dests)
116  {
117  enabled_destinations_.insert(d);
118  }
119  }
120  }
121  if (use_routing_master_) startTableReceiverThread_();
122 }
123 
125 {
126  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
127  should_stop_ = true;
128  for (auto& dest : enabled_destinations_)
129  {
130  if (destinations_.count(dest))
131  {
132  auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
133  if (sts != TransferInterface::CopyStatus::kSuccess) TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
134  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
135  }
136  }
137  if (routing_thread_.joinable()) routing_thread_.join();
138  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
139 }
140 
141 void artdaq::DataSenderManager::setupTableListener_()
142 {
143  int sts;
144  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
145  if (table_socket_ < 0)
146  {
147  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
148  exit(1);
149  }
150 
151  struct sockaddr_in si_me_request;
152 
153  int yes = 1;
154  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
155  {
156  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
157  exit(1);
158  }
159  memset(&si_me_request, 0, sizeof(si_me_request));
160  si_me_request.sin_family = AF_INET;
161  si_me_request.sin_port = htons(table_port_);
162  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
163  struct in_addr in_addr_s;
164  sts = inet_aton(table_address_.c_str(), &in_addr_s);
165  if (sts == 0)
166  {
167  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
168  }
169  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
170  if (bind(table_socket_, (struct sockaddr*)&si_me_request, sizeof(si_me_request)) == -1)
171  {
172  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
173  exit(1);
174  }
175 
176  struct ip_mreq mreq;
177  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
178  if (sts == -1)
179  {
180  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
181  exit(1);
182  }
183  sts = GetInterfaceForNetwork(table_multicast_interface_.c_str(), mreq.imr_interface);
184  if (sts == -1)
185  {
186  TLOG(TLVL_ERROR) << "Unable to determine the multicast interface for table updates using " << table_multicast_interface_;
187  exit(1);
188  }
189  char addr_str[INET_ADDRSTRLEN];
190  inet_ntop(AF_INET, &(mreq.imr_interface), addr_str, INET_ADDRSTRLEN);
191  TLOG(TLVL_INFO) << "Successfully determined the multicast network interface for " << table_multicast_interface_ << ": " << addr_str << " (DataSenderManager receiving routing table updates)";
192  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
193  {
194  TLOG(TLVL_ERROR) << "Unable to join multicast group";
195  exit(1);
196  }
197 }
198 void artdaq::DataSenderManager::startTableReceiverThread_()
199 {
200  if (routing_thread_.joinable()) routing_thread_.join();
201  TLOG(TLVL_INFO) << "Starting Routing Thread";
202  try
203  {
204  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
205  }
206  catch (const boost::exception& e)
207  {
208  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
209  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
210  exit(5);
211  }
212 }
213 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
214 {
215  while (true)
216  {
217  if (should_stop_)
218  {
219  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
220  return;
221  }
222 
223  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes (interface,address,port = "
224  << table_multicast_interface_ << "," << table_address_ << "," << table_port_ << ")";
225  if (table_socket_ == -1)
226  {
227  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
228  setupTableListener_();
229  }
230  if (table_socket_ == -1)
231  {
232  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
233  return;
234  }
235  if (ack_socket_ == -1)
236  {
237  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
238  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
239  if (sts == -1)
240  {
241  TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_master_address";
242  exit(1);
243  }
244  TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
245  char addr_str[INET_ADDRSTRLEN];
246  inet_ntop(AF_INET, &(ack_addr_.sin_addr), addr_str, INET_ADDRSTRLEN);
247  TLOG(TLVL_INFO) << "Successfully determined the network interface for " << ack_address_ << ": " << addr_str << " (DataSenderManager sending table update acknowledgements)";
248  }
249 
250  struct pollfd fd;
251  fd.fd = table_socket_;
252  fd.events = POLLIN | POLLPRI;
253 
254  auto res = poll(&fd, 1, 1000);
255  if (res > 0)
256  {
257  auto first = artdaq::Fragment::InvalidSequenceID;
258  auto last = artdaq::Fragment::InvalidSequenceID;
259  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
261 
262  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
263  struct sockaddr_in from;
264  socklen_t len = sizeof(from);
265  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len);
266  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
267 
268  if (stss > static_cast<ssize_t>(sizeof(hdr)))
269  {
270  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
271  }
272  else
273  {
274  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
275  continue;
276  }
277 
278  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]);
279  if (hdr.header != ROUTING_MAGIC)
280  {
281  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
282  }
283  else
284  {
285  if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
286  {
287  TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingMasterMode than expected!";
288  exit(1);
289  }
290  routing_master_mode_ = hdr.mode;
291 
293  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
294  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
295  TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]);
296 
297  first = buffer[0].sequence_id;
298  last = buffer[buffer.size() - 1].sequence_id;
299 
300  if (first + hdr.nEntries - 1 != last)
301  {
302  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
303  continue;
304  }
305  auto thisSeqID = first;
306 
307  {
308  std::unique_lock<std::mutex> lck(routing_mutex_);
309  if (routing_table_.count(last) == 0)
310  {
311  for (auto entry : buffer)
312  {
313  if (thisSeqID != entry.sequence_id)
314  {
315  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
316  last = thisSeqID - 1;
317  break;
318  }
319  thisSeqID++;
320  if (routing_table_.count(entry.sequence_id))
321  {
322  if (routing_table_[entry.sequence_id] != entry.destination_rank)
323  {
324  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
325  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
326  << " I will use the original value!";
327  }
328  continue;
329  }
330  if (entry.sequence_id < routing_table_last_) continue;
331  routing_table_[entry.sequence_id] = entry.destination_rank;
332  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
333  << " -> Rank " << entry.destination_rank;
334  }
335  }
336 
337  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
338  if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
339 
340  auto counter = 0;
341  for (auto& entry : routing_table_)
342  {
343  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
344  counter++;
345  }
346  }
347 
349  ack.rank = my_rank;
350  ack.first_sequence_id = first;
351  ack.last_sequence_id = last;
352 
353  if (last > routing_table_last_) routing_table_last_ = last;
354 
355  if (my_rank < static_cast<int>(8 * sizeof(hdr.already_acknowledged_ranks)) && hdr.already_acknowledged_ranks.test(my_rank))
356  {
357  TLOG(TLVL_DEBUG) << __func__ << ": Skipping RoutingAckPacket since this Routing Table Update has already been acknowledged (my_rank = " << my_rank << ")";
358  }
359  else
360  {
361  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
362  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr*)&ack_addr_, sizeof(ack_addr_));
363  }
364  }
365  }
366  }
367 }
368 
370 {
371  std::unique_lock<std::mutex> lck(routing_mutex_);
372  return routing_table_.size();
373 }
374 
376 {
377  std::unique_lock<std::mutex> lck(routing_mutex_);
378  // Find the distance from the next highest sequence ID to the end of the list
379  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
380  return dist; // If dist == 1, there is one entry left.
381 }
382 
383 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
384 {
385  if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured.
386  if (!use_routing_master_ && enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case
387 
388  if (use_routing_master_)
389  {
390  auto start = std::chrono::steady_clock::now();
391  TLOG(15) << "calcDest_ use_routing_master check for routing info for seqID=" << sequence_id << " routing_timeout_ms=" << routing_timeout_ms_ << " should_stop_=" << should_stop_;
392  while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
393  {
394  {
395  std::unique_lock<std::mutex> lck(routing_mutex_);
396  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id))
397  {
398  if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
399  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
400  return routing_table_.at(sequence_id);
401  }
402  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count() + 1))
403  {
404  if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
405  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
406  return routing_table_.at(sent_frag_count_.count() + 1);
407  }
408  }
409  usleep(routing_timeout_ms_ * 10);
410  }
411  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
412  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
413  {
414  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
415  << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
416  }
417  else
418  {
419  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
420  << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
421  }
422  }
423  else
424  {
425  auto index = sequence_id % enabled_destinations_.size();
426  auto it = enabled_destinations_.begin();
427  for (; index > 0; --index)
428  {
429  ++it;
430  if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
431  }
432  return *it;
433  }
435 }
436 
437 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
438 {
439  TLOG(15) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table. Sent " << GetSentSequenceIDCount(seq) << " Fragments with this Sequence ID.";
440  std::unique_lock<std::mutex> lck(routing_mutex_);
441  // while (routing_table_.size() > routing_table_max_size_)
442  // {
443  // routing_table_.erase(routing_table_.begin());
444  // }
445  if (routing_table_.find(seq) != routing_table_.end())
446  routing_table_.erase(routing_table_.find(seq));
447 
448  if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
449  {
450  sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
451  }
452 }
453 
454 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
455 {
456  std::unique_lock<std::mutex> lck(routing_mutex_);
457  if (!sent_sequence_id_count_.count(seq)) return 0;
458  return sent_sequence_id_count_[seq];
459 }
460 
461 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
462 {
463  // Precondition: Fragment must be complete and consistent (including
464  // header information).
465  auto start_time = std::chrono::steady_clock::now();
466  if (frag.type() == Fragment::EndOfDataFragmentType)
467  {
468  throw cet::exception("LogicError")
469  << "EOD fragments should not be sent on as received: "
470  << "use sendEODFrag() instead.";
471  }
472  size_t seqID = frag.sequenceID();
473  size_t fragSize = frag.sizeBytes();
474  auto latency_s = frag.getLatency(true);
475  double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
476  TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
477  << ", seqID=" << seqID << ", fragID=" << frag.fragmentID() << ", type=" << frag.typeString();
480  if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
481  {
482  for (auto& bdest : enabled_destinations_)
483  {
484  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
485  // Gross, we have to copy.
487  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
488  while (sts == TransferInterface::CopyStatus::kTimeout && retries <= send_retry_count_)
489  {
490  if (!non_blocking_mode_)
491  {
492  sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
493  }
494  else
495  {
496  sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
497  }
498  ++retries;
499  }
500  if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
501  sent_frag_count_.incSlot(bdest);
502  }
503  }
504  else if (non_blocking_mode_)
505  {
506  auto count = routing_retry_count_;
507  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
508  {
509  dest = calcDest_(seqID);
511  {
512  count--;
513  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
514  }
515  }
516  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
517  {
518  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
520  auto lastWarnTime = std::chrono::steady_clock::now();
521  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
522  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
523  {
524  sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
525  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
526  {
527  TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
528  lastWarnTime = std::chrono::steady_clock::now();
529  }
530  ++retries;
531  }
532  if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
533  //sendFragTo(std::move(frag), dest);
534  sent_frag_count_.incSlot(dest);
535  }
536  else if (!should_stop_)
537  TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
538  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
539  }
540  else
541  {
542  auto start = std::chrono::steady_clock::now();
543  while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
544  {
545  dest = calcDest_(seqID);
547  {
548  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
549  usleep(10000);
550  }
551  }
552  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
553  {
554  TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
556 
557  sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
559  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
560  << dest << " failed! Data has been lost!";
561 
562  //sendFragTo(std::move(frag), dest);
563  sent_frag_count_.incSlot(dest);
564  outsts = sts;
565  }
566  else if (!should_stop_)
567  TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
568  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
569  }
570 
571  {
572  std::unique_lock<std::mutex> lck(routing_mutex_);
573  sent_sequence_id_count_[seqID]++;
574  }
575 
576  auto delta_t = TimeUtils::GetElapsedTime(start_time);
577 
578  if (metricMan)
579  {
580  TLOG(5) << "sendFragment: sending metrics";
581  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
582  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate);
583  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
584  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
585  metricMan->sendMetric("Fragment Latency at Send", latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
586 
587  if (use_routing_master_)
588  {
589  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
590  if (routing_wait_time_ > 0)
591  {
592  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
593  routing_wait_time_ = 0;
594  }
595  }
596  }
597  TLOG(5) << "sendFragment: Done sending fragment " << seqID << " to dest=" << dest;
598  return std::make_pair(dest, outsts);
599 } // artdaq::DataSenderManager::sendFragment
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:66
virtual ~DataSenderManager()
DataSenderManager Destructor.
std::bitset< 1024 > already_acknowledged_ranks
Bitset of ranks which have already sent valid acknowledgements and therefore do not need to send agai...
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:217
size_t GetSentSequenceIDCount(Fragment::sequence_id_t seq)
Get the number of Fragments sent with a given Sequence ID.
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
RoutingMasterMode mode
The current mode of the RoutingMaster.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.