artdaq  v3_09_04
DataSenderManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
6 
7 #include <arpa/inet.h>
8 #include <netinet/in.h>
9 #include <poll.h>
10 #include <sys/socket.h>
11 #include <sys/types.h>
12 #include <chrono>
14 #include "canvas/Utilities/Exception.h"
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : sent_frag_count_()
18  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
19  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
20  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
21  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
22  , routing_manager_mode_(detail::RoutingManagerMode::INVALID)
23  , should_stop_(false)
24  , ack_socket_(-1)
25  , table_socket_(-1)
26  , routing_table_last_(0)
27  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
28  , highest_sequence_id_routed_(0)
29 {
30  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
31 
32  // Validate parameters
33  if (send_timeout_us_ == 0)
34  {
35  send_timeout_us_ = std::numeric_limits<size_t>::max();
36  }
37 
38  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
39  use_routing_manager_ = rmConfig.get<bool>("use_routing_manager", false);
40  table_port_ = rmConfig.get<int>("table_update_port", 35556);
41  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
42  table_multicast_interface_ = rmConfig.get<std::string>("table_update_multicast_interface", "localhost");
43  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
44  ack_address_ = rmConfig.get<std::string>("routing_manager_hostname", "localhost");
45  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
46  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
47 
48  hostMap_t host_map = MakeHostMap(pset);
49  auto tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
50  auto max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
51 
52  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
53  for (auto& d : dests.get_pset_names())
54  {
55  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
56  host_map = MakeHostMap(dest_pset, host_map);
57  }
58  auto host_map_pset = MakeHostMapPset(host_map);
59  fhicl::ParameterSet dests_mod;
60  for (auto& d : dests.get_pset_names())
61  {
62  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
63  dest_pset.erase("host_map");
64  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
65 
66  if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
67  {
68  dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
69  }
70  if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
71  {
72  dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
73  }
74 
75  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
76  }
77 
78  for (auto& d : dests_mod.get_pset_names())
79  {
80  try
81  {
82  auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
83  auto destination_rank = transfer->destination_rank();
84  destinations_.emplace(destination_rank, std::move(transfer));
85  }
86  catch (const std::invalid_argument&)
87  {
88  TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
89  }
90  catch (const cet::exception& ex)
91  {
92  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
93  }
94  catch (...)
95  {
96  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
97  }
98  }
99  if (destinations_.empty())
100  {
101  TLOG(TLVL_ERROR) << "No destinations specified!";
102  }
103  else
104  {
105  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
106  if (enabled_dests.empty())
107  {
108  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
109  for (auto& d : destinations_)
110  {
111  enabled_destinations_.insert(d.first);
112  }
113  }
114  else
115  {
116  for (auto& d : enabled_dests)
117  {
118  enabled_destinations_.insert(d);
119  }
120  }
121  }
122  if (use_routing_manager_)
123  {
124  startTableReceiverThread_();
125  }
126 }
127 
// NOTE(review): the signature line of this body (listing line 128) is absent from this
// extract; the shutdown log messages suggest this is the DataSenderManager destructor — confirm.
//
// Shutdown sequence: raise should_stop_, send an End-Of-Data Fragment to every enabled
// destination that has a transfer plugin, then join the routing thread if it is joinable.
129 {
130  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
// Raised first; receiveTableUpdatesLoop_ and calcDest_ both test this flag in their loops.
131  should_stop_ = true;
132  for (auto& dest : enabled_destinations_)
133  {
// Ranks that are enabled but have no entry in destinations_ are skipped.
134  if (destinations_.count(dest) != 0u)
135  {
// The EOD Fragment is built from this destination's slot count in sent_frag_count_.
136  auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
// NOTE(review): listing line 137 is missing here; presumably it is the condition on `sts`
// that guards the error log below — confirm against the original file.
138  {
139  TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
140  }
141  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
142  }
143  }
// Anything thrown while joining the routing thread is deliberately swallowed.
144  try
145  {
146  if (routing_thread_.joinable())
147  {
148  routing_thread_.join();
149  }
150  }
151  catch (...)
152  { // IGNORED
153  }
154  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
155 }
156 
157 void artdaq::DataSenderManager::setupTableListener_()
158 {
159  int sts;
160  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
161  if (table_socket_ < 0)
162  {
163  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
164  exit(1);
165  }
166 
167  struct sockaddr_in si_me_request;
168 
169  int yes = 1;
170  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
171  {
172  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
173  exit(1);
174  }
175  memset(&si_me_request, 0, sizeof(si_me_request));
176  si_me_request.sin_family = AF_INET;
177  si_me_request.sin_port = htons(table_port_);
178  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
179  struct in_addr in_addr_s;
180  sts = inet_aton(table_address_.c_str(), &in_addr_s);
181  if (sts == 0)
182  {
183  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
184  }
185  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
186  if (bind(table_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1) // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
187  {
188  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
189  exit(1);
190  }
191 
192  struct ip_mreq mreq;
193  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
194  if (sts == -1)
195  {
196  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
197  exit(1);
198  }
199  sts = GetInterfaceForNetwork(table_multicast_interface_.c_str(), mreq.imr_interface);
200  if (sts == -1)
201  {
202  TLOG(TLVL_ERROR) << "Unable to determine the multicast interface for table updates using " << table_multicast_interface_;
203  exit(1);
204  }
205  char addr_str[INET_ADDRSTRLEN];
206  inet_ntop(AF_INET, &(mreq.imr_interface), addr_str, INET_ADDRSTRLEN);
207  TLOG(TLVL_INFO) << "Successfully determined the multicast network interface for " << table_multicast_interface_ << ": " << addr_str << " (DataSenderManager receiving routing table updates)";
208  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
209  {
210  TLOG(TLVL_ERROR) << "Unable to join multicast group";
211  exit(1);
212  }
213 }
214 void artdaq::DataSenderManager::startTableReceiverThread_()
215 {
216  if (routing_thread_.joinable())
217  {
218  routing_thread_.join();
219  }
220  TLOG(TLVL_INFO) << "Starting Routing Thread";
221  try
222  {
223  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
224  }
225  catch (const boost::exception& e)
226  {
227  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
228  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
229  exit(5);
230  }
231 }
// Body of the routing thread: until should_stop_ is set, poll the table socket for routing
// packets, merge valid entries into routing_table_ under routing_mutex_, and acknowledge
// them to the routing manager over ack_socket_.
// NOTE(review): several listing lines are missing from this extract (238-239, 283, 315,
// 376); the declarations of endOfDataAck, hdr, buffer and ack are not visible here.
232 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
233 {
234  while (true)
235  {
// On shutdown an end-of-data acknowledgement is sent and the thread function returns.
236  if (should_stop_)
237  {
238  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
// NOTE(review): `endOfDataAck` is declared on a line missing from this listing; presumably
// built with makeEndOfDataRoutingAckPacket — confirm. The sendto result is not checked.
240 
241  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with end of run markers to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
242  sendto(ack_socket_, &endOfDataAck, sizeof(artdaq::detail::RoutingAckPacket), 0, reinterpret_cast<struct sockaddr*>(&ack_addr_), sizeof(ack_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
243  return;
244  }
245 
246  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes (interface,address,port = "
247  << table_multicast_interface_ << "," << table_address_ << "," << table_port_ << ")";
// Both sockets are opened lazily on the first pass through the loop.
248  if (table_socket_ == -1)
249  {
250  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
251  setupTableListener_();
252  }
253  if (table_socket_ == -1)
254  {
255  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
256  return;
257  }
258  if (ack_socket_ == -1)
259  {
// NOTE(review): the result of socket() is not checked here before the descriptor is used.
260  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
261  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
262  if (sts == -1)
263  {
264  TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_manager_address";
265  exit(1);
266  }
267  TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
268  char addr_str[INET_ADDRSTRLEN];
269  inet_ntop(AF_INET, &(ack_addr_.sin_addr), addr_str, INET_ADDRSTRLEN);
270  TLOG(TLVL_INFO) << "Successfully determined the network interface for " << ack_address_ << ": " << addr_str << " (DataSenderManager sending table update acknowledgements)";
271  }
272 
273  struct pollfd fd;
274  fd.fd = table_socket_;
275  fd.events = POLLIN | POLLPRI;
276 
// Wait up to 1000 ms for data; a result of zero or less just falls through to the next
// iteration, which is where should_stop_ is re-checked.
277  auto res = poll(&fd, 1, 1000);
278  if (res > 0)
279  {
280  auto first = artdaq::Fragment::InvalidSequenceID;
281  auto last = artdaq::Fragment::InvalidSequenceID;
282  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
// NOTE(review): `hdr` is declared on a line missing from this listing (283).
284 
285  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
286  struct sockaddr_in from;
287  socklen_t len = sizeof(from);
288  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, reinterpret_cast<struct sockaddr*>(&from), &len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
289  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
290 
// The comparison is strict: a datagram of exactly header size (no entries) is discarded
// along with short reads and recvfrom errors.
291  if (stss > static_cast<ssize_t>(sizeof(hdr)))
292  {
293  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
294  }
295  else
296  {
297  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
298  continue;
299  }
300 
301  TLOG(TLVL_DEBUG) << "receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.nEntries << "header=" << std::hex << hdr.header;
// Packets without the magic value are logged and otherwise ignored (no acknowledgement).
302  if (hdr.header != ROUTING_MAGIC)
303  {
304  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
305  }
306  else
307  {
// The first valid packet fixes the routing mode; a later packet with another mode is fatal.
308  if (routing_manager_mode_ != detail::RoutingManagerMode::INVALID && routing_manager_mode_ != hdr.mode)
309  {
310  TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingManagerMode than expected!";
311  exit(1);
312  }
313  routing_manager_mode_ = hdr.mode;
314 
// NOTE(review): `buffer` is declared on a line missing from this listing (315); presumably
// a RoutingPacket sized from hdr.nEntries — confirm. Only the assert below ties nEntries to
// the received byte count, and assert is compiled out when NDEBUG is defined, so the
// memcpy then trusts the value from the network packet.
316  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
317  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
318 
319  first = buffer.front().sequence_id;
320  last = buffer.back().sequence_id;
321 
// The entries are expected to cover a contiguous run of sequence IDs; a packet whose
// end points disagree with nEntries is dropped without being acknowledged.
322  if (first + hdr.nEntries - 1 != last)
323  {
324  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
325  continue;
326  }
327  auto thisSeqID = first;
328 
// Scope of the routing_mutex_ lock: the table update and the table logging below.
329  {
330  std::unique_lock<std::mutex> lck(routing_mutex_);
// If the last sequence ID is already in the table, the entries are not processed again.
331  if (routing_table_.count(last) == 0)
332  {
333  for (auto entry : buffer)
334  {
// A gap inside the packet stops processing; `last` is pulled back so that the
// acknowledgement covers only the entries consumed so far.
335  if (thisSeqID != entry.sequence_id)
336  {
337  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
338  last = thisSeqID - 1;
339  break;
340  }
341  thisSeqID++;
// An entry already in the table is never overwritten; a conflicting rank is only logged.
342  if (routing_table_.count(entry.sequence_id) != 0u)
343  {
344  if (routing_table_[entry.sequence_id] != entry.destination_rank)
345  {
346  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
347  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
348  << " I will use the original value!";
349  }
350  continue;
351  }
// Sequence IDs below the highest one acknowledged so far are not inserted.
352  if (entry.sequence_id < routing_table_last_)
353  {
354  continue;
355  }
356  routing_table_[entry.sequence_id] = entry.destination_rank;
357  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
358  << " -> Rank " << entry.destination_rank;
359  }
360  }
361 
362  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
363  if (!routing_table_.empty())
364  {
365  TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
366  }
367 
368  auto counter = 0;
369  for (auto& entry : routing_table_)
370  {
371  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
372  counter++;
373  }
374  }
375 
// NOTE(review): `ack` is declared on a line missing from this listing (376).
377  ack.rank = my_rank;
378  ack.first_sequence_id = first;
379  ack.last_sequence_id = last;
380 
381  if (last > routing_table_last_)
382  {
383  routing_table_last_ = last;
384  }
385 
// The acknowledgement is skipped when the packet header already marks this rank as
// acknowledged; ranks beyond the width checked here always send one.
386  if (my_rank < static_cast<int>(8 * sizeof(hdr.already_acknowledged_ranks)) && hdr.already_acknowledged_ranks.test(my_rank))
387  {
388  TLOG(TLVL_DEBUG) << __func__ << ": Skipping RoutingAckPacket since this Routing Table Update has already been acknowledged (my_rank = " << my_rank << ")";
389  }
390  else
391  {
392  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
393  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, reinterpret_cast<struct sockaddr*>(&ack_addr_), sizeof(ack_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
394  }
395  }
396  }
397  }
398 }
399 
401 {
402  std::unique_lock<std::mutex> lck(routing_mutex_);
403  return routing_table_.size();
404 }
405 
407 {
408  std::unique_lock<std::mutex> lck(routing_mutex_);
409  // Find the distance from the next highest sequence ID to the end of the list
410  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
411  return dist; // If dist == 1, there is one entry left.
412 }
413 
// Choose the destination rank for a Fragment with the given sequence ID.
// Returns TransferInterface::RECV_TIMEOUT when no destination is enabled. Without a
// routing manager the choice is the only enabled destination, or a round-robin pick over
// the enabled destinations; with a routing manager the routing table is consulted until
// an entry is found, should_stop_ is set, or the routing timeout expires.
// NOTE(review): listing line 480 (just before the closing brace) is missing from this
// extract; presumably it is the return taken after the routing wait gives up — confirm.
414 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
415 {
416  if (enabled_destinations_.empty())
417  {
418  return TransferInterface::RECV_TIMEOUT; // No destinations configured.
419  }
420  if (!use_routing_manager_ && enabled_destinations_.size() == 1)
421  {
422  return *enabled_destinations_.begin(); // Trivial case
423  }
424 
425  if (use_routing_manager_)
426  {
427  auto start = std::chrono::steady_clock::now();
428  TLOG(15) << "calcDest_ use_routing_manager check for routing info for seqID=" << sequence_id << " routing_timeout_ms=" << routing_timeout_ms_ << " should_stop_=" << should_stop_;
// A routing_timeout_ms_ of zero or less disables the time limit: the loop then ends only
// through a table hit or should_stop_.
429  while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
430  {
// The lock is scoped to the table lookup so that it is released before sleeping.
431  {
432  std::unique_lock<std::mutex> lck(routing_mutex_);
// RouteBySequenceID: the table is keyed by the Fragment's own sequence ID.
433  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySequenceID && (routing_table_.count(sequence_id) != 0u))
434  {
435  if (sequence_id > highest_sequence_id_routed_)
436  {
437  highest_sequence_id_routed_ = sequence_id;
438  }
439  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
440  return routing_table_.at(sequence_id);
441  }
// RouteBySendCount: the table is keyed by the number of the next send (count so far + 1).
442  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySendCount && (routing_table_.count(sent_frag_count_.count() + 1) != 0u))
443  {
444  if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_)
445  {
446  highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
447  }
448  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
449  return routing_table_.at(sent_frag_count_.count() + 1);
450  }
451  }
// Sleeps routing_timeout_ms_ * 10 microseconds, i.e. one hundredth of the timeout.
// NOTE(review): when routing_timeout_ms_ is zero or negative this argument is zero or
// negative as well — confirm that the resulting behaviour is intended.
452  usleep(routing_timeout_ms_ * 10);
453  }
454  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
455  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySequenceID)
456  {
457  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
458  << " and the Routing Manager did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
459  }
460  else
461  {
462  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
463  << " and the Routing Manager did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
464  }
465  }
466  else
467  {
// Round-robin: step `sequence_id % size` positions into the enabled set. Because index is
// always smaller than the set size, the wrap-around check inside the loop cannot trigger.
468  auto index = sequence_id % enabled_destinations_.size();
469  auto it = enabled_destinations_.begin();
470  for (; index > 0; --index)
471  {
472  ++it;
473  if (it == enabled_destinations_.end())
474  {
475  it = enabled_destinations_.begin();
476  }
477  }
478  return *it;
479  }
481 }
482 
483 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
484 {
485  TLOG(15) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table. Sent " << GetSentSequenceIDCount(seq) << " Fragments with this Sequence ID.";
486  std::unique_lock<std::mutex> lck(routing_mutex_);
487  // while (routing_table_.size() > routing_table_max_size_)
488  // {
489  // routing_table_.erase(routing_table_.begin());
490  // }
491  if (routing_table_.find(seq) != routing_table_.end())
492  {
493  routing_table_.erase(routing_table_.find(seq));
494  }
495 
496  if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
497  {
498  sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
499  }
500 }
501 
502 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
503 {
504  std::unique_lock<std::mutex> lck(routing_mutex_);
505  if (sent_sequence_id_count_.count(seq) == 0u)
506  {
507  return 0;
508  }
509  return sent_sequence_id_count_[seq];
510 }
511 
// Send one Fragment and return the pair (destination rank, copy status).
// Throws cet::exception("LogicError") when given an EndOfData Fragment. EndOfRun,
// EndOfSubrun and Init Fragments, and every Fragment when broadcast_sends_ is set, go to
// all enabled destinations; otherwise calcDest_ picks a single destination, and the send is
// done in non-blocking or reliable mode according to non_blocking_mode_.
// NOTE(review): many listing lines are missing from this extract (531-532, 539, 553, 566,
// 575, 588, 607, 616, 619). The declarations and initial values of `dest`, `outsts` and
// `sts`, and several `if` conditions, are therefore not visible here.
512 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
513 {
514  // Precondition: Fragment must be complete and consistent (including
515  // header information).
516  auto start_time = std::chrono::steady_clock::now();
517  if (frag.type() == Fragment::EndOfDataFragmentType)
518  {
519  throw cet::exception("LogicError") // NOLINT(cert-err60-cpp)
520  << "EOD fragments should not be sent on as received: "
521  << "use sendEODFrag() instead.";
522  }
// Everything needed for logging and metrics is read up front, because the reliable-mode
// path below moves from `frag`.
523  size_t seqID = frag.sequenceID();
524  size_t fragSize = frag.sizeBytes();
525  auto latency_s = frag.getLatency(true);
526  auto isSystemBroadcast = frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType;
527 
528  double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
529  TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << static_cast<void*>(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
530  << ", seqID=" << seqID << ", fragID=" << frag.fragmentID() << ", type=" << frag.typeString();
// NOTE(review): listing lines 531-532 are missing here; presumably they declare `dest` and
// `outsts` with their initial values — confirm.
// --- Broadcast path: every enabled destination receives the Fragment. ---
533  if (broadcast_sends_ || isSystemBroadcast)
534  {
535  for (auto& bdest : enabled_destinations_)
536  {
537  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
538  // Gross, we have to copy.
// NOTE(review): listing line 539 is missing; presumably it declares `sts` with the initial
// value that lets the retry loop below run at least once — confirm.
540  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
541  while (sts == TransferInterface::CopyStatus::kTimeout && retries <= send_retry_count_)
542  {
543  if (!non_blocking_mode_)
544  {
// Reliable mode takes its argument by move, so a copy is handed over and `frag` stays
// usable for the remaining destinations.
545  sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
546  }
547  else
548  {
549  sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
550  }
551  ++retries;
552  }
// NOTE(review): listing line 553 is missing; presumably it is the condition that decides
// whether this destination's status is recorded in `outsts` — confirm.
554  {
555  outsts = sts;
556  }
557  sent_frag_count_.incSlot(bdest);
558  }
559  }
// --- Non-blocking single-destination path. ---
560  else if (non_blocking_mode_)
561  {
// The destination lookup is attempted at most routing_retry_count_ times.
562  auto count = routing_retry_count_;
563  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
564  {
565  dest = calcDest_(seqID);
// NOTE(review): listing line 566 is missing; presumably it tests whether calcDest_ timed
// out — confirm.
567  {
568  count--;
569  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
570  }
571  }
572  if (dest != TransferInterface::RECV_TIMEOUT && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
573  {
574  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
// NOTE(review): listing line 575 is missing; presumably it declares `sts` — confirm.
576  auto lastWarnTime = std::chrono::steady_clock::now();
577  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
578  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
579  {
580  sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
// The retry warning is limited to one message per elapsed second.
581  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
582  {
583  TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
584  lastWarnTime = std::chrono::steady_clock::now();
585  }
586  ++retries;
587  }
// NOTE(review): listing line 588 is missing; presumably it is the condition guarding the
// assignment to `outsts` — confirm.
589  {
590  outsts = sts;
591  }
592  //sendFragTo(std::move(frag), dest);
593  sent_frag_count_.incSlot(dest);
594  }
595  else if (!should_stop_)
596  {
597  TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
598  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
599  }
600  }
// --- Reliable single-destination path. ---
601  else
602  {
603  auto start = std::chrono::steady_clock::now();
// The lookup is repeated without a retry limit; only should_stop_ ends the wait.
604  while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
605  {
606  dest = calcDest_(seqID);
// NOTE(review): listing line 607 is missing; presumably it tests whether calcDest_ timed
// out — confirm.
608  {
609  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
610  usleep(10000);
611  }
612  }
613  if (dest != TransferInterface::RECV_TIMEOUT && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
614  {
615  TLOG(TLVL_DEBUG + 2) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
// NOTE(review): listing line 616 is missing; presumably it declares `sts` — confirm.
617 
// `frag` is moved from here; only the values captured at the top are used afterwards.
618  sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
// NOTE(review): listing line 619 is missing; presumably it is the failure check on `sts`
// guarding the error log below — confirm.
620  {
621  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
622  << dest << " failed! Data has been lost!";
623  }
624 
625  //sendFragTo(std::move(frag), dest);
626  sent_frag_count_.incSlot(dest);
627  outsts = sts;
628  }
629  else if (!should_stop_)
630  {
631  TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
632  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
633  }
634  }
635 
// The per-sequence-ID send count is incremented for every Fragment that is not a system
// broadcast, independent of the send outcome determined above.
636  if (!isSystemBroadcast)
637  {
638  std::unique_lock<std::mutex> lck(routing_mutex_);
639  sent_sequence_id_count_[seqID]++;
640  }
641 
642  auto delta_t = TimeUtils::GetElapsedTime(start_time);
643 
644  if (metricMan)
645  {
646  TLOG(TLVL_DEBUG + 2) << "sendFragment: sending metrics";
// NOTE(review): in the broadcast path no visible line assigns `dest`, so these per-rank
// metric names would carry its initial value — confirm against the missing declaration.
647  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
648  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate);
649  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
650  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
651  metricMan->sendMetric("Fragment Latency at Send", latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
652 
653  if (use_routing_manager_)
654  {
655  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
// The accumulated routing wait (microseconds) is reported in seconds and then reset.
656  if (routing_wait_time_ > 0)
657  {
658  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
659  routing_wait_time_ = 0;
660  }
661  }
662  }
663  TLOG(TLVL_DEBUG + 2) << "sendFragment: Done sending fragment " << seqID << " to dest=" << dest;
664  return std::make_pair(dest, outsts);
665 } // artdaq::DataSenderManager::sendFragment
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
Events should be routed by sequence ID (BR -> EB)
virtual ~DataSenderManager()
DataSenderManager Destructor.
std::bitset< 1024 > already_acknowledged_ranks
Bitset of ranks which have already sent valid acknowledgements and therefore do not need to send agai...
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:223
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
size_t GetSentSequenceIDCount(Fragment::sequence_id_t seq)
Get the number of Fragments sent with a given Sequence ID.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Events should be routed by send count (EB -> Agg)
static RoutingAckPacket makeEndOfDataRoutingAckPacket(int rank)
Create an EndOfData RoutingAckPacket.
Some error occurred, but no exception was thrown.
RoutingManagerMode mode
The current mode of the RoutingManager.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.