artdaq  v3_09_00
DataSenderManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
6 
7 #include <arpa/inet.h>
8 #include <netinet/in.h>
9 #include <poll.h>
10 #include <sys/socket.h>
11 #include <sys/types.h>
12 #include <chrono>
14 #include "canvas/Utilities/Exception.h"
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : sent_frag_count_()
18  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
19  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
20  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
21  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
22  , routing_manager_mode_(detail::RoutingManagerMode::INVALID)
23  , should_stop_(false)
24  , ack_socket_(-1)
25  , table_socket_(-1)
26  , routing_table_last_(0)
27  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
28  , highest_sequence_id_routed_(0)
29 {
30  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
31 
32  // Validate parameters
33  if (send_timeout_us_ == 0)
34  {
35  send_timeout_us_ = std::numeric_limits<size_t>::max();
36  }
37 
38  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
39  use_routing_manager_ = rmConfig.get<bool>("use_routing_manager", false);
40  table_port_ = rmConfig.get<int>("table_update_port", 35556);
41  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
42  table_multicast_interface_ = rmConfig.get<std::string>("table_update_multicast_interface", "localhost");
43  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
44  ack_address_ = rmConfig.get<std::string>("routing_manager_hostname", "localhost");
45  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
46  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
47 
48  hostMap_t host_map = MakeHostMap(pset);
49  auto tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
50  auto max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
51 
52  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
53  for (auto& d : dests.get_pset_names())
54  {
55  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
56  host_map = MakeHostMap(dest_pset, host_map);
57  }
58  auto host_map_pset = MakeHostMapPset(host_map);
59  fhicl::ParameterSet dests_mod;
60  for (auto& d : dests.get_pset_names())
61  {
62  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
63  dest_pset.erase("host_map");
64  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
65 
66  if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
67  {
68  dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
69  }
70  if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
71  {
72  dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
73  }
74 
75  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
76  }
77 
78  for (auto& d : dests_mod.get_pset_names())
79  {
80  try
81  {
82  auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
83  auto destination_rank = transfer->destination_rank();
84  destinations_.emplace(destination_rank, std::move(transfer));
85  }
86  catch (const std::invalid_argument&)
87  {
88  TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
89  }
90  catch (const cet::exception& ex)
91  {
92  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
93  }
94  catch (...)
95  {
96  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
97  }
98  }
99  if (destinations_.empty())
100  {
101  TLOG(TLVL_ERROR) << "No destinations specified!";
102  }
103  else
104  {
105  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
106  if (enabled_dests.empty())
107  {
108  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
109  for (auto& d : destinations_)
110  {
111  enabled_destinations_.insert(d.first);
112  }
113  }
114  else
115  {
116  for (auto& d : enabled_dests)
117  {
118  enabled_destinations_.insert(d);
119  }
120  }
121  }
122  if (use_routing_manager_)
123  {
124  startTableReceiverThread_();
125  }
126 }
127 
129 {
130  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
131  should_stop_ = true;
132  for (auto& dest : enabled_destinations_)
133  {
134  if (destinations_.count(dest) != 0u)
135  {
136  auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
138  {
139  TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
140  }
141  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
142  }
143  }
144  try
145  {
146  if (routing_thread_.joinable())
147  {
148  routing_thread_.join();
149  }
150  }
151  catch (...)
152  { // IGNORED
153  }
154  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
155 }
156 
157 void artdaq::DataSenderManager::setupTableListener_()
158 {
159  int sts;
160  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
161  if (table_socket_ < 0)
162  {
163  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
164  exit(1);
165  }
166 
167  struct sockaddr_in si_me_request;
168 
169  int yes = 1;
170  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
171  {
172  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
173  exit(1);
174  }
175  memset(&si_me_request, 0, sizeof(si_me_request));
176  si_me_request.sin_family = AF_INET;
177  si_me_request.sin_port = htons(table_port_);
178  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
179  struct in_addr in_addr_s;
180  sts = inet_aton(table_address_.c_str(), &in_addr_s);
181  if (sts == 0)
182  {
183  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
184  }
185  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
186  if (bind(table_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1) // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
187  {
188  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
189  exit(1);
190  }
191 
192  struct ip_mreq mreq;
193  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
194  if (sts == -1)
195  {
196  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
197  exit(1);
198  }
199  sts = GetInterfaceForNetwork(table_multicast_interface_.c_str(), mreq.imr_interface);
200  if (sts == -1)
201  {
202  TLOG(TLVL_ERROR) << "Unable to determine the multicast interface for table updates using " << table_multicast_interface_;
203  exit(1);
204  }
205  char addr_str[INET_ADDRSTRLEN];
206  inet_ntop(AF_INET, &(mreq.imr_interface), addr_str, INET_ADDRSTRLEN);
207  TLOG(TLVL_INFO) << "Successfully determined the multicast network interface for " << table_multicast_interface_ << ": " << addr_str << " (DataSenderManager receiving routing table updates)";
208  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
209  {
210  TLOG(TLVL_ERROR) << "Unable to join multicast group";
211  exit(1);
212  }
213 }
214 void artdaq::DataSenderManager::startTableReceiverThread_()
215 {
216  if (routing_thread_.joinable())
217  {
218  routing_thread_.join();
219  }
220  TLOG(TLVL_INFO) << "Starting Routing Thread";
221  try
222  {
223  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
224  }
225  catch (const boost::exception& e)
226  {
227  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
228  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
229  exit(5);
230  }
231 }
232 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
233 {
234  while (true)
235  {
236  if (should_stop_)
237  {
238  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
239  artdaq::detail::RoutingAckPacket endOfDataAck = detail::RoutingAckPacket::makeEndOfDataRoutingAckPacket(my_rank);
240 
241  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with end of run markers to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
242  sendto(ack_socket_, &endOfDataAck, sizeof(artdaq::detail::RoutingAckPacket), 0, reinterpret_cast<struct sockaddr*>(&ack_addr_), sizeof(ack_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
243  return;
244  }
245 
246  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes (interface,address,port = "
247  << table_multicast_interface_ << "," << table_address_ << "," << table_port_ << ")";
248  if (table_socket_ == -1)
249  {
250  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
251  setupTableListener_();
252  }
253  if (table_socket_ == -1)
254  {
255  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
256  return;
257  }
258  if (ack_socket_ == -1)
259  {
260  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
261  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
262  if (sts == -1)
263  {
264  TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_manager_address";
265  exit(1);
266  }
267  TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
268  char addr_str[INET_ADDRSTRLEN];
269  inet_ntop(AF_INET, &(ack_addr_.sin_addr), addr_str, INET_ADDRSTRLEN);
270  TLOG(TLVL_INFO) << "Successfully determined the network interface for " << ack_address_ << ": " << addr_str << " (DataSenderManager sending table update acknowledgements)";
271  }
272 
273  struct pollfd fd;
274  fd.fd = table_socket_;
275  fd.events = POLLIN | POLLPRI;
276 
277  auto res = poll(&fd, 1, 1000);
278  if (res > 0)
279  {
280  auto first = artdaq::Fragment::InvalidSequenceID;
281  auto last = artdaq::Fragment::InvalidSequenceID;
282  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
284 
285  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
286  struct sockaddr_in from;
287  socklen_t len = sizeof(from);
288  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, reinterpret_cast<struct sockaddr*>(&from), &len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
289  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
290 
291  if (stss > static_cast<ssize_t>(sizeof(hdr)))
292  {
293  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
294  }
295  else
296  {
297  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
298  continue;
299  }
300 
301  TLOG(TLVL_DEBUG) << "receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.nEntries << "header=" << std::hex << hdr.header;
302  if (hdr.header != ROUTING_MAGIC)
303  {
304  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
305  }
306  else
307  {
308  if (routing_manager_mode_ != detail::RoutingManagerMode::INVALID && routing_manager_mode_ != hdr.mode)
309  {
310  TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingManagerMode than expected!";
311  exit(1);
312  }
313  routing_manager_mode_ = hdr.mode;
314 
316  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
317  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
318 
319  first = buffer.front().sequence_id;
320  last = buffer.back().sequence_id;
321 
322  if (first + hdr.nEntries - 1 != last)
323  {
324  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
325  continue;
326  }
327  auto thisSeqID = first;
328 
329  {
330  std::unique_lock<std::mutex> lck(routing_mutex_);
331  if (routing_table_.count(last) == 0)
332  {
333  for (auto entry : buffer)
334  {
335  if (thisSeqID != entry.sequence_id)
336  {
337  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
338  last = thisSeqID - 1;
339  break;
340  }
341  thisSeqID++;
342  if (routing_table_.count(entry.sequence_id) != 0u)
343  {
344  if (routing_table_[entry.sequence_id] != entry.destination_rank)
345  {
346  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
347  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
348  << " I will use the original value!";
349  }
350  continue;
351  }
352  if (entry.sequence_id < routing_table_last_)
353  {
354  continue;
355  }
356  routing_table_[entry.sequence_id] = entry.destination_rank;
357  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
358  << " -> Rank " << entry.destination_rank;
359  }
360  }
361 
362  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
363  if (!routing_table_.empty())
364  {
365  TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
366  }
367 
368  auto counter = 0;
369  for (auto& entry : routing_table_)
370  {
371  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
372  counter++;
373  }
374  }
375 
377  ack.rank = my_rank;
378  ack.first_sequence_id = first;
379  ack.last_sequence_id = last;
380 
381  if (last > routing_table_last_)
382  {
383  routing_table_last_ = last;
384  }
385 
386  if (my_rank < static_cast<int>(8 * sizeof(hdr.already_acknowledged_ranks)) && hdr.already_acknowledged_ranks.test(my_rank))
387  {
388  TLOG(TLVL_DEBUG) << __func__ << ": Skipping RoutingAckPacket since this Routing Table Update has already been acknowledged (my_rank = " << my_rank << ")";
389  }
390  else
391  {
392  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
393  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, reinterpret_cast<struct sockaddr*>(&ack_addr_), sizeof(ack_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
394  }
395  }
396  }
397  }
398 }
399 
401 {
402  std::unique_lock<std::mutex> lck(routing_mutex_);
403  return routing_table_.size();
404 }
405 
407 {
408  std::unique_lock<std::mutex> lck(routing_mutex_);
409  // Find the distance from the next highest sequence ID to the end of the list
410  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
411  return dist; // If dist == 1, there is one entry left.
412 }
413 
414 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
415 {
416  if (enabled_destinations_.empty())
417  {
418  return TransferInterface::RECV_TIMEOUT; // No destinations configured.
419  }
420  if (!use_routing_manager_ && enabled_destinations_.size() == 1)
421  {
422  return *enabled_destinations_.begin(); // Trivial case
423  }
424 
425  if (use_routing_manager_)
426  {
427  auto start = std::chrono::steady_clock::now();
428  TLOG(15) << "calcDest_ use_routing_manager check for routing info for seqID=" << sequence_id << " routing_timeout_ms=" << routing_timeout_ms_ << " should_stop_=" << should_stop_;
429  while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
430  {
431  {
432  std::unique_lock<std::mutex> lck(routing_mutex_);
433  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySequenceID && (routing_table_.count(sequence_id) != 0u))
434  {
435  if (sequence_id > highest_sequence_id_routed_)
436  {
437  highest_sequence_id_routed_ = sequence_id;
438  }
439  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
440  return routing_table_.at(sequence_id);
441  }
442  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySendCount && (routing_table_.count(sent_frag_count_.count() + 1) != 0u))
443  {
444  if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_)
445  {
446  highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
447  }
448  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
449  return routing_table_.at(sent_frag_count_.count() + 1);
450  }
451  }
452  usleep(routing_timeout_ms_ * 10);
453  }
454  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
455  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySequenceID)
456  {
457  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
458  << " and the Routing Manager did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
459  }
460  else
461  {
462  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
463  << " and the Routing Manager did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
464  }
465  }
466  else
467  {
468  auto index = sequence_id % enabled_destinations_.size();
469  auto it = enabled_destinations_.begin();
470  for (; index > 0; --index)
471  {
472  ++it;
473  if (it == enabled_destinations_.end())
474  {
475  it = enabled_destinations_.begin();
476  }
477  }
478  return *it;
479  }
481 }
482 
483 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
484 {
485  TLOG(15) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table. Sent " << GetSentSequenceIDCount(seq) << " Fragments with this Sequence ID.";
486  std::unique_lock<std::mutex> lck(routing_mutex_);
487  // while (routing_table_.size() > routing_table_max_size_)
488  // {
489  // routing_table_.erase(routing_table_.begin());
490  // }
491  if (routing_table_.find(seq) != routing_table_.end())
492  {
493  routing_table_.erase(routing_table_.find(seq));
494  }
495 
496  if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
497  {
498  sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
499  }
500 }
501 
502 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
503 {
504  std::unique_lock<std::mutex> lck(routing_mutex_);
505  if (sent_sequence_id_count_.count(seq) == 0u)
506  {
507  return 0;
508  }
509  return sent_sequence_id_count_[seq];
510 }
511 
512 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
513 {
514  // Precondition: Fragment must be complete and consistent (including
515  // header information).
516  auto start_time = std::chrono::steady_clock::now();
517  if (frag.type() == Fragment::EndOfDataFragmentType)
518  {
519  throw cet::exception("LogicError") // NOLINT(cert-err60-cpp)
520  << "EOD fragments should not be sent on as received: "
521  << "use sendEODFrag() instead.";
522  }
523  size_t seqID = frag.sequenceID();
524  size_t fragSize = frag.sizeBytes();
525  auto latency_s = frag.getLatency(true);
526  auto isSystemBroadcast = frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType;
527 
528  double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
529  TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << static_cast<void*>(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
530  << ", seqID=" << seqID << ", fragID=" << frag.fragmentID() << ", type=" << frag.typeString();
533  if (broadcast_sends_ || isSystemBroadcast)
534  {
535  for (auto& bdest : enabled_destinations_)
536  {
537  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
538  // Gross, we have to copy.
540  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
541  while (sts == TransferInterface::CopyStatus::kTimeout && retries <= send_retry_count_)
542  {
543  if (!non_blocking_mode_)
544  {
545  sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
546  }
547  else
548  {
549  sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
550  }
551  ++retries;
552  }
554  {
555  outsts = sts;
556  }
557  sent_frag_count_.incSlot(bdest);
558  }
559  }
560  else if (non_blocking_mode_)
561  {
562  auto count = routing_retry_count_;
563  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
564  {
565  dest = calcDest_(seqID);
567  {
568  count--;
569  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
570  }
571  }
572  if (dest != TransferInterface::RECV_TIMEOUT && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
573  {
574  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
576  auto lastWarnTime = std::chrono::steady_clock::now();
577  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
578  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
579  {
580  sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
581  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
582  {
583  TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
584  lastWarnTime = std::chrono::steady_clock::now();
585  }
586  ++retries;
587  }
589  {
590  outsts = sts;
591  }
592  //sendFragTo(std::move(frag), dest);
593  sent_frag_count_.incSlot(dest);
594  }
595  else if (!should_stop_)
596  {
597  TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
598  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
599  }
600  }
601  else
602  {
603  auto start = std::chrono::steady_clock::now();
604  while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
605  {
606  dest = calcDest_(seqID);
608  {
609  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
610  usleep(10000);
611  }
612  }
613  if (dest != TransferInterface::RECV_TIMEOUT && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
614  {
615  TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
617 
618  sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
620  {
621  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
622  << dest << " failed! Data has been lost!";
623  }
624 
625  //sendFragTo(std::move(frag), dest);
626  sent_frag_count_.incSlot(dest);
627  outsts = sts;
628  }
629  else if (!should_stop_)
630  {
631  TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
632  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
633  }
634  }
635 
636  if (!isSystemBroadcast)
637  {
638  std::unique_lock<std::mutex> lck(routing_mutex_);
639  sent_sequence_id_count_[seqID]++;
640  }
641 
642  auto delta_t = TimeUtils::GetElapsedTime(start_time);
643 
644  if (metricMan)
645  {
646  TLOG(5) << "sendFragment: sending metrics";
647  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
648  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate);
649  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
650  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
651  metricMan->sendMetric("Fragment Latency at Send", latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
652 
653  if (use_routing_manager_)
654  {
655  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
656  if (routing_wait_time_ > 0)
657  {
658  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
659  routing_wait_time_ = 0;
660  }
661  }
662  }
663  TLOG(5) << "sendFragment: Done sending fragment " << seqID << " to dest=" << dest;
664  return std::make_pair(dest, outsts);
665 } // artdaq::DataSenderManager::sendFragment
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
Events should be routed by sequence ID (BR -> EB)
virtual ~DataSenderManager()
DataSenderManager Destructor.
std::bitset< 1024 > already_acknowledged_ranks
Bitset of ranks which have already sent valid acknowledgements and therefore do not need to send agai...
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:223
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
size_t GetSentSequenceIDCount(Fragment::sequence_id_t seq)
Get the number of Fragments sent with a given Sequence ID.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Events should be routed by send count (EB -> Agg)
Some error occurred, but no exception was thrown.
RoutingManagerMode mode
The current mode of the RoutingManager.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.