artdaq  v3_10_01
DataSenderManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
6 
7 #include <arpa/inet.h>
8 #include <netinet/in.h>
9 #include <poll.h>
10 #include <sys/socket.h>
11 #include <sys/types.h>
12 #include <chrono>
14 #include "canvas/Utilities/Exception.h"
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : sent_frag_count_()
18  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
19  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
20  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
21  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
22  , routing_manager_mode_(detail::RoutingManagerMode::INVALID)
23  , should_stop_(false)
24  , ack_socket_(-1)
25  , table_socket_(-1)
26  , routing_table_last_(0)
27  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
28  , highest_sequence_id_routed_(0)
29 {
30  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
31 
32  // Validate parameters
33  if (send_timeout_us_ == 0)
34  {
35  send_timeout_us_ = std::numeric_limits<size_t>::max();
36  }
37 
38  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
39  use_routing_manager_ = rmConfig.get<bool>("use_routing_manager", false);
40  table_port_ = rmConfig.get<int>("table_update_port", 35556);
41  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
42  table_multicast_interface_ = rmConfig.get<std::string>("table_update_multicast_interface", "localhost");
43  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
44  ack_address_ = rmConfig.get<std::string>("routing_manager_hostname", "localhost");
45  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
46  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
47 
48  hostMap_t host_map = MakeHostMap(pset);
49  auto tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
50  auto max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
51 
52  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
53  for (auto& d : dests.get_pset_names())
54  {
55  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
56  host_map = MakeHostMap(dest_pset, host_map);
57  }
58  auto host_map_pset = MakeHostMapPset(host_map);
59  fhicl::ParameterSet dests_mod;
60  for (auto& d : dests.get_pset_names())
61  {
62  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
63  dest_pset.erase("host_map");
64  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
65 
66  if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
67  {
68  dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
69  }
70  if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
71  {
72  dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
73  }
74 
75  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
76  }
77 
78  for (auto& d : dests_mod.get_pset_names())
79  {
80  try
81  {
82  auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
83  auto destination_rank = transfer->destination_rank();
84  destinations_.emplace(destination_rank, std::move(transfer));
85  }
86  catch (const std::invalid_argument&)
87  {
88  TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
89  }
90  catch (const cet::exception& ex)
91  {
92  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
93  }
94  catch (...)
95  {
96  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
97  }
98  }
99  if (destinations_.empty())
100  {
101  TLOG(TLVL_ERROR) << "No destinations specified!";
102  }
103  else
104  {
105  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
106  if (enabled_dests.empty())
107  {
108  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
109  for (auto& d : destinations_)
110  {
111  enabled_destinations_.insert(d.first);
112  }
113  }
114  else
115  {
116  for (auto& d : enabled_dests)
117  {
118  enabled_destinations_.insert(d);
119  }
120  }
121  }
122  if (use_routing_manager_)
123  {
124  startTableReceiverThread_();
125  }
126 }
127 
129 {
130  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
131  should_stop_ = true;
132  for (auto& dest : enabled_destinations_)
133  {
134  if (destinations_.count(dest) != 0u)
135  {
136  auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
138  {
139  TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
140  }
141  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
142  }
143  }
144  try
145  {
146  if (routing_thread_.joinable())
147  {
148  routing_thread_.join();
149  }
150  }
151  catch (...)
152  { // IGNORED
153  }
154  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
155 }
156 
157 void artdaq::DataSenderManager::setupTableListener_()
158 {
159  int sts;
160  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
161  if (table_socket_ < 0)
162  {
163  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
164  exit(1);
165  }
166 
167  struct sockaddr_in si_me_request;
168 
169  int yes = 1;
170  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
171  {
172  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
173  exit(1);
174  }
175  memset(&si_me_request, 0, sizeof(si_me_request));
176  si_me_request.sin_family = AF_INET;
177  si_me_request.sin_port = htons(table_port_);
178  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
179  struct in_addr in_addr_s;
180  sts = inet_aton(table_address_.c_str(), &in_addr_s);
181  if (sts == 0)
182  {
183  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
184  }
185  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
186  if (bind(table_socket_, reinterpret_cast<struct sockaddr*>(&si_me_request), sizeof(si_me_request)) == -1) // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
187  {
188  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
189  exit(1);
190  }
191 
192  struct ip_mreq mreq;
193  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
194  if (sts == -1)
195  {
196  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
197  exit(1);
198  }
199  sts = GetInterfaceForNetwork(table_multicast_interface_.c_str(), mreq.imr_interface);
200  if (sts == -1)
201  {
202  TLOG(TLVL_ERROR) << "Unable to determine the multicast interface for table updates using " << table_multicast_interface_;
203  exit(1);
204  }
205  char addr_str[INET_ADDRSTRLEN];
206  inet_ntop(AF_INET, &(mreq.imr_interface), addr_str, INET_ADDRSTRLEN);
207  TLOG(TLVL_INFO) << "Successfully determined the multicast network interface for " << table_multicast_interface_ << ": " << addr_str << " (DataSenderManager receiving routing table updates)";
208  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
209  {
210  TLOG(TLVL_ERROR) << "Unable to join multicast group";
211  exit(1);
212  }
213 }
214 void artdaq::DataSenderManager::startTableReceiverThread_()
215 {
216  if (routing_thread_.joinable())
217  {
218  routing_thread_.join();
219  }
220  TLOG(TLVL_INFO) << "Starting Routing Thread";
221  try
222  {
223  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
224  char tname[16]; // Size 16 - see man page pthread_setname_np(3) and/or prctl(2)
225  snprintf(tname, sizeof(tname) - 1, "%d-RouteRECV", my_rank); // NOLINT
226  tname[sizeof(tname) - 1] = '\0'; // assure term. snprintf is not too evil :)
227  auto handle = routing_thread_.native_handle();
228  pthread_setname_np(handle, tname);
229  }
230  catch (const boost::exception& e)
231  {
232  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
233  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
234  exit(5);
235  }
236 }
237 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
238 {
239  while (true)
240  {
241  if (should_stop_)
242  {
243  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
245 
246  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with end of run markers to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
247  sendto(ack_socket_, &endOfDataAck, sizeof(artdaq::detail::RoutingAckPacket), 0, reinterpret_cast<struct sockaddr*>(&ack_addr_), sizeof(ack_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
248  return;
249  }
250 
251  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes (interface,address,port = "
252  << table_multicast_interface_ << "," << table_address_ << "," << table_port_ << ")";
253  if (table_socket_ == -1)
254  {
255  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
256  setupTableListener_();
257  }
258  if (table_socket_ == -1)
259  {
260  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
261  return;
262  }
263  if (ack_socket_ == -1)
264  {
265  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
266  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
267  if (sts == -1)
268  {
269  TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_manager_address";
270  exit(1);
271  }
272  TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
273  char addr_str[INET_ADDRSTRLEN];
274  inet_ntop(AF_INET, &(ack_addr_.sin_addr), addr_str, INET_ADDRSTRLEN);
275  TLOG(TLVL_INFO) << "Successfully determined the network interface for " << ack_address_ << ": " << addr_str << " (DataSenderManager sending table update acknowledgements)";
276  }
277 
278  struct pollfd fd;
279  fd.fd = table_socket_;
280  fd.events = POLLIN | POLLPRI;
281 
282  auto res = poll(&fd, 1, 1000);
283  if (res > 0)
284  {
285  auto first = artdaq::Fragment::InvalidSequenceID;
286  auto last = artdaq::Fragment::InvalidSequenceID;
287  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
289 
290  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
291  struct sockaddr_in from;
292  socklen_t len = sizeof(from);
293  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, reinterpret_cast<struct sockaddr*>(&from), &len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
294  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
295 
296  if (stss > static_cast<ssize_t>(sizeof(hdr)))
297  {
298  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
299  }
300  else
301  {
302  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
303  continue;
304  }
305 
306  TLOG(TLVL_DEBUG) << "receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.nEntries << "header=" << std::hex << hdr.header;
307  if (hdr.header != ROUTING_MAGIC)
308  {
309  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
310  }
311  else
312  {
313  if (routing_manager_mode_ != detail::RoutingManagerMode::INVALID && routing_manager_mode_ != hdr.mode)
314  {
315  TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingManagerMode than expected!";
316  exit(1);
317  }
318  routing_manager_mode_ = hdr.mode;
319 
321  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
322  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
323 
324  first = buffer.front().sequence_id;
325  last = buffer.back().sequence_id;
326 
327  if (first + hdr.nEntries - 1 != last)
328  {
329  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
330  continue;
331  }
332  auto thisSeqID = first;
333 
334  {
335  std::unique_lock<std::mutex> lck(routing_mutex_);
336  if (routing_table_.count(last) == 0)
337  {
338  for (auto entry : buffer)
339  {
340  if (thisSeqID != entry.sequence_id)
341  {
342  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
343  last = thisSeqID - 1;
344  break;
345  }
346  thisSeqID++;
347  if (routing_table_.count(entry.sequence_id) != 0u)
348  {
349  if (routing_table_[entry.sequence_id] != entry.destination_rank)
350  {
351  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
352  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
353  << " I will use the original value!";
354  }
355  continue;
356  }
357  if (entry.sequence_id < routing_table_last_)
358  {
359  continue;
360  }
361  routing_table_[entry.sequence_id] = entry.destination_rank;
362  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
363  << " -> Rank " << entry.destination_rank;
364  }
365  }
366 
367  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
368  if (!routing_table_.empty())
369  {
370  TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
371  }
372 
373  auto counter = 0;
374  for (auto& entry : routing_table_)
375  {
376  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
377  counter++;
378  }
379  }
380 
382  ack.rank = my_rank;
383  ack.first_sequence_id = first;
384  ack.last_sequence_id = last;
385 
386  if (last > routing_table_last_)
387  {
388  routing_table_last_ = last;
389  }
390 
391  if (my_rank < static_cast<int>(8 * sizeof(hdr.already_acknowledged_ranks)) && hdr.already_acknowledged_ranks.test(my_rank))
392  {
393  TLOG(TLVL_DEBUG) << __func__ << ": Skipping RoutingAckPacket since this Routing Table Update has already been acknowledged (my_rank = " << my_rank << ")";
394  }
395  else
396  {
397  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
398  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, reinterpret_cast<struct sockaddr*>(&ack_addr_), sizeof(ack_addr_)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
399  }
400  }
401  }
402  }
403 }
404 
406 {
407  std::unique_lock<std::mutex> lck(routing_mutex_);
408  return routing_table_.size();
409 }
410 
412 {
413  std::unique_lock<std::mutex> lck(routing_mutex_);
414  // Find the distance from the next highest sequence ID to the end of the list
415  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
416  return dist; // If dist == 1, there is one entry left.
417 }
418 
419 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
420 {
421  if (enabled_destinations_.empty())
422  {
423  return TransferInterface::RECV_TIMEOUT; // No destinations configured.
424  }
425  if (!use_routing_manager_ && enabled_destinations_.size() == 1)
426  {
427  return *enabled_destinations_.begin(); // Trivial case
428  }
429 
430  if (use_routing_manager_)
431  {
432  auto start = std::chrono::steady_clock::now();
433  TLOG(15) << "calcDest_ use_routing_manager check for routing info for seqID=" << sequence_id << " routing_timeout_ms=" << routing_timeout_ms_ << " should_stop_=" << should_stop_;
434  while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
435  {
436  {
437  std::unique_lock<std::mutex> lck(routing_mutex_);
438  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySequenceID && (routing_table_.count(sequence_id) != 0u))
439  {
440  if (sequence_id > highest_sequence_id_routed_)
441  {
442  highest_sequence_id_routed_ = sequence_id;
443  }
444  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
445  return routing_table_.at(sequence_id);
446  }
447  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySendCount && (routing_table_.count(sent_frag_count_.count() + 1) != 0u))
448  {
449  if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_)
450  {
451  highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
452  }
453  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
454  return routing_table_.at(sent_frag_count_.count() + 1);
455  }
456  }
457  usleep(routing_timeout_ms_ * 10);
458  }
459  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
460  if (routing_manager_mode_ == detail::RoutingManagerMode::RouteBySequenceID)
461  {
462  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
463  << " and the Routing Manager did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
464  }
465  else
466  {
467  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
468  << " and the Routing Manager did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
469  }
470  }
471  else
472  {
473  auto index = sequence_id % enabled_destinations_.size();
474  auto it = enabled_destinations_.begin();
475  for (; index > 0; --index)
476  {
477  ++it;
478  if (it == enabled_destinations_.end())
479  {
480  it = enabled_destinations_.begin();
481  }
482  }
483  return *it;
484  }
486 }
487 
488 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
489 {
490  TLOG(15) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table. Sent " << GetSentSequenceIDCount(seq) << " Fragments with this Sequence ID.";
491  std::unique_lock<std::mutex> lck(routing_mutex_);
492  // while (routing_table_.size() > routing_table_max_size_)
493  // {
494  // routing_table_.erase(routing_table_.begin());
495  // }
496  if (routing_table_.find(seq) != routing_table_.end())
497  {
498  routing_table_.erase(routing_table_.find(seq));
499  }
500 
501  if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
502  {
503  sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
504  }
505 }
506 
507 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
508 {
509  std::unique_lock<std::mutex> lck(routing_mutex_);
510  if (sent_sequence_id_count_.count(seq) == 0u)
511  {
512  return 0;
513  }
514  return sent_sequence_id_count_[seq];
515 }
516 
517 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
518 {
519  // Precondition: Fragment must be complete and consistent (including
520  // header information).
521  auto start_time = std::chrono::steady_clock::now();
522  if (frag.type() == Fragment::EndOfDataFragmentType)
523  {
524  throw cet::exception("LogicError") // NOLINT(cert-err60-cpp)
525  << "EOD fragments should not be sent on as received: "
526  << "use sendEODFrag() instead.";
527  }
528  size_t seqID = frag.sequenceID();
529  size_t fragSize = frag.sizeBytes();
530  auto latency_s = frag.getLatency(true);
531  auto isSystemBroadcast = frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType;
532 
533  double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
534  TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << static_cast<void*>(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
535  << ", seqID=" << seqID << ", fragID=" << frag.fragmentID() << ", type=" << frag.typeString();
538  if (broadcast_sends_ || isSystemBroadcast)
539  {
540  for (auto& bdest : enabled_destinations_)
541  {
542  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
543  // Gross, we have to copy.
545  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
546  while (sts == TransferInterface::CopyStatus::kTimeout && retries <= send_retry_count_)
547  {
548  if (!non_blocking_mode_)
549  {
550  sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
551  }
552  else
553  {
554  sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
555  }
556  ++retries;
557  }
559  {
560  outsts = sts;
561  }
562  sent_frag_count_.incSlot(bdest);
563  }
564  }
565  else if (non_blocking_mode_)
566  {
567  auto count = routing_retry_count_;
568  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
569  {
570  dest = calcDest_(seqID);
572  {
573  count--;
574  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
575  }
576  }
577  if (dest != TransferInterface::RECV_TIMEOUT && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
578  {
579  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
581  auto lastWarnTime = std::chrono::steady_clock::now();
582  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
583  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
584  {
585  sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
586  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
587  {
588  TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
589  lastWarnTime = std::chrono::steady_clock::now();
590  }
591  ++retries;
592  }
594  {
595  outsts = sts;
596  }
597  //sendFragTo(std::move(frag), dest);
598  sent_frag_count_.incSlot(dest);
599  }
600  else if (!should_stop_)
601  {
602  TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
603  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
604  }
605  }
606  else
607  {
608  auto start = std::chrono::steady_clock::now();
609  while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
610  {
611  dest = calcDest_(seqID);
613  {
614  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
615  usleep(10000);
616  }
617  }
618  if (dest != TransferInterface::RECV_TIMEOUT && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
619  {
620  TLOG(TLVL_DEBUG + 2) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
622 
623  sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
625  {
626  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
627  << dest << " failed! Data has been lost!";
628  }
629 
630  //sendFragTo(std::move(frag), dest);
631  sent_frag_count_.incSlot(dest);
632  outsts = sts;
633  }
634  else if (!should_stop_)
635  {
636  TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
637  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
638  }
639  }
640 
641  if (!isSystemBroadcast)
642  {
643  std::unique_lock<std::mutex> lck(routing_mutex_);
644  sent_sequence_id_count_[seqID]++;
645  }
646 
647  auto delta_t = TimeUtils::GetElapsedTime(start_time);
648 
649  if (metricMan)
650  {
651  TLOG(TLVL_DEBUG + 2) << "sendFragment: sending metrics";
652  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
653  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate);
654  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
655  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
656  metricMan->sendMetric("Fragment Latency at Send", latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
657 
658  if (use_routing_manager_)
659  {
660  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
661  if (routing_wait_time_ > 0)
662  {
663  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
664  routing_wait_time_ = 0;
665  }
666  }
667  }
668  TLOG(TLVL_DEBUG + 2) << "sendFragment: Done sending fragment " << seqID << " to dest=" << dest;
669  return std::make_pair(dest, outsts);
670 } // artdaq::DataSenderManager::sendFragment
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
Events should be routed by sequence ID (BR -> EB)
virtual ~DataSenderManager()
DataSenderManager Destructor.
std::bitset< 1024 > already_acknowledged_ranks
Bitset of ranks which have already sent valid acknowledgements and therefore do not need to send agai...
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:223
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
size_t GetSentSequenceIDCount(Fragment::sequence_id_t seq)
Get the number of Fragments sent with a given Sequence ID.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Events should be routed by send count (EB -> Agg)
static RoutingAckPacket makeEndOfDataRoutingAckPacket(int rank)
Create an EndOfData RoutingAckPacket.
Some error occurred, but no exception was thrown.
RoutingManagerMode mode
The current mode of the RoutingManager.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.