artdaq  v3_06_00
DataSenderManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
6 
7 #include <chrono>
8 #include "canvas/Utilities/Exception.h"
9 #include <arpa/inet.h>
10 #include <netinet/in.h>
11 #include <sys/types.h>
12 #include <poll.h>
13 #include <sys/socket.h>
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : destinations_()
18  , enabled_destinations_()
19  , sent_frag_count_()
20  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
21  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
22  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
23  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
24  , routing_master_mode_(detail::RoutingMasterMode::INVALID)
25  , should_stop_(false)
26  , ack_socket_(-1)
27  , table_socket_(-1)
28  , routing_table_last_(0)
29  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
30  , highest_sequence_id_routed_(0)
31 {
32  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
33 
34  // Validate parameters
35  if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
36 
37  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
38  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
39  table_port_ = rmConfig.get<int>("table_update_port", 35556);
40  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
41  table_multicast_interface_ = rmConfig.get<std::string>("table_update_multicast_interface", "localhost");
42  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
43  ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
44  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
45  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
46 
47  hostMap_t host_map = MakeHostMap(pset);
48  size_t tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
49  size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
50 
51  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
52  for (auto& d : dests.get_pset_names())
53  {
54  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
55  host_map = MakeHostMap(dest_pset, host_map);
56  }
57  auto host_map_pset = MakeHostMapPset(host_map);
58  fhicl::ParameterSet dests_mod;
59  for (auto& d : dests.get_pset_names())
60  {
61  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
62  dest_pset.erase("host_map");
63  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
64 
65  if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
66  {
67  dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
68  }
69  if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
70  {
71  dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
72  }
73 
74  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
75  }
76 
77  for (auto& d : dests_mod.get_pset_names())
78  {
79  try
80  {
81  auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
82  auto destination_rank = transfer->destination_rank();
83  destinations_.emplace(destination_rank, std::move(transfer));
84  }
85  catch (const std::invalid_argument&)
86  {
87  TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
88  }
89  catch (const cet::exception& ex)
90  {
91  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
92  }
93  catch (...)
94  {
95  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
96  }
97  }
98  if (destinations_.size() == 0)
99  {
100  TLOG(TLVL_ERROR) << "No destinations specified!";
101  }
102  else
103  {
104  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
105  if (enabled_dests.size() == 0)
106  {
107  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
108  for (auto& d : destinations_)
109  {
110  enabled_destinations_.insert(d.first);
111  }
112  }
113  else
114  {
115  for (auto& d : enabled_dests)
116  {
117  enabled_destinations_.insert(d);
118  }
119  }
120  }
121  if (use_routing_master_) startTableReceiverThread_();
122 }
123 
125 {
126  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
127  should_stop_ = true;
128  for (auto& dest : enabled_destinations_)
129  {
130  if (destinations_.count(dest))
131  {
132  auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
133  if (sts != TransferInterface::CopyStatus::kSuccess) TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
134  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
135  }
136  }
137  if (routing_thread_.joinable()) routing_thread_.join();
138  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
139 }
140 
141 
142 void artdaq::DataSenderManager::setupTableListener_()
143 {
144  int sts;
145  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
146  if (table_socket_ < 0)
147  {
148  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
149  exit(1);
150  }
151 
152  struct sockaddr_in si_me_request;
153 
154  int yes = 1;
155  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
156  {
157  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
158  exit(1);
159  }
160  memset(&si_me_request, 0, sizeof(si_me_request));
161  si_me_request.sin_family = AF_INET;
162  si_me_request.sin_port = htons(table_port_);
163  //si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
164  struct in_addr in_addr_s;
165  sts = inet_aton(table_address_.c_str(), &in_addr_s);
166  if (sts == 0)
167  {
168  TLOG(TLVL_ERROR) << "inet_aton says table_address " << table_address_ << " is invalid";
169  }
170  si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
171  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
172  {
173  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
174  exit(1);
175  }
176 
177  struct ip_mreq mreq;
178  sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
179  if (sts == -1)
180  {
181  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
182  exit(1);
183  }
184  sts = GetInterfaceForNetwork(table_multicast_interface_.c_str(), mreq.imr_interface);
185  if (sts == -1)
186  {
187  TLOG(TLVL_ERROR) << "Unable to resolve multicast interface for table updates" << table_multicast_interface_;
188  exit(1);
189  }
190  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
191  {
192  TLOG(TLVL_ERROR) << "Unable to join multicast group";
193  exit(1);
194  }
195 }
196 void artdaq::DataSenderManager::startTableReceiverThread_()
197 {
198  if (routing_thread_.joinable()) routing_thread_.join();
199  TLOG(TLVL_INFO) << "Starting Routing Thread";
200  try {
201  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
202  }
203  catch (const boost::exception& e)
204  {
205  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
206  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
207  exit(5);
208  }
209 }
/// Body of the routing thread. Until should_stop_ is set, it polls table_socket_
/// for routing table updates, merges every consistent RoutingPacket into
/// routing_table_ (under routing_mutex_) and answers with a RoutingAckPacket
/// sent to ack_addr_ unless the packet already marks this rank as acknowledged.
// NOTE(review): `hdr`, `buffer` and `ack` are used below, but their declarations
// are not visible in this listing (original lines 254, 286 and 342 are absent).
// Confirm their types against the real file before changing this function.
210 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
211 {
212  while (true)
213  {
214  if (should_stop_)
215  {
216  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
217  return;
218  }
219 
220  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes (interface,address,port = "
221  << table_multicast_interface_ << "," << table_address_ << "," << table_port_ << ")";
  // The listener socket is opened on the first pass through the loop
222  if (table_socket_ == -1)
223  {
224  TLOG(TLVL_DEBUG) << __func__ << ": Opening table listener socket";
225  setupTableListener_();
226  }
  // Still no socket after the setup attempt: end the thread
227  if (table_socket_ == -1)
228  {
229  TLOG(TLVL_DEBUG) << __func__ << ": The listen socket was not opened successfully.";
230  return;
231  }
  // The acknowledgement socket and its target address are also set up on first use.
  // NOTE(review): the result of socket() is not checked here.
232  if (ack_socket_ == -1)
233  {
234  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
235  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
236  if (sts == -1)
237  {
238  TLOG(TLVL_ERROR) << __func__ << ": Unable to resolve routing_master_address";
239  exit(1);
240  }
241  TLOG(TLVL_DEBUG) << __func__ << ": Ack socket is fd " << ack_socket_;
242  }
243 
244  struct pollfd fd;
245  fd.fd = table_socket_;
246  fd.events = POLLIN | POLLPRI;
247 
  // Wait at most 1000 ms so that should_stop_ is re-checked regularly
248  auto res = poll(&fd, 1, 1000);
249  if (res > 0)
250  {
251  auto first = artdaq::Fragment::InvalidSequenceID;
252  auto last = artdaq::Fragment::InvalidSequenceID;
253  std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
255 
256  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
257  struct sockaddr_in from;
258  socklen_t len = sizeof(from);
259  auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (struct sockaddr*)&from, &len);
  // NOTE(review): from.sin_port is streamed without ntohs, so the logged port is in network byte order
260  TLOG(TLVL_DEBUG) << __func__ << ": Received " << stss << " bytes from " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
261 
  // Only datagrams strictly larger than the header are processed; the header is copied out of the raw buffer
262  if (stss > static_cast<ssize_t>(sizeof(hdr)))
263  {
264  memcpy(&hdr, &buf[0], sizeof(artdaq::detail::RoutingPacketHeader));
265  }
266  else
267  {
268  TLOG(TLVL_TRACE) << __func__ << ": Incorrect size received. Discarding.";
269  continue;
270  }
271 
272  TRACE(TLVL_DEBUG, "receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.nEntries, ((unsigned long*)&hdr)[0], ((unsigned long*)&hdr)[1]);
273  if (hdr.header != ROUTING_MAGIC)
274  {
275  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
276  }
277  else
278  {
  // Once a mode has been seen, a packet announcing a different mode is fatal
279  if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
280  {
281  TLOG(TLVL_ERROR) << __func__ << ": Received table has different RoutingMasterMode than expected!";
282  exit(1);
283  }
284  routing_master_mode_ = hdr.mode;
285 
  // NOTE(review): the datagram length is only checked against hdr.nEntries by an assert,
  // which is compiled out when NDEBUG is defined; the memcpy below then relies on hdr.nEntries as received.
287  assert(static_cast<size_t>(stss) == sizeof(artdaq::detail::RoutingPacketHeader) + sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
288  memcpy(&buffer[0], &buf[sizeof(artdaq::detail::RoutingPacketHeader)], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
289  TRACE(6, "receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((unsigned long*)&buffer[0])[0], ((unsigned long*)&buffer[0])[1]);
290 
291  first = buffer[0].sequence_id;
292  last = buffer[buffer.size() - 1].sequence_id;
293 
  // The entries must cover a contiguous range of sequence IDs: first .. first + nEntries - 1
294  if (first + hdr.nEntries - 1 != last)
295  {
296  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
297  continue;
298  }
299  auto thisSeqID = first;
300 
  // routing_mutex_ is held for the merge and for the table dump that follows it
301  {
302  std::unique_lock<std::mutex> lck(routing_mutex_);
  // If the last sequence ID is already in the table, none of the entries are merged
303  if (routing_table_.count(last) == 0)
304  {
305  for (auto entry : buffer)
306  {
  // Stop at the first gap; `last` is pulled back so the ack covers only the entries before the gap
307  if (thisSeqID != entry.sequence_id)
308  {
309  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
310  last = thisSeqID - 1;
311  break;
312  }
313  thisSeqID++;
  // An entry that is already present is never overwritten; a conflicting rank is only reported
314  if (routing_table_.count(entry.sequence_id))
315  {
316  if (routing_table_[entry.sequence_id] != entry.destination_rank)
317  {
318  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
319  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
320  << " I will use the original value!";
321  }
322  continue;
323  }
  // Entries below the highest `last` value recorded so far are not inserted
324  if (entry.sequence_id < routing_table_last_) continue;
325  routing_table_[entry.sequence_id] = entry.destination_rank;
326  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
327  << " -> Rank " << entry.destination_rank;
328  }
329  }
330 
331  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
332  if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
333 
334  auto counter = 0;
335  for (auto& entry : routing_table_)
336  {
337  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
338  counter++;
339  }
340  }
341 
  // Fill the acknowledgement with this rank and the sequence ID range that was accepted
343  ack.rank = my_rank;
344  ack.first_sequence_id = first;
345  ack.last_sequence_id = last;
346 
347  if (last > routing_table_last_) routing_table_last_ = last;
348 
  // No ack is sent when the packet already has this rank's bit set in already_acknowledged_ranks
349  if (my_rank < static_cast<int>(8*sizeof(hdr.already_acknowledged_ranks)) && hdr.already_acknowledged_ranks.test(my_rank))
350  {
351  TLOG(TLVL_DEBUG) << __func__ << ": Skipping RoutingAckPacket since this Routing Table Update has already been acknowledged (my_rank = " << my_rank << ")";
352  }
353  else
354  {
355  TLOG(TLVL_DEBUG) << __func__ << ": Sending RoutingAckPacket with first= " << first << " and last= " << last << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
356  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
357  }
358  }
359  }
360 }
361 }
362 
364 {
365  std::unique_lock<std::mutex> lck(routing_mutex_);
366  return routing_table_.size();
367 }
368 
370 {
371  std::unique_lock<std::mutex> lck(routing_mutex_);
372  // Find the distance from the next highest sequence ID to the end of the list
373  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
374  return dist; // If dist == 1, there is one entry left.
375 }
376 
377 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
378 {
379  if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured.
380  if (!use_routing_master_ && enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case
381 
382  if (use_routing_master_)
383  {
384  auto start = std::chrono::steady_clock::now();
385  TLOG(15) << "calcDest_ use_routing_master check for routing info for seqID=" << sequence_id << " routing_timeout_ms=" << routing_timeout_ms_ << " should_stop_=" << should_stop_;
386  while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
387  {
388  {
389  std::unique_lock<std::mutex> lck(routing_mutex_);
390  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id))
391  {
392  if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
393  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
394  return routing_table_.at(sequence_id);
395  }
396  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count() + 1))
397  {
398  if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
399  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
400  return routing_table_.at(sent_frag_count_.count() + 1);
401  }
402  }
403  usleep(routing_timeout_ms_ * 10);
404  }
405  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
406  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
407  {
408  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for seqID " << sequence_id
409  << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
410  }
411  else
412  {
413  TLOG(TLVL_WARNING) << "Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
414  << " and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ << " ms)!";
415  }
416  }
417  else
418  {
419  auto index = sequence_id % enabled_destinations_.size();
420  auto it = enabled_destinations_.begin();
421  for (; index > 0; --index)
422  {
423  ++it;
424  if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
425  }
426  return *it;
427  }
429 }
430 
431 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
432 {
433  TLOG(15) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table. Sent " << GetSentSequenceIDCount(seq) << " Fragments with this Sequence ID.";
434  std::unique_lock<std::mutex> lck(routing_mutex_);
435  // while (routing_table_.size() > routing_table_max_size_)
436  // {
437  // routing_table_.erase(routing_table_.begin());
438  // }
439  if (routing_table_.find(seq) != routing_table_.end())
440  routing_table_.erase(routing_table_.find(seq));
441 
442  if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
443  {
444  sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
445  }
446 }
447 
448 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
449 {
450  std::unique_lock<std::mutex> lck(routing_mutex_);
451  if (!sent_sequence_id_count_.count(seq)) return 0;
452  return sent_sequence_id_count_[seq];
453 }
454 
/// Send the given Fragment. Returns a pair of the destination rank and the
/// CopyStatus of the send.
///
/// Three paths exist: a broadcast to every enabled destination (when
/// broadcast_sends_ is set or the Fragment is of type EndOfRun, EndOfSubrun or
/// Init), a non-blocking send with a limited number of routing and send
/// retries, and a reliable-mode send that keeps asking for a destination until
/// one is found or should_stop_ is set. Afterwards the per-sequence-ID send
/// counter is incremented and, if metricMan is set, send metrics are reported.
///
/// Throws cet::exception("LogicError") if the Fragment is an EndOfData Fragment.
// NOTE(review): the declarations of `dest`, `outsts` and `sts`, and several `if`
// headers, are not visible in this listing (original lines 472-473, 480, 504,
// 513, 540, 549 and 552 are absent). Confirm against the real file before
// changing this function.
455 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
456 {
457  // Precondition: Fragment must be complete and consistent (including
458  // header information).
459  auto start_time = std::chrono::steady_clock::now();
460  if (frag.type() == Fragment::EndOfDataFragmentType)
461  {
462  throw cet::exception("LogicError")
463  << "EOD fragments should not be sent on as received: "
464  << "use sendEODFrag() instead.";
465  }
  // Captured up front because frag is moved from in the reliable-mode path below
466  size_t seqID = frag.sequenceID();
467  size_t fragSize = frag.sizeBytes();
468  auto latency_s = frag.getLatency(true);
469  double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
470  TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
471  << ", seqID=" << seqID << ", fragID=" << frag.fragmentID() << ", type=" << frag.typeString();
  // Path 1: broadcast to every enabled destination
474  if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
475  {
476  for (auto& bdest : enabled_destinations_)
477  {
478  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
479  // Gross, we have to copy.
481  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
  // Only a kTimeout status leads to another attempt
482  while (sts == TransferInterface::CopyStatus::kTimeout && retries <= send_retry_count_)
483  {
484  if (!non_blocking_mode_)
485  {
  // The plugin receives a copy so that frag stays usable for the remaining destinations
486  sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
487  }
488  else
489  {
490  sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
491  }
492  ++retries;
493  }
494  if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
495  sent_frag_count_.incSlot(bdest);
496  }
497  }
  // Path 2: single destination, non-blocking send with bounded retries
498  else if (non_blocking_mode_)
499  {
  // At most routing_retry_count_ attempts are made to obtain a destination
500  auto count = routing_retry_count_;
501  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
502  {
503  dest = calcDest_(seqID);
505  {
506  count--;
507  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << (count > 0 ? ", retrying." : ".");
508  }
509  }
510  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
511  {
512  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
514  auto lastWarnTime = std::chrono::steady_clock::now();
515  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
  // Any status other than kSuccess leads to another attempt; warnings are limited to one per second
516  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
517  {
518  sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
519  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
520  {
521  TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
522  lastWarnTime = std::chrono::steady_clock::now();
523  }
524  ++retries;
525  }
526  if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
527  //sendFragTo(std::move(frag), dest);
528  sent_frag_count_.incSlot(dest);
529  }
530  else if (!should_stop_)
531  TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
532  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
533  }
  // Path 3: single destination, reliable-mode send
534  else
535  {
536  auto start = std::chrono::steady_clock::now();
  // Keeps asking for a destination until one is returned or should_stop_ is set
537  while (!should_stop_ && dest == TransferInterface::RECV_TIMEOUT)
538  {
539  dest = calcDest_(seqID);
541  {
542  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
543  usleep(10000);
544  }
545  }
546  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
547  {
548  TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
550 
  // frag is moved into the plugin here and is not used again below
551  sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
553  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
554  << dest << " failed! Data has been lost!";
555 
556  //sendFragTo(std::move(frag), dest);
557  sent_frag_count_.incSlot(dest);
558  outsts = sts;
559  }
560  else if (!should_stop_)
561  TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
562  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
563  }
564 
  // The per-sequence-ID counter is incremented on every path, whether or not the send succeeded
565  {
566  std::unique_lock<std::mutex> lck(routing_mutex_);
567  sent_sequence_id_count_[seqID]++;
568  }
569 
570  auto delta_t = TimeUtils::GetElapsedTime(start_time);
571 
572  if (metricMan )
573  {
574  TLOG(5) << "sendFragment: sending metrics";
  // NOTE(review): fragSize / delta_t has no guard against delta_t being zero — confirm what GetElapsedTime can return
575  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
576  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate);
577  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
578  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
579  metricMan->sendMetric("Fragment Latency at Send", latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
580 
581  if (use_routing_master_)
582  {
583  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
  // The accumulated wait time is reported in seconds and then reset
584  if (routing_wait_time_ > 0)
585  {
586  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
587  routing_wait_time_ = 0;
588  }
589  }
590  }
591  TLOG(5) << "sendFragment: Done sending fragment " << seqID << " to dest="<<dest;
592  return std::make_pair(dest, outsts);
593 } // artdaq::DataSenderManager::sendFragment
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
A row of the Routing Table.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:66
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:217
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
RoutingMasterMode mode
The current mode of the RoutingMaster.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.