artdaq  v3_02_00
DataSenderManager.cc
1 #define TRACE_NAME "DataSenderManager"
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
6 
7 #include <chrono>
8 #include "canvas/Utilities/Exception.h"
9 #include <arpa/inet.h>
10 #include <netinet/in.h>
11 #include <sys/types.h>
12 #include <poll.h>
13 #include <sys/socket.h>
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : destinations_()
18  , enabled_destinations_()
19  , sent_frag_count_()
20  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
21  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
22  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
23  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
24  , routing_master_mode_(detail::RoutingMasterMode::INVALID)
25  , should_stop_(false)
26  , ack_socket_(-1)
27  , table_socket_(-1)
28 {
29  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
30 
31  // Validate parameters
32  if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
33 
34  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
35  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
36  table_port_ = rmConfig.get<int>("table_update_port", 35556);
37  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
38  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
39  ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
40  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
41  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
42 
43 
44  hostMap_t host_map = MakeHostMap(pset);
45 
46  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
47  for (auto& d : dests.get_pset_names())
48  {
49  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
50  host_map = MakeHostMap(dest_pset, 0, host_map);
51  }
52  auto host_map_pset = MakeHostMapPset(host_map);
53  fhicl::ParameterSet dests_mod;
54  for (auto& d : dests.get_pset_names())
55  {
56  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
57  dest_pset.erase("host_map");
58  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
59  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
60  }
61 
62  for (auto& d : dests.get_pset_names())
63  {
64  try
65  {
66  auto transfer = MakeTransferPlugin(dests, d, TransferInterface::Role::kSend);
67  auto destination_rank = transfer->destination_rank();
68  destinations_.emplace(destination_rank, std::move(transfer));
69  }
70  catch (std::invalid_argument)
71  {
72  TLOG(TLVL_DEBUG) << "Invalid destination specification: " << d;
73  }
74  catch (cet::exception ex)
75  {
76  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
77  }
78  catch (...)
79  {
80  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
81  }
82  }
83  if (destinations_.size() == 0)
84  {
85  TLOG(TLVL_ERROR) << "No destinations specified!";
86  }
87  else
88  {
89  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
90  if (enabled_dests.size() == 0)
91  {
92  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
93  for (auto& d : destinations_)
94  {
95  enabled_destinations_.insert(d.first);
96  }
97  }
98  else
99  {
100  for (auto& d : enabled_dests)
101  {
102  enabled_destinations_.insert(d);
103  }
104  }
105  }
106  if (use_routing_master_) startTableReceiverThread_();
107 }
108 
110 {
111  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager BEGIN";
112  should_stop_ = true;
113  for (auto& dest : enabled_destinations_)
114  {
115  if (destinations_.count(dest))
116  {
117  auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
118  if (sts != TransferInterface::CopyStatus::kSuccess) TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
119  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
120  }
121  }
122  if (routing_thread_.joinable()) routing_thread_.join();
123  TLOG(TLVL_DEBUG) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
124 }
125 
126 
127 void artdaq::DataSenderManager::setupTableListener_()
128 {
129  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
130  if (table_socket_ < 0)
131  {
132  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
133  exit(1);
134  }
135 
136  struct sockaddr_in si_me_request;
137 
138  int yes = 1;
139  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
140  {
141  TLOG(TLVL_ERROR) << " Unable to enable port reuse on request socket";
142  exit(1);
143  }
144  memset(&si_me_request, 0, sizeof(si_me_request));
145  si_me_request.sin_family = AF_INET;
146  si_me_request.sin_port = htons(table_port_);
147  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
148  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
149  {
150  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << table_port_;
151  exit(1);
152  }
153 
154  struct ip_mreq mreq;
155  int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
156  if (sts == -1)
157  {
158  TLOG(TLVL_ERROR) << "Unable to resolve multicast address for table updates";
159  exit(1);
160  }
161  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
162  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
163  {
164  TLOG(TLVL_ERROR) << "Unable to join multicast group";
165  exit(1);
166  }
167 }
168 void artdaq::DataSenderManager::startTableReceiverThread_()
169 {
170  if (routing_thread_.joinable()) routing_thread_.join();
171  TLOG(TLVL_INFO) << "Starting Routing Thread";
172  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
173 }
174 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
175 {
176  while (true)
177  {
178  if (should_stop_)
179  {
180  TLOG(TLVL_DEBUG) << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping";
181  return;
182  }
183 
184  TLOG(TLVL_TRACE) << "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests";
185  if (table_socket_ == -1)
186  {
187  TLOG(TLVL_DEBUG) << "Opening table listener socket";
188  setupTableListener_();
189  }
190  if (table_socket_ == -1)
191  {
192  TLOG(TLVL_DEBUG) << "The listen socket was not opened successfully.";
193  return;
194  }
195  if (ack_socket_ == -1)
196  {
197  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
198  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
199  if (sts == -1)
200  {
201  TLOG(TLVL_ERROR) << "Unable to resolve routing_master_address";
202  exit(1);
203  }
204  TLOG(TLVL_DEBUG) << "Ack socket is fd " << ack_socket_;
205  }
206 
207  struct pollfd fd;
208  fd.fd = table_socket_;
209  fd.events = POLLIN | POLLPRI;
210 
211  auto res = poll(&fd, 1, 1000);
212  if (res > 0)
213  {
214  auto first = artdaq::Fragment::InvalidSequenceID;
215  auto last = artdaq::Fragment::InvalidSequenceID;
217 
218  TLOG(TLVL_DEBUG) << "Going to receive RoutingPacketHeader";
219  auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
220  TLOG(TLVL_DEBUG) << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader));
221 
222  TLOG(TLVL_DEBUG) << "Checking for valid header";
223  if (hdr.header == ROUTING_MAGIC)
224  {
225  if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
226  {
227  TLOG(TLVL_ERROR) << "Received table has different RoutingMasterMode than expected!";
228  exit(1);
229  }
230  routing_master_mode_ = hdr.mode;
231 
233  TLOG(TLVL_DEBUG) << "Receiving data buffer";
234  auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
235  assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
236  TLOG(6) << "Received a packet of " << sts << " bytes";
237 
238  first = buffer[0].sequence_id;
239  last = buffer[buffer.size() - 1].sequence_id;
240 
241  if (first + hdr.nEntries - 1 != last)
242  {
243  TLOG(TLVL_ERROR) << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
244  continue;
245  }
246  auto thisSeqID = first;
247 
248  if (routing_table_.count(last) == 0)
249  {
250  for (auto entry : buffer)
251  {
252  if (thisSeqID != entry.sequence_id)
253  {
254  TLOG(TLVL_ERROR) << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
255  last = thisSeqID - 1;
256  break;
257  }
258  thisSeqID++;
259  if (routing_table_.count(entry.sequence_id))
260  {
261  if (routing_table_[entry.sequence_id] != entry.destination_rank)
262  {
263  TLOG(TLVL_ERROR) << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
264  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
265  << " I will use the original value!";
266  }
267  continue;
268  }
269  routing_table_[entry.sequence_id] = entry.destination_rank;
270  TLOG(TLVL_DEBUG) << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank);
271  }
272  }
273 
275  ack.rank = my_rank;
276  ack.first_sequence_id = first;
277  ack.last_sequence_id = last;
278 
279  TLOG(TLVL_DEBUG) << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")";
280  TLOG(TLVL_DEBUG) << "There are now " << routing_table_.size() << " entries in the Routing Table";
281  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
282  }
283  }
284  }
285 }
286 
288 {
289  std::unique_lock<std::mutex> lck(routing_mutex_);
290  return routing_table_.size();
291 }
292 
293 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
294 {
295  if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured.
296  if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case
297 
298  if (use_routing_master_)
299  {
300  auto start = std::chrono::steady_clock::now();
301  while (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))
302  {
303  std::unique_lock<std::mutex> lck(routing_mutex_);
304  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id))
305  {
306  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
307  return routing_table_.at(sequence_id);
308  }
309  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
310  {
311  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
312  return routing_table_.at(sent_frag_count_.count());
313  }
314  usleep(routing_timeout_ms_ * 10);
315  }
316  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
317  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID)
318  {
319  TLOG(TLVL_ERROR) << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
320  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!";
321  }
322  else
323  {
324  TLOG(TLVL_ERROR) << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
325  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!";
326  }
327  }
328  else
329  {
330  auto index = sequence_id % enabled_destinations_.size();
331  auto it = enabled_destinations_.begin();
332  for (; index > 0; --index)
333  {
334  ++it;
335  if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
336  }
337  return *it;
338  }
340 }
341 
342 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
343 {
344  // Precondition: Fragment must be complete and consistent (including
345  // header information).
346  auto start_time = std::chrono::steady_clock::now();
347  if (frag.type() == Fragment::EndOfDataFragmentType)
348  {
349  throw cet::exception("LogicError")
350  << "EOD fragments should not be sent on as received: "
351  << "use sendEODFrag() instead.";
352  }
353  size_t seqID = frag.sequenceID();
354  size_t fragSize = frag.sizeBytes();
355  TLOG(13) << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize)
356  << ", seqID=" << std::to_string(seqID) << ", type=" << frag.typeString();
359  if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
360  {
361  for (auto& bdest : enabled_destinations_)
362  {
363  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << std::to_string(seqID) << " to destination " << bdest << " (broadcast)";
364  // Gross, we have to copy.
365  Fragment fragCopy(frag);
366  auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
367  size_t retries = 0; // Tried once, so retries < send_retry_count_ will have it retry send_retry_count_ times
368  while (sts == TransferInterface::CopyStatus::kTimeout && retries < send_retry_count_)
369  {
370  sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
371  retries++;
372  }
373  if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
374  sent_frag_count_.incSlot(bdest);
375  }
376  }
377  else if (non_blocking_mode_)
378  {
379  auto count = routing_retry_count_;
380  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
381  {
382  dest = calcDest_(seqID);
384  {
385  count--;
386  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << std::to_string(seqID) << (count > 0 ? ", retrying." : ".");
387  }
388  }
389  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
390  {
391  TLOG(TLVL_TRACE) << "sendFragment: Sending fragment with seqId " << std::to_string(seqID) << " to destination " << dest;
393  auto lastWarnTime = std::chrono::steady_clock::now();
394  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
395  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
396  {
397  sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
398  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
399  {
400  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
401  lastWarnTime = std::chrono::steady_clock::now();
402  }
403  }
404  if (sts != TransferInterface::CopyStatus::kSuccess) outsts = sts;
405  //sendFragTo(std::move(frag), dest);
406  sent_frag_count_.incSlot(dest);
407  }
408  else
409  {
410  TLOG(TLVL_WARNING) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID;
411  }
412  }
413  else
414  {
415  auto count = routing_retry_count_;
416  while (dest == TransferInterface::RECV_TIMEOUT && count > 0)
417  {
418  dest = calcDest_(seqID);
420  {
421  count--;
422  TLOG(TLVL_WARNING) << "Could not get destination for seqID "
423  << std::to_string(seqID) << ", send number " << sent_frag_count_.count()
424  << (count > 0 ? ", retrying." : ".");
425  }
426  }
427  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
428  {
429  TLOG(5) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
431 
432  sts = destinations_[dest]->moveFragment(std::move(frag));
434  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
435  << dest << " failed! Data has been lost!";
436 
437  //sendFragTo(std::move(frag), dest);
438  sent_frag_count_.incSlot(dest);
439  outsts = sts;
440  }
441  else
442  TLOG(TLVL_WARNING) << "calcDest returned invalid destination rank " << dest
443  << "! This event has been lost: " << seqID;
444  }
445  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID
446  && routing_table_.find(seqID - 1) != routing_table_.end())
447  {
448  std::unique_lock<std::mutex> lck(routing_mutex_);
449  routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
450  }
451  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
452  {
453  std::unique_lock<std::mutex> lck(routing_mutex_);
454  routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
455  }
456  if (metricMan)
457  {//&& sent_frag_count_.slotCount(dest) % 100 == 0) {
458  TLOG(5) << "sendFragment: sending metrics";
459  auto delta_t = TimeUtils::GetElapsedTime(start_time);
460  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
461  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate);
462  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
463  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
464  "fragments", 3, MetricMode::LastPoint);
465  if (use_routing_master_)
466  {
467  metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 2, MetricMode::LastPoint);
468  if (routing_wait_time_ > 0)
469  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2,
470  MetricMode::Average);
471  }
472  }
473  TLOG(5) << "sendFragment: Done sending fragment " << seqID;
474  return std::make_pair(dest, outsts);
475 } // artdaq::DataSenderManager::sendFragment
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to a in_addr suitable for socket communication.
Definition: TCPConnect.cc:32
Value to be returned upon receive timeout.
A row of the Routing Table.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
RoutingMasterMode mode
The current mode of the RoutingMaster.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.