artdaq  v3_00_03
DataSenderManager.cc
1 #define TRACE_NAME "DataSenderManager"
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 
6 #include <chrono>
7 #include "canvas/Utilities/Exception.h"
8 #include <arpa/inet.h>
9 #include <netinet/in.h>
10 #include <sys/types.h>
11 #include <poll.h>
12 #include <sys/socket.h>
14 
15 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
16  : destinations_()
17  , enabled_destinations_()
18  , sent_frag_count_()
19  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
20  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
21  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 0))
22  , routing_master_mode_(detail::RoutingMasterMode::INVALID)
23  , should_stop_(false)
24  , ack_socket_(-1)
25  , table_socket_(-1)
26 {
27  TLOG_DEBUG("DataSenderManager") << "Received pset: " << pset.to_string() << TLOG_ENDL;
28  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
29  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
30  table_port_ = rmConfig.get<int>("table_update_port", 35556);
31  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
32  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
33  ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
34  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
35  routing_retry_count_ = rmConfig.get<int>("routing_retry_count", 5);
36 
37 
38  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
39  for (auto& d : dests.get_pset_names())
40  {
41  try
42  {
43  auto transfer = MakeTransferPlugin(dests, d, TransferInterface::Role::kSend);
44  auto destination_rank = transfer->destination_rank();
45  destinations_.emplace(destination_rank, std::move(transfer));
46  }
47  catch (std::invalid_argument)
48  {
49  TRACE(3, "Invalid destination specification: " + d);
50  }
51  catch (cet::exception ex)
52  {
53  TLOG_WARNING("DataSenderManager") << "Caught cet::exception: " << ex.what() << TLOG_ENDL;
54  }
55  catch (...)
56  {
57  TLOG_WARNING("DataSenderManager") << "Non-cet exception while setting up TransferPlugin: " << d << "." << TLOG_ENDL;
58  }
59  }
60  if (destinations_.size() == 0)
61  {
62  TLOG_ERROR("DataSenderManager") << "No destinations specified!" << TLOG_ENDL;
63  }
64  else
65  {
66  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
67  if (enabled_dests.size() == 0)
68  {
69  TLOG_INFO("DataSenderManager") << "enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
70  for (auto& d : destinations_)
71  {
72  enabled_destinations_.insert(d.first);
73  }
74  }
75  else
76  {
77  for (auto& d : enabled_dests)
78  {
79  enabled_destinations_.insert(d);
80  }
81  }
82  }
83  if (use_routing_master_) startTableReceiverThread_();
84 }
85 
// Destructor body: flags the routing thread to stop, sends an End-Of-Data Fragment
// to every enabled destination that has a transfer plugin, then joins the routing thread.
// NOTE(review): the line carrying the function signature is absent from this listing
// (the embedded numbering jumps from 85 to 87) -- presumably
// artdaq::DataSenderManager::~DataSenderManager(); confirm against the real file.
87 {
88  TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
    // receiveTableUpdatesLoop_ tests this flag at the top of each iteration and returns when it is set.
89  should_stop_ = true;
90  for (auto& dest : enabled_destinations_)
91  {
    // An enabled rank may have no entry in destinations_; such ranks are skipped.
92  if (destinations_.count(dest))
93  {
    // The EOD Fragment is built from the count of Fragments sent to this rank.
94  auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
95  if (sts != TransferInterface::CopyStatus::kSuccess) TLOG_ERROR("DataSenderManager") << "Error sending EOD Fragment to sender rank " << dest << TLOG_ENDL;
96  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
97  }
98  }
    // Wait for the routing thread (if one was started) to observe should_stop_ and exit.
99  if (routing_thread_.joinable()) routing_thread_.join();
    // NOTE(review): ack_socket_ and table_socket_ are not closed in this body -- confirm they are released elsewhere.
100  TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager END. Sent " << count() << " fragments." << TLOG_ENDL;
101 }
102 
103 
104 void artdaq::DataSenderManager::setupTableListener_()
105 {
106  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
107  if (table_socket_ < 0)
108  {
109  TLOG_ERROR("DataSenderManager") << "Error creating socket for receiving table updates!" << TLOG_ENDL;
110  exit(1);
111  }
112 
113  struct sockaddr_in si_me_request;
114 
115  int yes = 1;
116  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
117  {
118  TLOG_ERROR("DataSenderManager") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
119  exit(1);
120  }
121  memset(&si_me_request, 0, sizeof(si_me_request));
122  si_me_request.sin_family = AF_INET;
123  si_me_request.sin_port = htons(table_port_);
124  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
125  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
126  {
127  TLOG_ERROR("DataSenderManager") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
128  exit(1);
129  }
130 
131  struct ip_mreq mreq;
132  int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
133  if (sts == -1)
134  {
135  TLOG_ERROR("DataSenderManager") << "Unable to resolve multicast address for table updates" << TLOG_ENDL;
136  exit(1);
137  }
138  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
139  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
140  {
141  TLOG_ERROR("DataSenderManager") << "Unable to join multicast group" << TLOG_ENDL;
142  exit(1);
143  }
144 }
145 void artdaq::DataSenderManager::startTableReceiverThread_()
146 {
147  if (routing_thread_.joinable()) routing_thread_.join();
148  TLOG_INFO("DataSenderManager") << "Starting Routing Thread" << TLOG_ENDL;
149  routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
150 }
// Routing thread body. Until should_stop_ is set it polls table_socket_ for routing
// table updates, merges the received entries into routing_table_ and sends a
// RoutingAckPacket back over ack_socket_. Both sockets are opened on first use.
// NOTE(review): the declarations of `hdr`, `buffer` and `ack` used below are absent from
// this listing (embedded lines 192, 207 and 248 are missing); confirm against the real file.
151 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
152 {
153  while (true)
154  {
    // Shutdown request: checked once per iteration, i.e. after each poll below.
155  if (should_stop_)
156  {
157  TLOG_DEBUG("DataSenderManager") << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping" << TLOG_ENDL;
158  return;
159  }
160 
161  TRACE(4, "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
    // Open the table update socket on first use.
162  if (table_socket_ == -1)
163  {
164  TLOG_DEBUG("DataSenderManager") << "Opening table listener socket" << TLOG_ENDL;
165  setupTableListener_();
166  }
    // Still unopened after the setup attempt: give up and end the thread.
167  if (table_socket_ == -1)
168  {
169  TLOG_DEBUG("DataSenderManager") << "The listen socket was not opened successfully." << TLOG_ENDL;
170  return;
171  }
    // Open the acknowledgement socket on first use and resolve the routing master address.
    // NOTE(review): the return value of socket() is not checked here.
172  if (ack_socket_ == -1)
173  {
174  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
175  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
176  if (sts == -1)
177  {
178  TLOG_ERROR("DataSenderManager") << "Unable to resolve routing_master_address" << TLOG_ENDL;
179  exit(1);
180  }
181  TLOG_DEBUG("DataSenderManager") << "Ack socket is fd " << ack_socket_ << TLOG_ENDL;
182  }
183 
184  struct pollfd fd;
185  fd.fd = table_socket_;
186  fd.events = POLLIN | POLLPRI;
187 
    // Wait at most 1000 ms so that should_stop_ is re-examined regularly.
188  auto res = poll(&fd, 1, 1000);
189  if (res > 0) {
190  auto first = artdaq::Fragment::InvalidSequenceID;
191  auto last = artdaq::Fragment::InvalidSequenceID;
193 
    // First datagram: the RoutingPacketHeader.
    // NOTE(review): stss is only logged; a short or failed read is not handled before hdr is inspected.
194  TLOG_DEBUG("DataSenderManager") << "Going to receive RoutingPacketHeader" << TLOG_ENDL;
195  auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
196  TLOG_DEBUG("DataSenderManager") << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
197 
    // Packets whose magic value does not match are silently ignored.
198  TLOG_DEBUG("DataSenderManager") << "Checking for valid header" << TLOG_ENDL;
199  if (hdr.header == ROUTING_MAGIC) {
    // Once a mode has been learned, a packet announcing a different mode terminates the process.
200  if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
201  {
202  TLOG_ERROR("DataSenderManager") << "Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
203  exit(1);
204  }
205  routing_master_mode_ = hdr.mode;
206 
    // Second datagram: hdr.nEntries RoutingPacketEntry records.
    // NOTE(review): the size check is an assert, so it is compiled out when NDEBUG is defined.
208  TLOG_DEBUG("DataSenderManager") << "Receiving data buffer" << TLOG_ENDL;
209  auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
210  assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
211  TRACE(6, "Received a packet of %zu bytes", sts);
212 
213  first = buffer[0].sequence_id;
214  last = buffer[buffer.size() - 1].sequence_id;
215 
    // The entries are expected to cover a contiguous range of sequence IDs.
    // A packet failing this check is dropped without sending an acknowledgement.
216  if (first + hdr.nEntries - 1 != last)
217  {
218  TLOG_ERROR("DataSenderManager") << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!" << TLOG_ENDL;
219  continue;
220  }
221  auto thisSeqID = first;
222 
    // If the last sequence ID is already known the packet is treated as a repeat:
    // nothing is merged, but the acknowledgement below is still sent.
    // NOTE(review): routing_table_ is modified here without routing_mutex_ being taken in
    // the visible code, while calcDest_ reads it under that mutex -- confirm.
223  if (routing_table_.count(last) == 0) {
224  for (auto entry : buffer)
225  {
    // Stop at the first gap; the acknowledged range is shortened to what was processed.
226  if (thisSeqID != entry.sequence_id)
227  {
228  TLOG_ERROR("DataSenderManager") << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!" << TLOG_ENDL;
229  last = thisSeqID - 1;
230  break;
231  }
232  thisSeqID++;
    // An existing entry is never overwritten; a conflicting update is only reported.
233  if (routing_table_.count(entry.sequence_id))
234  {
235  if (routing_table_[entry.sequence_id] != entry.destination_rank)
236  {
237  TLOG_ERROR("DataSenderManager") << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
238  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
239  << " I will use the original value!" << TLOG_ENDL;
240  }
241  continue;
242  }
243  routing_table_[entry.sequence_id] = entry.destination_rank;
244  TLOG_DEBUG("DataSenderManager") << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
245  }
246  }
247 
    // Acknowledge the processed sequence ID range to the routing master.
    // NOTE(review): the return value of sendto() is ignored.
249  ack.rank = my_rank;
250  ack.first_sequence_id = first;
251  ack.last_sequence_id = last;
252 
253  TLOG_DEBUG("DataSenderManager") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")" << TLOG_ENDL;
254  TLOG_DEBUG("DataSenderManager") << "There are now " << routing_table_.size() << " entries in the Routing Table" << TLOG_ENDL;
255  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
256  }
257  }
258  }
259 }
260 
// Body of the routing table size accessor: takes routing_mutex_ and returns the number
// of entries currently held in routing_table_.
// NOTE(review): the signature line is absent from this listing (the embedded numbering
// jumps from 260 to 262) -- presumably
// size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const; confirm.
262 {
263  std::unique_lock<std::mutex> lck(routing_mutex_);
264  return routing_table_.size();
265 }
266 
267 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
268 {
269  if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured.
270  if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case
271 
272  if (use_routing_master_)
273  {
274  auto start = std::chrono::steady_clock::now();
275  while (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))
276  {
277  std::unique_lock<std::mutex> lck(routing_mutex_);
278  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
279  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
280  return routing_table_.at(sequence_id);
281  }
282  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
283  {
284  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
285  return routing_table_.at(sent_frag_count_.count());
286  }
287  usleep(routing_timeout_ms_ * 10);
288  }
289  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
290  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
291  TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
292  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
293  }
294  else
295  {
296  TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
297  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
298  }
299  }
300  else {
301  auto index = sequence_id % enabled_destinations_.size();
302  auto it = enabled_destinations_.begin();
303  for (; index > 0; --index)
304  {
305  ++it;
306  if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
307  }
308  return *it;
309  }
311 }
312 
// Send a Fragment to its destination(s) and return `dest`.
// - Broadcast (broadcast_sends_ set, or an EndOfRun/EndOfSubrun/Init Fragment): a copy is
//   sent to every enabled destination.
// - Non-blocking mode: the destination comes from calcDest_ and the Fragment is sent with copyFragment.
// - Otherwise: the destination comes from calcDest_ and the Fragment is sent with moveFragment.
// Throws cet::exception("LogicError") when given an EndOfData Fragment.
// NOTE(review): several statement lines are absent from this listing (embedded lines 314, 330,
// 339, 351, 360, 362, 392 and 395), among them the class qualifier of the signature, the
// declarations of `dest` and `sts`, and some loop/if headers; confirm against the real file.
313 int
315 sendFragment(Fragment&& frag)
316 {
317  // Precondition: Fragment must be complete and consistent (including
318  // header information).
319  auto start_time = std::chrono::steady_clock::now();
320  if (frag.type() == Fragment::EndOfDataFragmentType)
321  {
322  throw cet::exception("LogicError")
323  << "EOD fragments should not be sent on as received: "
324  << "use sendEODFrag() instead.";
325  }
    // Captured up front because frag may be moved from further down.
326  size_t seqID = frag.sequenceID();
327  size_t fragSize = frag.sizeBytes();
328  TLOG_ARB(13, "DataSenderManager") << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize)
329  << ", seqID=" << std::to_string(seqID) << ", type=" << frag.typeString() << TLOG_ENDL;
    // Broadcast path: every enabled destination receives its own copy.
    // NOTE(review): destinations_[bdest] is used without the destinations_.count() check that
    // the other two paths perform.
331  if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
332  {
333  for (auto& bdest : enabled_destinations_)
334  {
335  TLOG_TRACE("DataSenderManager") << "sendFragment: Sending fragment with seqId " << std::to_string(seqID) << " to destination " << bdest << " (broadcast)" << TLOG_ENDL;
336  // Gross, we have to copy.
337  Fragment fragCopy(frag);
338  auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
    // NOTE(review): the header of this retry block (embedded line 339) is missing from the listing.
340  {
341  sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
342  }
343  sent_frag_count_.incSlot(bdest);
344  }
345  }
    // Non-blocking path.
346  else if (non_blocking_mode_)
347  {
    // Ask calcDest_ for a destination, at most routing_retry_count_ times.
348  auto count = routing_retry_count_;
349  while (dest == TransferInterface::RECV_TIMEOUT && count > 0) {
350  dest = calcDest_(seqID);
352  {
353  count--;
354  TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << (count > 0 ? ", retrying." : ".") << TLOG_ENDL;
355  }
356  }
    // Send only to a rank that both has a transfer plugin and is enabled.
357  if (dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest))
358  {
359  TLOG_TRACE("DataSenderManager") << "sendFragment: Sending fragment with seqId " << std::to_string(seqID) << " to destination " << dest << TLOG_ENDL;
361  auto lastWarnTime = std::chrono::steady_clock::now();
    // NOTE(review): the header of this send block (embedded line 362) is missing from the listing.
363  {
364  sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
    // Failure is reported only when at least 1 (presumably seconds) has elapsed since the last report.
365  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
366  {
367  TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
368  lastWarnTime = std::chrono::steady_clock::now();
369  }
370  }
371  //sendFragTo(std::move(frag), dest);
372  sent_frag_count_.incSlot(dest);
373  }
374  else
375  {
376  TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
377  }
378  }
    // Default path: the Fragment is moved into the transfer.
379  else {
380  auto count = routing_retry_count_;
381  while (dest == TransferInterface::RECV_TIMEOUT && count > 0) {
382  dest = calcDest_(seqID);
383  if (dest == TransferInterface::RECV_TIMEOUT) {
384  count--;
385  TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID "
386  << std::to_string(seqID) << ", send number " << sent_frag_count_.count()
387  << (count > 0 ? ", retrying." : ".") << TLOG_ENDL;
388  }
389  }
390  if(dest != TransferInterface::RECV_TIMEOUT && destinations_.count(dest) && enabled_destinations_.count(dest)) {
391  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
393 
394  sts = destinations_[dest]->moveFragment(std::move(frag), send_timeout_us_);
    // NOTE(review): the condition guarding this error message (embedded line 395) is missing from the listing.
396  TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination "
397  << dest << " failed! Data has been lost!" << TLOG_ENDL;
398 
399  //sendFragTo(std::move(frag), dest);
400  sent_frag_count_.incSlot(dest);
401  }
402  else
403  TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest
404  << "! This event has been lost: " << seqID << TLOG_ENDL;
405  }
    // Prune routing table entries that precede the one located by find().
    // NOTE(review): the find() in the condition below runs before routing_mutex_ is taken.
406  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID
407  && routing_table_.find(seqID - 1) != routing_table_.end()) {
408  std::unique_lock<std::mutex> lck(routing_mutex_);
409  routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
410  }
    // NOTE(review): if find() returns end() here, the erase removes every entry -- confirm this is intended.
411  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount) {
412  std::unique_lock<std::mutex> lck(routing_mutex_);
413  routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
414  }
    // Per-destination send metrics, plus routing table metrics when a routing master is in use.
415  if (metricMan) {//&& sent_frag_count_.slotCount(dest) % 100 == 0) {
416  TRACE(5, "sendFragment: sending metrics");
417  auto delta_t = TimeUtils::GetElapsedTime(start_time);
418  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 3, MetricMode::Accumulate);
419  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 3, MetricMode::Accumulate);
420  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 3, MetricMode::Average);
421  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
422  "fragments", 3, MetricMode::LastPoint);
423  if (use_routing_master_) {
424  metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 1, MetricMode::LastPoint);
    // routing_wait_time_ is accumulated in microseconds by calcDest_; reported here in seconds.
425  if (routing_wait_time_ > 0)
426  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 1,
427  MetricMode::Average);
428  }
429  }
430  TRACE(5, "sendFragment: Done sending fragment %zu", seqID);
431  return dest;
432 } // artdaq::DataSenderManager::sendFragment
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:27
A row of the Routing Table.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
int sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
RoutingMasterMode mode
The current mode of the RoutingMaster.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.