artdaq  v2_03_02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Pages
DataSenderManager.cc
#include "artdaq/DAQrate/DataSenderManager.hh"
#include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
#include "artdaq/DAQdata/Globals.hh"

#include <chrono>
#include "canvas/Utilities/Exception.h"
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/types.h>
#include <poll.h>
#include <sys/socket.h>
#include <unistd.h>
#include "artdaq/Application/Routing/RoutingPacket.hh"
14 
15 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
16  : destinations_()
17  , enabled_destinations_()
18  , sent_frag_count_()
19  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
20  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
21  , routing_master_mode_(detail::RoutingMasterMode::INVALID)
22  , should_stop_(false)
23  , ack_socket_(-1)
24  , table_socket_(-1)
25 {
26  TLOG_DEBUG("DataSenderManager") << "Received pset: " << pset.to_string() << TLOG_ENDL;
27  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
28  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
29  table_port_ = rmConfig.get<int>("table_update_port", 35556);
30  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
31  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
32  ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
33  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
34 
35 
36  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
37  for (auto& d : dests.get_pset_names())
38  {
39  try
40  {
41  auto transfer = MakeTransferPlugin(dests, d, TransferInterface::Role::kSend);
42  auto destination_rank = transfer->destination_rank();
43  destinations_.emplace( destination_rank, std::move(transfer));
44  }
45  catch (std::invalid_argument)
46  {
47  TRACE(3, "Invalid destination specification: " + d);
48  }
49  catch (cet::exception ex)
50  {
51  TLOG_WARNING("DataSenderManager") << "Caught cet::exception: " << ex.what() << TLOG_ENDL;
52  }
53  catch (...)
54  {
55  TLOG_WARNING("DataSenderManager") << "Non-cet exception while setting up TransferPlugin: " << d << "." << TLOG_ENDL;
56  }
57  }
58  if (destinations_.size() == 0)
59  {
60  TLOG_ERROR("DataSenderManager") << "No destinations specified!" << TLOG_ENDL;
61  }
62  else
63  {
64  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
65  if (enabled_dests.size() == 0)
66  {
67  TLOG_INFO("DataSenderManager") << "enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
68  for (auto& d : destinations_)
69  {
70  enabled_destinations_.insert(d.first);
71  }
72  }
73  else
74  {
75  for (auto& d : enabled_dests)
76  {
77  enabled_destinations_.insert(d);
78  }
79  }
80  }
81  if (use_routing_master_) startTableReceiverThread_();
82 }
83 
85 {
86  TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
87  should_stop_ = true;
88  for (auto& dest : enabled_destinations_)
89  {
90  if (destinations_.count(dest))
91  {
92  destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
93  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
94  }
95  }
96  if (routing_thread_.joinable()) routing_thread_.join();
97  TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager END. Sent " << count() << " fragments." << TLOG_ENDL;
98 }
99 
100 
101 void artdaq::DataSenderManager::setupTableListener_()
102 {
103  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
104  if (!table_socket_)
105  {
106  TLOG_ERROR("DataSenderManager") << "Error creating socket for receiving table updates!" << TLOG_ENDL;
107  exit(1);
108  }
109 
110  struct sockaddr_in si_me_request;
111 
112  int yes = 1;
113  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
114  {
115  TLOG_ERROR("DataSenderManager") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
116  exit(1);
117  }
118  memset(&si_me_request, 0, sizeof(si_me_request));
119  si_me_request.sin_family = AF_INET;
120  si_me_request.sin_port = htons(table_port_);
121  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
122  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
123  {
124  TLOG_ERROR("DataSenderManager") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
125  exit(1);
126  }
127 
128  struct ip_mreq mreq;
129  int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
130  if (sts == -1)
131  {
132  TLOG_ERROR("DataSenderManager") << "Unable to resolve multicast address for table updates" << TLOG_ENDL;
133  exit(1);
134  }
135  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
136  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
137  {
138  TLOG_ERROR("DataSenderManager") << "Unable to join multicast group" << TLOG_ENDL;
139  exit(1);
140  }
141 }
142 void artdaq::DataSenderManager::startTableReceiverThread_()
143 {
144  if (routing_thread_.joinable()) routing_thread_.join();
145  TLOG_INFO("DataSenderManager") << "Starting Routing Thread" << TLOG_ENDL;
146  routing_thread_ = std::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
147 }
148 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
149 {
150  while (true)
151  {
152  if (should_stop_)
153  {
154  TLOG_DEBUG("DataSenderManager") << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping" << TLOG_ENDL;
155  return;
156  }
157 
158  TRACE(4, "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
159  if (table_socket_ == -1)
160  {
161  TLOG_DEBUG("DataSenderManager") << "Opening table listener socket" << TLOG_ENDL;
162  setupTableListener_();
163  }
164  if (table_socket_ == -1)
165  {
166  TLOG_DEBUG("DataSenderManager") << "The listen socket was not opened successfully." << TLOG_ENDL;
167  return;
168  }
169  if (ack_socket_ == -1)
170  {
171  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
172  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
173  if (sts == -1)
174  {
175  TLOG_ERROR("DataSenderManager") << "Unable to resolve routing_master_address" << TLOG_ENDL;
176  exit(1);
177  }
178  TLOG_DEBUG("DataSenderManager") << "Ack socket is fd " << ack_socket_ << TLOG_ENDL;
179  }
180 
181  struct pollfd fd;
182  fd.fd = table_socket_;
183  fd.events = POLLIN | POLLPRI;
184 
185  auto res = poll(&fd, 1, 1000);
186  if (res > 0) {
187  auto first = artdaq::Fragment::InvalidSequenceID;
188  auto last = artdaq::Fragment::InvalidSequenceID;
190 
191  TLOG_DEBUG("DataSenderManager") << "Going to receive RoutingPacketHeader" << TLOG_ENDL;
192  auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
193  TLOG_DEBUG("DataSenderManager") << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
194 
195  TLOG_DEBUG("DataSenderManager") << "Checking for valid header" << TLOG_ENDL;
196  if (hdr.header == ROUTING_MAGIC) {
197  if(routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
198  {
199  TLOG_ERROR("DataSenderManager") << "Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
200  exit(1);
201  }
202  routing_master_mode_ = hdr.mode;
203 
205  TLOG_DEBUG("DataSenderManager") << "Receiving data buffer" << TLOG_ENDL;
206  auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
207  assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
208  TRACE(6, "Received a packet of %zu bytes", sts);
209 
210  first = buffer[0].sequence_id;
211  last = buffer[buffer.size() - 1].sequence_id;
212 
213  if (first + hdr.nEntries - 1 != last)
214  {
215  TLOG_ERROR("DataSenderManager") << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!" << TLOG_ENDL;
216  continue;
217  }
218  auto thisSeqID = first;
219 
220  if (routing_table_.count(last) == 0) {
221  for (auto entry : buffer)
222  {
223  if (thisSeqID != entry.sequence_id)
224  {
225  TLOG_ERROR("DataSenderManager") << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!" << TLOG_ENDL;
226  last = thisSeqID - 1;
227  break;
228  }
229  thisSeqID++;
230  if (routing_table_.count(entry.sequence_id))
231  {
232  if (routing_table_[entry.sequence_id] != entry.destination_rank)
233  {
234  TLOG_ERROR("DataSenderManager") << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
235  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
236  << " I will use the original value!" << TLOG_ENDL;
237  }
238  continue;
239  }
240  routing_table_[entry.sequence_id] = entry.destination_rank;
241  TLOG_DEBUG("DataSenderManager") << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
242  }
243  }
244 
246  ack.rank = my_rank;
247  ack.first_sequence_id = first;
248  ack.last_sequence_id = last;
249 
250  TLOG_DEBUG("DataSenderManager") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")"<< TLOG_ENDL;
251  TLOG_DEBUG("DataSenderManager") << "There are now " << routing_table_.size() << " entries in the Routing Table" << TLOG_ENDL;
252  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
253  }
254  }
255  }
256 }
257 
259 {
260  std::unique_lock<std::mutex> lck(routing_mutex_);
261  return routing_table_.size();
262 }
263 
264 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
265 {
266  if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured.
267  if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case
268 
269  if (use_routing_master_)
270  {
271  auto start = std::chrono::steady_clock::now();
272  while (routing_timeout_ms_ <= 0 || std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count() < routing_timeout_ms_)
273  {
274  std::unique_lock<std::mutex> lck(routing_mutex_);
275  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
276  routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
277  return routing_table_.at(sequence_id);
278  }
279  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
280  {
281  routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
282  return routing_table_.at(sent_frag_count_.count());
283  }
284  usleep(routing_timeout_ms_ * 10);
285  }
286  routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
287  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
288  TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
289  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
290  }
291  else
292  {
293  TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
294  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
295  }
296  }
297  else {
298  auto index = sequence_id % enabled_destinations_.size();
299  auto it = enabled_destinations_.begin();
300  for (; index > 0; --index)
301  {
302  ++it;
303  if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
304  }
305  return *it;
306  }
308 }
309 
// sendFragment: send one Fragment to its destination(s).
//
// frag    - the Fragment to send. It is copied in the broadcast and non-blocking
//           paths (copyFragment) and moved from in the default path (moveFragment).
// returns - the local variable `dest`.
// throws  - cet::exception("LogicError") when frag is an EndOfData Fragment.
//
// NOTE(review): this listing skips numbered lines 311, 326, 335, 346, 354, 356,
// 377, 385 and 387. The declarations of `dest` and `sts` and the headers of the
// brace blocks that retry a send are therefore not visible here; confirm the
// notes below against the real file.
310 int
312 sendFragment(Fragment&& frag)
313 {
314  // Precondition: Fragment must be complete and consistent (including
315  // header information).
316  auto start_time = std::chrono::steady_clock::now();
317  if (frag.type() == Fragment::EndOfDataFragmentType)
318  {
319  throw cet::exception("LogicError")
320  << "EOD fragments should not be sent on as received: "
321  << "use sendEODFrag() instead.";
322  }
323  size_t seqID = frag.sequenceID();
324  size_t fragSize = frag.sizeBytes();
325  TLOG_ARB(13, "DataSenderManager") << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize) << ", seqID=" << std::to_string(seqID) << TLOG_ENDL;
 // Path 1 - broadcast: taken when broadcast_sends_ is set or the Fragment is an
 // EndOfRun, EndOfSubrun or Init Fragment. Every enabled destination gets its own copy.
 // NOTE(review): destinations_[bdest] is used without the destinations_.count()
 // check the other two paths perform - confirm every enabled rank has a plugin.
327  if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
328  {
329  for (auto& bdest : enabled_destinations_)
330  {
331  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d (broadcast)", seqID, bdest);
332  // Gross, we have to copy.
333  Fragment fragCopy(frag);
334  auto sts = destinations_[bdest]->copyFragment(fragCopy);
 // NOTE(review): the statement heading this block (line 335) is missing from
 // the listing; presumably it repeats the copy until it succeeds - verify.
336  {
337  sts = destinations_[bdest]->copyFragment(fragCopy);
338  }
339  sent_frag_count_.incSlot(bdest);
340  }
341  }
 // Path 2 - non-blocking mode: ask calcDest_() for a rank until it stops
 // returning RECV_TIMEOUT, then hand the Fragment over with copyFragment().
342  else if (non_blocking_mode_)
343  {
344  while (dest == TransferInterface::RECV_TIMEOUT) {
345  dest = calcDest_(seqID);
 // NOTE(review): the condition guarding this warning (line 346) is missing from the listing.
347  {
348  TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", retrying." << TLOG_ENDL;
349  }
350  }
351  if (destinations_.count(dest) && enabled_destinations_.count(dest))
352  {
353  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
355  auto lastWarnTime = std::chrono::steady_clock::now();
 // NOTE(review): the declaration of sts (line 354) and the loop header (line
 // 356) are missing from the listing. Inside the block a failure is logged
 // only if at least one second has passed since the previous log.
357  {
358  sts = destinations_[dest]->copyFragment(frag);
359  if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
360  {
361  TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
362  lastWarnTime = std::chrono::steady_clock::now();
363  }
364  }
365  //sendFragTo(std::move(frag), dest);
366  sent_frag_count_.incSlot(dest);
367  }
368  else
369  {
370  TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
371  }
372  }
 // Path 3 - default: same structure as path 2, but the Fragment is handed over
 // with moveFragment(std::move(frag)).
 // NOTE(review): if the hidden loop header retries, frag is moved from again on
 // each attempt - confirm moveFragment leaves frag usable when it fails.
373  else
374  {
375  while (dest == TransferInterface::RECV_TIMEOUT) {
376  dest = calcDest_(seqID);
 // NOTE(review): the condition guarding this warning (line 377) is missing from the listing.
378  {
379  TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", send number " << sent_frag_count_.count() << ", retrying." << TLOG_ENDL;
380  }
381  }
382  if (destinations_.count(dest) && enabled_destinations_.count(dest))
383  {
384  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
386  auto lastWarnTime = std::chrono::steady_clock::now();
 // NOTE(review): the declaration of sts (line 385) and the loop header (line
 // 387) are missing from the listing.
388  {
389  sts = destinations_[dest]->moveFragment(std::move(frag));
390  if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
391  {
392  TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
393  lastWarnTime = std::chrono::steady_clock::now();
394  }
395  }
396  //sendFragTo(std::move(frag), dest);
397  sent_frag_count_.incSlot(dest);
398  }
399  else
400  {
401  TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
402  }
403  }
 // Prune routing table entries that precede the one just used.
 // By sequence ID: erase everything before the entry for seqID - 1. The find()
 // in the condition below runs before routing_mutex_ is taken.
 // By send count: erase everything before the entry for the current send count.
 // NOTE(review): erase(begin(), find(k)) removes every entry when find() returns
 // end(); in the send-count branch that is not checked first - confirm this is intended.
404  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID - 1) != routing_table_.end())
405  {
406  std::unique_lock<std::mutex> lck(routing_mutex_);
407  routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
408  }
409  else if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
410  {
411  std::unique_lock<std::mutex> lck(routing_mutex_);
412  routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
413  }
 // Report send time, size and rate for this call when a metric manager is
 // available, plus routing table size and accumulated routing wait time when a
 // routing master is in use.
 // NOTE(review): the broadcast path never assigns dest in the visible code, so
 // there the "to Rank" metric names carry dest's initial value - verify.
414  if (metricMan)
415  {//&& sent_frag_count_.slotCount(dest) % 100 == 0) {
416  auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
417  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 1);
418  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 1);
419  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 1);
420  if (use_routing_master_) {
421  metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 1);
422  if (routing_wait_time_ > 0)
423  {
 // Read the accumulated wait time (nanoseconds, as added by calcDest_) and reset it.
 // The division below is integer division, so the value is reported in whole seconds.
424  size_t wttemp = routing_wait_time_;
425  routing_wait_time_ = 0;
426  metricMan->sendMetric("Routing Wait Time", wttemp / 1000000000, "s", 1);
427  }
428  }
429  }
430  TRACE(5, "DataSenderManager::sendFragment: Done sending fragment %zu", seqID);
431  return dest;
432 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:29
A row of the Routing Table.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
int sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.