artdaq  v2_02_03
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Pages
DataSenderManager.cc
1 #include "artdaq/DAQrate/DataSenderManager.hh"
2 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
3 #include "artdaq/DAQdata/Globals.hh"
4 
5 #include <chrono>
6 #include <canvas/Utilities/Exception.h>
7 #include <arpa/inet.h>
8 #include <netinet/in.h>
9 #include <sys/types.h>
10 #include <poll.h>
11 #include <sys/socket.h>
12 #include "artdaq/Application/Routing/RoutingPacket.hh"
14 
15 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
16  : destinations_()
17  , enabled_destinations_()
18  , sent_frag_count_()
19  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
20  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
21  , routing_master_mode_(detail::RoutingMasterMode::INVALID)
22  , should_stop_(false)
23  , ack_socket_(-1)
24  , table_socket_(-1)
25 {
26  TLOG_DEBUG("DataSenderManager") << "Received pset: " << pset.to_string() << TLOG_ENDL;
27  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
28  use_routing_master_ = rmConfig.get<bool>("use_routing_master", false);
29  table_port_ = rmConfig.get<int>("table_update_port", 35556);
30  table_address_ = rmConfig.get<std::string>("table_update_address", "227.128.12.28");
31  ack_port_ = rmConfig.get<int>("table_acknowledge_port", 35557);
32  ack_address_ = rmConfig.get<std::string>("routing_master_hostname", "localhost");
33  routing_timeout_ms_ = (rmConfig.get<int>("routing_timeout_ms", 1000));
34 
35 
36  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
37  for (auto& d : dests.get_pset_names())
38  {
39  try
40  {
41  auto dd = dests.get<fhicl::ParameterSet>(d).get<int>("destination_rank");
42  destinations_.emplace(dd, MakeTransferPlugin(dests, d, TransferInterface::Role::kSend));
43  }
44  catch (std::invalid_argument)
45  {
46  TRACE(3, "Invalid destination specification: " + d);
47  }
48  catch (cet::exception ex)
49  {
50  TLOG_WARNING("DataSenderManager") << "Caught cet::exception: " << ex.what() << TLOG_ENDL;
51  }
52  catch (...)
53  {
54  TLOG_WARNING("DataSenderManager") << "Non-cet exception while setting up TransferPlugin: " << d << "." << TLOG_ENDL;
55  }
56  }
57  if (destinations_.size() == 0)
58  {
59  TLOG_ERROR("DataSenderManager") << "No destinations specified!" << TLOG_ENDL;
60  }
61  else
62  {
63  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
64  if (enabled_dests.size() == 0)
65  {
66  TLOG_INFO("DataSenderManager") << "enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
67  for (auto& d : destinations_)
68  {
69  enabled_destinations_.insert(d.first);
70  }
71  }
72  else
73  {
74  for (auto& d : enabled_dests)
75  {
76  enabled_destinations_.insert(d);
77  }
78  }
79  }
80  if (use_routing_master_) startTableReceiverThread_();
81 }
82 
// Shutdown body. The signature line that introduces this body is absent from this listing.
// NOTE(review): presumably this is ~DataSenderManager() - confirm against the original file.
// Steps performed: raise should_stop_, hand the Fragment produced by Fragment::eodFrag(...) to
// every enabled destination that has a transfer plugin, join the routing thread if it is
// joinable, then log the value of count().
84 {
85  TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
// receiveTableUpdatesLoop_() tests this flag at the top of each iteration and returns when it is set.
86  should_stop_ = true;
87  for (auto& dest : enabled_destinations_)
88  {
// An enabled rank without a plugin in destinations_ is skipped.
89  if (destinations_.count(dest))
90  {
// eodFrag() is given this destination's slot count from sent_frag_count_.
// NOTE(review): the value returned by moveFragment() is ignored here.
91  destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
92  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
93  }
94  }
// The receive loop polls with a 1000 ms timeout, so this join may wait up to about that long.
95  if (routing_thread_.joinable()) routing_thread_.join();
96  TLOG_DEBUG("DataSenderManager") << "Shutting down DataSenderManager END. Sent " << count() << " fragments." << TLOG_ENDL;
97 }
98 
99 
100 void artdaq::DataSenderManager::setupTableListener_()
101 {
102  table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
103  if (!table_socket_)
104  {
105  TLOG_ERROR("DataSenderManager") << "Error creating socket for receiving table updates!" << TLOG_ENDL;
106  exit(1);
107  }
108 
109  struct sockaddr_in si_me_request;
110 
111  int yes = 1;
112  if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
113  {
114  TLOG_ERROR("DataSenderManager") << " Unable to enable port reuse on request socket" << TLOG_ENDL;
115  exit(1);
116  }
117  memset(&si_me_request, 0, sizeof(si_me_request));
118  si_me_request.sin_family = AF_INET;
119  si_me_request.sin_port = htons(table_port_);
120  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
121  if (bind(table_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
122  {
123  TLOG_ERROR("DataSenderManager") << "Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
124  exit(1);
125  }
126 
127  struct ip_mreq mreq;
128  int sts = ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
129  if (sts == -1)
130  {
131  TLOG_ERROR("DataSenderManager") << "Unable to resolve multicast address for table updates" << TLOG_ENDL;
132  exit(1);
133  }
134  mreq.imr_interface.s_addr = htonl(INADDR_ANY);
135  if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
136  {
137  TLOG_ERROR("DataSenderManager") << "Unable to join multicast group" << TLOG_ENDL;
138  exit(1);
139  }
140 }
141 void artdaq::DataSenderManager::startTableReceiverThread_()
142 {
143  if (routing_thread_.joinable()) routing_thread_.join();
144  TLOG_INFO("DataSenderManager") << "Starting Routing Thread" << TLOG_ENDL;
145  routing_thread_ = std::thread(&DataSenderManager::receiveTableUpdatesLoop_, this);
146 }
// Thread body started by startTableReceiverThread_(): receives routing table updates until
// should_stop_ is set.
// Each iteration: opens the table and acknowledgement sockets if they are still -1, polls the
// table socket for up to 1000 ms, and when data is ready reads a RoutingPacketHeader followed by
// the packet entries, merges new entries into routing_table_ and sends a RoutingAckPacket back.
// NOTE(review): the embedded numbering of this listing skips lines 188, 203 and 244, so the
// declarations of `hdr`, `buffer` and `ack` used below are not visible here.
// NOTE(review): no lock on routing_mutex_ is visible in this function although routing_table_
// is modified here and is read under that mutex in calcDest_() - confirm against the original file.
147 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
148 {
149  while (true)
150  {
151  if (should_stop_)
152  {
153  TLOG_DEBUG("DataSenderManager") << "receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ << ", stopping" << TLOG_ENDL;
154  return;
155  }
156 
157  TRACE(4, "DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
// Lazily open the multicast listener on the first pass.
158  if (table_socket_ == -1)
159  {
160  TLOG_DEBUG("DataSenderManager") << "Opening table listener socket" << TLOG_ENDL;
161  setupTableListener_();
162  }
// Still -1 after the setup attempt: give up and end the thread.
163  if (table_socket_ == -1)
164  {
165  TLOG_DEBUG("DataSenderManager") << "The listen socket was not opened successfully." << TLOG_ENDL;
166  return;
167  }
// Lazily create the UDP socket used for acknowledgements and resolve the peer address.
// NOTE(review): the result of socket() is not checked before use.
168  if (ack_socket_ == -1)
169  {
170  ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
171  auto sts = ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
172  if (sts == -1)
173  {
174  TLOG_ERROR("DataSenderManager") << "Unable to resolve routing_master_address" << TLOG_ENDL;
175  exit(1);
176  }
177  TLOG_DEBUG("DataSenderManager") << "Ack socket is fd " << ack_socket_ << TLOG_ENDL;
178  }
179 
// NOTE(review): fd.revents is not initialised here and is not inspected after poll().
180  struct pollfd fd;
181  fd.fd = table_socket_;
182  fd.events = POLLIN | POLLPRI;
183 
// The 1000 ms timeout bounds how long the loop goes without re-checking should_stop_.
184  auto res = poll(&fd, 1, 1000);
185  if (res > 0) {
186  auto first = artdaq::Fragment::InvalidSequenceID;
187  auto last = artdaq::Fragment::InvalidSequenceID;
189 
190  TLOG_DEBUG("DataSenderManager") << "Going to receive RoutingPacketHeader" << TLOG_ENDL;
// NOTE(review): stss is only logged; a short or failed read is not handled before hdr is used.
191  auto stss = recvfrom(table_socket_, &hdr, sizeof(artdaq::detail::RoutingPacketHeader), 0, NULL, NULL);
192  TLOG_DEBUG("DataSenderManager") << "Received " << std::to_string(stss) << " bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
193 
194  TLOG_DEBUG("DataSenderManager") << "Checking for valid header" << TLOG_ENDL;
// Packets whose header does not carry ROUTING_MAGIC are ignored.
195  if (hdr.header == ROUTING_MAGIC) {
// Once a mode has been recorded, a packet announcing a different mode terminates the process.
196  if(routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
197  {
198  TLOG_ERROR("DataSenderManager") << "Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
199  exit(1);
200  }
201  routing_master_mode_ = hdr.mode;
202 
204  TLOG_DEBUG("DataSenderManager") << "Receiving data buffer" << TLOG_ENDL;
205  auto sts = recv(table_socket_, &buffer[0], sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries, 0);
// NOTE(review): this size check is an assert, so it disappears when NDEBUG is defined.
206  assert(static_cast<size_t>(sts) == sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries);
207  TRACE(6, "Received a packet of %zu bytes", sts);
208 
// NOTE(review): buffer[0] and buffer[buffer.size() - 1] presume a non-empty buffer; its
// declaration and size are not visible in this listing.
209  first = buffer[0].sequence_id;
210  last = buffer[buffer.size() - 1].sequence_id;
211 
// The first and last sequence IDs must span exactly hdr.nEntries consecutive values;
// otherwise the packet is dropped without an acknowledgement.
212  if (first + hdr.nEntries - 1 != last)
213  {
214  TLOG_ERROR("DataSenderManager") << "Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!" << TLOG_ENDL;
215  continue;
216  }
217  auto thisSeqID = first;
218 
// If the last entry is already in the table, the entries are not processed again;
// the acknowledgement below is still sent.
219  if (routing_table_.count(last) == 0) {
220  for (auto entry : buffer)
221  {
// Entries must arrive in consecutive sequence-ID order; on the first gap processing stops
// and `last` is pulled back so the acknowledgement covers only the entries handled so far.
222  if (thisSeqID != entry.sequence_id)
223  {
224  TLOG_ERROR("DataSenderManager") << "Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!" << TLOG_ENDL;
225  last = thisSeqID - 1;
226  break;
227  }
228  thisSeqID++;
// An entry that is already present is never overwritten; a conflicting rank is only reported.
229  if (routing_table_.count(entry.sequence_id))
230  {
231  if (routing_table_[entry.sequence_id] != entry.destination_rank)
232  {
233  TLOG_ERROR("DataSenderManager") << "Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
234  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
235  << " I will use the original value!" << TLOG_ENDL;
236  }
237  continue;
238  }
239  routing_table_[entry.sequence_id] = entry.destination_rank;
240  TLOG_DEBUG("DataSenderManager") << "DataSenderManager " << std::to_string(my_rank) << ": received update: SeqID " << std::to_string(entry.sequence_id) << " -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
241  }
242  }
243 
// Report this rank and the range of sequence IDs that was accepted.
245  ack.rank = my_rank;
246  ack.first_sequence_id = first;
247  ack.last_sequence_id = last;
248 
249  TLOG_DEBUG("DataSenderManager") << "Sending RoutingAckPacket with first= " << std::to_string(first) << " and last= " << std::to_string(last) << " to " << ack_address_ << ", port " << ack_port_ << " (my_rank = " << my_rank << ")"<< TLOG_ENDL;
250  TLOG_DEBUG("DataSenderManager") << "There are now " << routing_table_.size() << " entries in the Routing Table" << TLOG_ENDL;
// NOTE(review): the return value of sendto() is ignored.
251  sendto(ack_socket_, &ack, sizeof(artdaq::detail::RoutingAckPacket), 0, (struct sockaddr *)&ack_addr_, sizeof(ack_addr_));
252  }
253  }
254  }
255 }
256 
// Returns routing_table_.size(), read while holding routing_mutex_.
// NOTE(review): the signature line for this body is absent from this listing; presumably it is
// `size_t artdaq::DataSenderManager::GetRoutingTableEntryCount() const` - confirm against the original file.
258 {
259  std::unique_lock<std::mutex> lck(routing_mutex_);
260  return routing_table_.size();
261 }
262 
// Choose the destination rank for a Fragment with the given sequence ID.
//  - No enabled destinations: returns TransferInterface::RECV_TIMEOUT.
//  - Exactly one enabled destination: returns it.
//  - use_routing_master_ set: waits for a matching routing_table_ entry, keyed by sequence_id in
//    RouteBySequenceID mode or by sent_frag_count_.count() in RouteBySendCount mode, and logs an
//    error if the wait loop ends without one. A routing_timeout_ms_ <= 0 makes the loop unbounded.
//  - Otherwise: picks the enabled destination at position (sequence_id % number of enabled
//    destinations) in the iteration order of enabled_destinations_.
// NOTE(review): the embedded numbering of this listing skips line 306, so no return statement is
// visible for the routing-master timeout path - check the original file.
263 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
264 {
265  if (enabled_destinations_.size() == 0) return TransferInterface::RECV_TIMEOUT; // No destinations configured.
266  if (enabled_destinations_.size() == 1) return *enabled_destinations_.begin(); // Trivial case
267 
268  if (use_routing_master_)
269  {
270  auto start = std::chrono::steady_clock::now();
271  while (routing_timeout_ms_ <= 0 || std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count() < routing_timeout_ms_)
272  {
273  std::unique_lock<std::mutex> lck(routing_mutex_);
// On a hit, the elapsed wait is added to routing_wait_time_ in nanoseconds before returning.
274  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
275  routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
276  return routing_table_.at(sequence_id);
277  }
278  else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
279  {
280  routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
281  return routing_table_.at(sent_frag_count_.count());
282  }
// usleep() takes microseconds, so this sleeps for one hundredth of the configured timeout.
// NOTE(review): lck is still in scope here, so routing_mutex_ is held for the whole sleep.
// NOTE(review): a negative routing_timeout_ms_ makes this argument negative - confirm intended.
283  usleep(routing_timeout_ms_ * 10);
284  }
// The wait loop ended without a table entry: record the wait and report it.
285  routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
// NOTE(review): routing_master_mode_ is read here after lck has gone out of scope.
286  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
287  TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
288  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
289  }
290  else
291  {
292  TLOG_ERROR("DataSenderManager") << "Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
293  << " and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) << ")!" << TLOG_ENDL;
294  }
295  }
296  else {
// Walk the iterator forward `index` steps. index is smaller than the container size, so the
// wrap-around test inside the loop is only a safeguard.
297  auto index = sequence_id % enabled_destinations_.size();
298  auto it = enabled_destinations_.begin();
299  for (; index > 0; --index)
300  {
301  ++it;
302  if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
303  }
304  return *it;
305  }
307 }
308 
// Send one Fragment to its destination(s) and return the value held in `dest`.
//  - Throws cet::exception("LogicError") when frag.type() is Fragment::EndOfDataFragmentType.
//  - When broadcast_sends_ is set, or the Fragment type is EndOfRun, EndOfSubrun or Init, a copy of
//    the Fragment is passed to copyFragment() of every enabled destination.
//  - Otherwise the destination comes from calcDest_(seqID); the Fragment is passed to
//    copyFragment() when non_blocking_mode_ is set and to moveFragment() when it is not.
//  - Afterwards entries are erased from the front of routing_table_ and, when metricMan is set,
//    send metrics are reported.
// NOTE(review): the embedded numbering of this listing skips lines 310, 325, 334, 345, 353, 355,
// 376, 384 and 386. The text between the return type and the function name (presumably the
// artdaq::DataSenderManager:: qualifier), the declarations of `dest` and `sts`, and the statements
// that control several of the brace blocks below are therefore not visible - check the original file.
309 int
311 sendFragment(Fragment&& frag)
312 {
313  // Precondition: Fragment must be complete and consistent (including
314  // header information).
315  auto start_time = std::chrono::steady_clock::now();
316  if (frag.type() == Fragment::EndOfDataFragmentType)
317  {
318  throw cet::exception("LogicError")
319  << "EOD fragments should not be sent on as received: "
320  << "use sendEODFrag() instead.";
321  }
// Captured up front so they remain available after frag has been handed to moveFragment().
322  size_t seqID = frag.sequenceID();
323  size_t fragSize = frag.sizeBytes();
324  TLOG_ARB(13, "DataSenderManager") << "sendFragment start frag.fragmentHeader()=" << std::hex << (void*)(frag.headerBeginBytes()) << ", szB=" << std::dec << std::to_string(fragSize) << ", seqID=" << std::to_string(seqID) << TLOG_ENDL;
// Broadcast path: every enabled destination receives its own copy.
// NOTE(review): no assignment to `dest` is visible on this path, yet `dest` is used for the
// metric names and the return value below.
326  if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
327  {
328  for (auto& bdest : enabled_destinations_)
329  {
330  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d (broadcast)", seqID, bdest);
331  // Gross, we have to copy.
332  Fragment fragCopy(frag);
// NOTE(review): destinations_[bdest] is used without checking that a plugin exists for bdest.
333  auto sts = destinations_[bdest]->copyFragment(fragCopy);
// NOTE(review): the statement controlling this block (listing line 334) is not visible; the block
// re-issues copyFragment() for the same destination.
335  {
336  sts = destinations_[bdest]->copyFragment(fragCopy);
337  }
338  sent_frag_count_.incSlot(bdest);
339  }
340  }
341  else if (non_blocking_mode_)
342  {
// Keep asking calcDest_() until it returns something other than RECV_TIMEOUT.
343  while (dest == TransferInterface::RECV_TIMEOUT) {
344  dest = calcDest_(seqID);
// NOTE(review): the condition guarding this warning (listing line 345) is not visible.
346  {
347  TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", retrying." << TLOG_ENDL;
348  }
349  }
// The rank must both have a plugin and be enabled; otherwise the Fragment is dropped with a warning.
350  if (destinations_.count(dest) && enabled_destinations_.count(dest))
351  {
352  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
354  auto lastWarnTime = std::chrono::steady_clock::now();
// NOTE(review): the statement controlling this block (listing line 355) is not visible.
356  {
357  sts = destinations_[dest]->copyFragment(frag);
// The failure message is emitted only when at least one second has passed since lastWarnTime.
358  if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
359  {
360  TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
361  lastWarnTime = std::chrono::steady_clock::now();
362  }
363  }
364  //sendFragTo(std::move(frag), dest);
365  sent_frag_count_.incSlot(dest);
366  }
367  else
368  {
369  TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
370  }
371  }
372  else
373  {
// Same destination lookup as the non-blocking branch above.
374  while (dest == TransferInterface::RECV_TIMEOUT) {
375  dest = calcDest_(seqID);
// NOTE(review): the condition guarding this warning (listing line 376) is not visible.
377  {
378  TLOG_WARNING("DataSenderManager") << "Could not get destination for seqID " << std::to_string(seqID) << ", send number " << sent_frag_count_.count() << ", retrying." << TLOG_ENDL;
379  }
380  }
381  if (destinations_.count(dest) && enabled_destinations_.count(dest))
382  {
383  TRACE(5, "DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
385  auto lastWarnTime = std::chrono::steady_clock::now();
// NOTE(review): the statement controlling this block (listing line 386) is not visible. If it is
// a retry loop, frag is passed through std::move on every pass - confirm that a failed
// moveFragment() leaves frag usable.
387  {
388  sts = destinations_[dest]->moveFragment(std::move(frag));
389  if (sts != TransferInterface::CopyStatus::kSuccess && std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - lastWarnTime).count() >= 1)
390  {
391  TLOG_ERROR("DataSenderManager") << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying..." << TLOG_ENDL;
392  lastWarnTime = std::chrono::steady_clock::now();
393  }
394  }
395  //sendFragTo(std::move(frag), dest);
396  sent_frag_count_.incSlot(dest);
397  }
398  else
399  {
400  TLOG_WARNING("DataSenderManager") << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID << TLOG_ENDL;
401  }
402  }
// Trim routing_table_ entries that are no longer needed.
// NOTE(review): the find() in this condition reads routing_table_ before routing_mutex_ is taken.
403  if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID - 1) != routing_table_.end())
404  {
405  std::unique_lock<std::mutex> lck(routing_mutex_);
// Erases the range from begin() up to, but not including, the entry for seqID - 1.
406  routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
407  }
408  else if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
409  {
410  std::unique_lock<std::mutex> lck(routing_mutex_);
// NOTE(review): assuming standard associative-container semantics, find() returns end() when the
// key is absent, in which case this erases the whole table - confirm intended.
411  routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
412  }
413  if (metricMan)
414  {//&& sent_frag_count_.slotCount(dest) % 100 == 0) {
// delta_t is the time since entry to this function, in seconds, as a double.
415  auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
416  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 1);
417  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 1);
418  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 1);
419  if (use_routing_master_) {
// NOTE(review): routing_table_.size() is read here without holding routing_mutex_.
420  metricMan->sendMetric("Routing Table Size", routing_table_.size(), "events", 1);
421  if (routing_wait_time_ > 0)
422  {
// The accumulated wait (added in nanoseconds by calcDest_()) is read and then reset as two
// separate operations.
423  size_t wttemp = routing_wait_time_;
424  routing_wait_time_ = 0;
// NOTE(review): integer division - any accumulated wait below one second is reported as 0.
425  metricMan->sendMetric("Routing Wait Time", wttemp / 1000000000, "s", 1);
426  }
427  }
428  }
429  TRACE(5, "DataSenderManager::sendFragment: Done sending fragment %zu", seqID);
430  return dest;
431 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:29
A row of the Routing Table.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
int sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
The header of the Routing Table, containing the magic bytes and the number of entries.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
size_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.