artdaq  v3_12_02
DataSenderManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQdata/HostMap.hh"
4 #include "artdaq/DAQrate/DataSenderManager.hh"
5 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
6 
7 #include <arpa/inet.h>
8 #include <netinet/in.h>
9 #include <poll.h>
10 #include <sys/socket.h>
11 #include <sys/types.h>
12 #include <chrono>
14 #include "canvas/Utilities/Exception.h"
15 
16 artdaq::DataSenderManager::DataSenderManager(const fhicl::ParameterSet& pset)
17  : sent_frag_count_()
18  , broadcast_sends_(pset.get<bool>("broadcast_sends", false))
19  , non_blocking_mode_(pset.get<bool>("nonblocking_sends", false))
20  , send_timeout_us_(pset.get<size_t>("send_timeout_usec", 5000000))
21  , send_retry_count_(pset.get<size_t>("send_retry_count", 2))
22  , should_stop_(false)
23  , highest_sequence_id_routed_(0)
24 {
25  TLOG(TLVL_DEBUG + 32) << "Received pset: " << pset.to_string();
26 
27  // Validate parameters
28  if (send_timeout_us_ == 0)
29  {
30  send_timeout_us_ = std::numeric_limits<size_t>::max();
31  }
32 
33  auto rmConfig = pset.get<fhicl::ParameterSet>("routing_table_config", fhicl::ParameterSet());
34  table_receiver_.reset(new TableReceiver(rmConfig));
35 
36  hostMap_t host_map = MakeHostMap(pset);
37  auto tcp_send_buffer_size = pset.get<size_t>("tcp_send_buffer_size", 0);
38  auto max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
39 
40  auto dests = pset.get<fhicl::ParameterSet>("destinations", fhicl::ParameterSet());
41  for (auto& d : dests.get_pset_names())
42  {
43  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
44  host_map = MakeHostMap(dest_pset, host_map);
45  }
46  auto host_map_pset = MakeHostMapPset(host_map);
47  fhicl::ParameterSet dests_mod;
48  for (auto& d : dests.get_pset_names())
49  {
50  auto dest_pset = dests.get<fhicl::ParameterSet>(d);
51  dest_pset.erase("host_map");
52  dest_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
53 
54  if (tcp_send_buffer_size != 0 && !dest_pset.has_key("tcp_send_buffer_size"))
55  {
56  dest_pset.put<size_t>("tcp_send_buffer_size", tcp_send_buffer_size);
57  }
58  if (max_fragment_size_words != 0 && !dest_pset.has_key("max_fragment_size_words"))
59  {
60  dest_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
61  }
62 
63  dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
64  }
65 
66  for (auto& d : dests_mod.get_pset_names())
67  {
68  try
69  {
70  auto transfer = MakeTransferPlugin(dests_mod, d, TransferInterface::Role::kSend);
71  auto destination_rank = transfer->destination_rank();
72  destinations_.emplace(destination_rank, std::move(transfer));
73  }
74  catch (const std::invalid_argument&)
75  {
76  TLOG(TLVL_DEBUG + 32) << "Invalid destination specification: " << d;
77  }
78  catch (const cet::exception& ex)
79  {
80  TLOG(TLVL_WARNING) << "Caught cet::exception: " << ex.what();
81  }
82  catch (...)
83  {
84  TLOG(TLVL_WARNING) << "Non-cet exception while setting up TransferPlugin: " << d << ".";
85  }
86  }
87  if (destinations_.empty())
88  {
89  TLOG(TLVL_ERROR) << "No destinations specified!";
90  }
91  else
92  {
93  auto enabled_dests = pset.get<std::vector<size_t>>("enabled_destinations", std::vector<size_t>());
94  if (enabled_dests.empty())
95  {
96  TLOG(TLVL_INFO) << "enabled_destinations not specified, assuming all destinations enabled.";
97  for (auto& d : destinations_)
98  {
99  enabled_destinations_.insert(d.first);
100  }
101  }
102  else
103  {
104  for (auto& d : enabled_dests)
105  {
106  enabled_destinations_.insert(d);
107  }
108  }
109  }
110 }
111 
113 {
114  TLOG(TLVL_DEBUG + 32) << "Shutting down DataSenderManager BEGIN";
115  should_stop_ = true;
116  for (auto& dest : enabled_destinations_)
117  {
118  if (destinations_.count(dest) != 0u)
119  {
120  auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
122  {
123  TLOG(TLVL_ERROR) << "Error sending EOD Fragment to sender rank " << dest;
124  }
125  // sendFragTo(std::move(*Fragment::eodFrag(nFragments)), dest, true);
126  }
127  }
128  TLOG(TLVL_DEBUG + 32) << "Shutting down DataSenderManager END. Sent " << count() << " fragments.";
129 }
130 
132 {
133  return table_receiver_->GetRoutingTableEntryCount();
134 }
135 
137 {
138  return table_receiver_->GetRemainingRoutingTableEntries();
139 }
140 
141 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id) const
142 {
143  if (enabled_destinations_.empty())
144  {
145  return TableReceiver::ROUTING_FAILED; // No destinations configured.
146  }
147 
148  if (table_receiver_->RoutingManagerEnabled())
149  {
150  TLOG(TLVL_DEBUG + 35) << "calcDest_ use_routing_manager check for routing info for seqID=" << sequence_id << " should_stop_=" << should_stop_;
151  return table_receiver_->GetRoutingTableEntry(sequence_id);
152  }
153  if (enabled_destinations_.size() == 1)
154  {
155  return *enabled_destinations_.begin(); // Trivial case
156  }
157  auto index = sequence_id % enabled_destinations_.size();
158  auto it = enabled_destinations_.begin();
159  for (; index > 0; --index)
160  {
161  ++it;
162  if (it == enabled_destinations_.end())
163  {
164  it = enabled_destinations_.begin();
165  }
166  }
167  return *it;
168 }
169 
170 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
171 {
172  TLOG(TLVL_DEBUG + 35) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table. Sent " << GetSentSequenceIDCount(seq) << " Fragments with this Sequence ID.";
173  table_receiver_->RemoveRoutingTableEntry(seq);
174 
175  std::unique_lock<std::mutex> lck(sent_sequence_id_mutex_);
176  if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
177  {
178  sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
179  }
180 }
181 
182 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
183 {
184  std::unique_lock<std::mutex> lck(sent_sequence_id_mutex_);
185  if (sent_sequence_id_count_.count(seq) == 0u)
186  {
187  return 0;
188  }
189  return sent_sequence_id_count_[seq];
190 }
191 
// Send the given Fragment and return the pair (dest, outsts).
//
// Three paths are taken, in this order of precedence:
//  1. broadcast_sends_ is set, or the Fragment type is EndOfRun, EndOfSubrun or Init:
//     the Fragment is sent to every entry of enabled_destinations_.
//  2. non_blocking_mode_ is set: calcDest_ is asked once for a destination and the send is
//     retried up to send_retry_count_ additional times.
//  3. otherwise: calcDest_ is polled until it yields a destination (or should_stop_ is set)
//     and the Fragment is moved into a single reliable-mode send.
//
// Throws cet::exception("LogicError") when the Fragment type is EndOfDataFragmentType.
//
// NOTE(review): this listing is missing several lines of the function (its embedded line
// numbers skip 211-212, 219, 233, 251, 264, 292 and 295). The declarations of `dest`,
// `outsts` and each `sts`, and the conditions that should guard three of the bare `{ ... }`
// blocks below, are therefore not visible here -- confirm against the upstream file.
192 std::pair<int, artdaq::TransferInterface::CopyStatus> artdaq::DataSenderManager::sendFragment(Fragment&& frag)
193 {
194  // Precondition: Fragment must be complete and consistent (including
195  // header information).
196  auto start_time = std::chrono::steady_clock::now();
197  if (frag.type() == Fragment::EndOfDataFragmentType)
198  {
199  throw cet::exception("LogicError") // NOLINT(cert-err60-cpp)
200  << "EOD fragments should not be sent on as received: "
201  << "use sendEODFrag() instead.";
202  }
     // Everything needed after the send is read up front, because the blocking path below
     // moves from frag.
203  size_t seqID = frag.sequenceID();
204  size_t fragSize = frag.sizeBytes();
205  auto latency_s = frag.getLatency(true);
206  auto isSystemBroadcast = frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType;
207 
     // Convert the tv_sec / tv_nsec pair into seconds as a double.
208  double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
209  TLOG(TLVL_DEBUG + 36) << "sendFragment start frag.fragmentHeader()=" << std::hex << static_cast<void*>(frag.headerBeginBytes()) << ", szB=" << std::dec << fragSize
210  << ", seqID=" << seqID << ", fragID=" << frag.fragmentID() << ", type=" << frag.typeString();
     // NOTE(review): `dest` and `outsts` are presumably declared here (lines 211-212 of the
     // listing are missing) -- confirm their initial values upstream.
213  if (broadcast_sends_ || isSystemBroadcast)
214  {
215  for (auto& bdest : enabled_destinations_)
216  {
217  TLOG(TLVL_DEBUG + 33) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << bdest << " (broadcast)";
218  // Gross, we have to copy.
     // NOTE(review): the declaration of `sts` (line 219) is missing from this listing.
     // NOTE(review): destinations_[bdest] is used without checking that bdest has an entry,
     // unlike the other two paths, which test destinations_.count(dest) first.
220  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
     // Only a kTimeout status is retried on this path.
221  while (sts == TransferInterface::CopyStatus::kTimeout && retries <= send_retry_count_)
222  {
223  if (!non_blocking_mode_)
224  {
     // The reliable-mode call receives a fresh copy so frag stays usable for the next destination.
225  sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
226  }
227  else
228  {
229  sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
230  }
231  ++retries;
232  }
     // NOTE(review): the condition guarding this block (line 233) is missing from this listing.
234  {
235  outsts = sts;
236  }
     // The per-destination counter is incremented regardless of the resulting status.
237  sent_frag_count_.incSlot(bdest);
238  }
239  }
240  else if (non_blocking_mode_)
241  {
     // Single routing attempt: a failure is logged and not retried.
242  dest = calcDest_(seqID);
243  if (dest == TableReceiver::ROUTING_FAILED)
244  {
245  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID;
246  }
247 
     // The destination must both have a transfer plugin and be enabled.
248  if (dest != TableReceiver::ROUTING_FAILED && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
249  {
250  TLOG(TLVL_DEBUG + 33) << "sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
     // NOTE(review): the declaration of `sts` (line 251) is missing from this listing.
252  auto lastWarnTime = std::chrono::steady_clock::now();
253  size_t retries = 0; // Have NOT yet tried, so retries <= send_retry_count_ will have it RETRY send_retry_count_ times
     // Unlike the broadcast path, any status other than kSuccess is retried here.
254  while (sts != TransferInterface::CopyStatus::kSuccess && retries <= send_retry_count_)
255  {
256  sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
     // Rate-limit the warning; the threshold of 1 is presumably in seconds -- TODO confirm
     // the unit returned by TimeUtils::GetElapsedTime.
257  if (sts != TransferInterface::CopyStatus::kSuccess && TimeUtils::GetElapsedTime(lastWarnTime) >= 1)
258  {
259  TLOG(TLVL_WARNING) << "sendFragment: Sending fragment " << seqID << " to destination " << dest << " failed! Retrying...";
260  lastWarnTime = std::chrono::steady_clock::now();
261  }
262  ++retries;
263  }
     // NOTE(review): the condition guarding this block (line 264) is missing from this listing.
265  {
266  outsts = sts;
267  }
268  // sendFragTo(std::move(frag), dest);
     // The per-destination counter is incremented regardless of the resulting status.
269  sent_frag_count_.incSlot(dest);
270  }
271  else if (!should_stop_)
272  {
273  TLOG(TLVL_ERROR) << "(in non_blocking) calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
274  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
275  }
276  }
277  else
278  {
279  auto start = std::chrono::steady_clock::now();
     // Poll for a destination until one is available or shutdown is requested, logging a
     // warning and sleeping 10000 microseconds after every failed attempt.
280  while (!should_stop_ && dest == TableReceiver::ROUTING_FAILED)
281  {
282  dest = calcDest_(seqID);
283  if (dest == TableReceiver::ROUTING_FAILED)
284  {
285  TLOG(TLVL_WARNING) << "Could not get destination for seqID " << seqID << ", send number " << sent_frag_count_.count() << ", retrying. Waited " << TimeUtils::GetElapsedTime(start) << " s for routing information.";
286  usleep(10000);
287  }
288  }
289  if (dest != TableReceiver::ROUTING_FAILED && (destinations_.count(dest) != 0u) && (enabled_destinations_.count(dest) != 0u))
290  {
291  TLOG(TLVL_DEBUG + 34) << "DataSenderManager::sendFragment: Sending fragment with seqId " << seqID << " to destination " << dest;
     // NOTE(review): the declaration of `sts` (line 292) is missing from this listing.
293 
     // frag is moved from here; only the values captured at the top are used afterwards.
294  sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
     // NOTE(review): the condition guarding this block (line 295) is missing from this listing.
296  {
297  TLOG(TLVL_ERROR) << "sendFragment: Sending fragment " << seqID << " to destination "
298  << dest << " failed! Data has been lost!";
299  }
300 
301  // sendFragTo(std::move(frag), dest);
302  sent_frag_count_.incSlot(dest);
303  outsts = sts;
304  }
305  else if (!should_stop_)
306  {
307  TLOG(TLVL_ERROR) << "calcDest returned invalid destination rank " << dest << "! This event has been lost: " << seqID
308  << ". enabled_destinantions_.size()=" << enabled_destinations_.size();
309  }
310  }
311 
     // Count this send against its sequence ID unless the Fragment is one of the system
     // broadcast types; the map is guarded by sent_sequence_id_mutex_.
312  if (!isSystemBroadcast)
313  {
314  std::unique_lock<std::mutex> lck(sent_sequence_id_mutex_);
315  sent_sequence_id_count_[seqID]++;
316  }
317 
318  auto delta_t = TimeUtils::GetElapsedTime(start_time);
319 
320  if (metricMan)
321  {
322  TLOG(TLVL_DEBUG + 34) << "sendFragment: sending metrics";
     // The per-rank metric names are built from `dest`, which the visible broadcast branch
     // never assigns. NOTE(review): the rate below divides by delta_t without a zero check.
323  metricMan->sendMetric("Data Send Time to Rank " + std::to_string(dest), delta_t, "s", 5, MetricMode::Accumulate);
324  metricMan->sendMetric("Data Send Size to Rank " + std::to_string(dest), fragSize, "B", 5, MetricMode::Accumulate | MetricMode::Maximum);
325  metricMan->sendMetric("Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t, "B/s", 5, MetricMode::Average);
326  metricMan->sendMetric("Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest), "fragments", 3, MetricMode::LastPoint);
327 
328  metricMan->sendMetric("Rank", std::to_string(my_rank), "", 3, MetricMode::LastPoint);
329  metricMan->sendMetric("App Name", app_name, "", 3, MetricMode::LastPoint);
330 
331  metricMan->sendMetric("Fragment Latency at Send", latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
332  }
333 
334  TLOG(TLVL_DEBUG + 34) << "sendFragment: Done sending fragment " << seqID << " to dest=" << dest;
335  return std::make_pair(dest, outsts);
336 } // artdaq::DataSenderManager::sendFragment
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
virtual ~DataSenderManager()
DataSenderManager Destructor.
static constexpr int ROUTING_FAILED
Value used to indicate that a route was not properly generated.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
size_t GetSentSequenceIDCount(Fragment::sequence_id_t seq)
Get the number of Fragments sent with a given Sequence ID.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to use this information.
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:49
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this information.
This TransferInterface is a Sender.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured; otherwise it will Round-Robin Fragments to the destinations.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:42
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:68
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.