artdaq  v3_12_02
TransferTest.cc
1 #include "artdaq/DAQrate/TransferTest.hh"
2 
3 #include "artdaq-core/Data/Fragment.hh"
4 #include "artdaq/DAQrate/DataSenderManager.hh"
5 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
6 
7 #define TRACE_NAME "TransferTest"
8 #include "artdaq/DAQdata/Globals.hh"
9 
10 #include <future>
11 
12 artdaq::TransferTest::TransferTest(fhicl::ParameterSet psi)
13  : senders_(psi.get<int>("num_senders"))
14  , receivers_(psi.get<int>("num_receivers"))
15  , sending_threads_(psi.get<int>("sending_threads", 1))
16  , sends_each_sender_(psi.get<int>("sends_per_sender"))
17  , receives_each_receiver_(0)
18  , buffer_count_(psi.get<int>("buffer_count", 10))
19  , error_count_max_(psi.get<int>("max_errors_before_abort", 3))
20  , fragment_size_(psi.get<size_t>("fragment_size", 0x100000))
21  , validate_mode_(psi.get<bool>("validate_data_mode", false))
22  , partition_number_(psi.get<int>("partition_number", rand() % 0x7F)) // NOLINT(cert-msc50-cpp)
23 {
24  TLOG(TLVL_DEBUG + 35) << "CONSTRUCTOR";
25 
26  if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType))
27  {
28  fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType);
29  }
30 
31  fhicl::ParameterSet metric_pset;
32 
33  try
34  {
35  metric_pset = psi.get<fhicl::ParameterSet>("metrics");
36  }
37  catch (...)
38  {} // OK if there's no metrics table defined in the FHiCL
39 
40  try
41  {
42  std::string name = "TransferTest" + std::to_string(my_rank);
43  metricMan->initialize(metric_pset, name);
44  metricMan->do_start();
45  }
46  catch (...)
47  {}
48 
49  auto type(psi.get<std::string>("transfer_plugin_type", "Shmem"));
50 
51  bool broadcast_mode = psi.get<bool>("broadcast_sends", false);
52  if (broadcast_mode)
53  {
54  receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_;
55  }
56  else
57  {
58  if (receivers_ > 0)
59  {
60  if (senders_ * sending_threads_ * sends_each_sender_ % receivers_ != 0)
61  {
62  TLOG(TLVL_DEBUG + 33) << "Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl;
63  while (senders_ * sends_each_sender_ % receivers_ != 0)
64  {
65  sends_each_sender_++;
66  }
67  receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_;
68  TLOG(TLVL_DEBUG + 33) << "sends_each_sender is now " << sends_each_sender_ << std::endl;
69  psi.put_or_replace("sends_per_sender", sends_each_sender_);
70  }
71  else
72  {
73  receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_;
74  }
75  }
76  }
77 
78  std::string hostmap;
79  if (psi.has_key("hostmap"))
80  {
81  hostmap = " host_map: @local::hostmap";
82  }
83 
84  std::stringstream ss;
85  ss << psi.to_string() << std::endl;
86 
87  ss << " sources: {";
88  for (int ii = 0; ii < senders_; ++ii)
89  {
90  ss << "s" << ii << ": { transferPluginType: " << type << " source_rank: " << ii << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
91  }
92  ss << "}" << std::endl
93  << " destinations: {";
94  for (int jj = senders_; jj < senders_ + receivers_; ++jj)
95  {
96  ss << "d" << jj << ": { transferPluginType: " << type << " destination_rank: " << jj << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
97  }
98  ss << "}" << std::endl;
99 
100  ps_ = fhicl::ParameterSet::make(ss.str());
101 
102  TLOG(TLVL_DEBUG + 32) << "Going to configure with ParameterSet: " << ps_.to_string() << std::endl;
103 }
104 
106 {
107  TLOG(TLVL_INFO) << "runTest BEGIN: " << (my_rank < senders_ ? "sending" : "receiving");
108  start_time_ = std::chrono::steady_clock::now();
109  std::pair<size_t, double> result;
110  if (my_rank >= senders_ + receivers_)
111  {
112  return 0;
113  }
114  if (my_rank < senders_)
115  {
116  std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_);
117  for (int ii = 0; ii < sending_threads_; ++ii)
118  {
119  results_futures[ii] = std::async(std::bind(&TransferTest::do_sending, this, ii));
120  }
121  for (auto& future : results_futures)
122  {
123  if (future.valid())
124  {
125  auto thisresult = future.get();
126  result.first += thisresult.first;
127  result.second += thisresult.second;
128  }
129  }
130  }
131  else
132  {
133  result = do_receiving();
134  }
135  auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count();
136  TLOG(TLVL_INFO) << (my_rank < senders_ ? "Sent " : "Received ") << result.first << " bytes in " << duration << " seconds ( " << formatBytes(result.first / duration) << "/s )." << std::endl;
137  TLOG(TLVL_INFO) << "Rate of " << (my_rank < senders_ ? "sending" : "receiving") << ": " << formatBytes(result.first / result.second) << "/s." << std::endl;
138  metricMan->do_stop();
139  metricMan->shutdown();
140  TLOG(TLVL_DEBUG + 36) << "runTest DONE";
141  return return_code_;
142 }
143 
144 std::pair<size_t, double> artdaq::TransferTest::do_sending(int index)
145 {
146  TLOG(TLVL_DEBUG + 34) << "do_sending entered RawFragmentHeader::num_words()=" << artdaq::detail::RawFragmentHeader::num_words();
147 
148  size_t totalSize = 0;
149  double totalTime = 0;
150  artdaq::DataSenderManager sender(ps_);
151 
152  unsigned data_size_wrds = (fragment_size_ / sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words();
153  artdaq::Fragment frag(data_size_wrds);
154 
155  if (validate_mode_)
156  {
157  artdaq::RawDataType gen_seed = 0;
158 
159  std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
160  for (size_t ii = 0; ii < frag.dataSize(); ++ii)
161  {
162  if (*(frag.dataBegin() + ii) != ii + 1)
163  {
164  TLOG(TLVL_ERROR) << "Data corruption detected! (" << (*(frag.dataBegin() + ii)) << " != " << (ii + 1) << ") Aborting!";
165  return_code_ = 255;
166  return std::make_pair(0, 0.0);
167  }
168  }
169  }
170 
171  int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1;
172  auto init_time_metric = 0.0;
173  auto send_time_metric = 0.0;
174  auto after_time_metric = 0.0;
175  auto send_size_metric = 0.0;
176  auto error_count = 0;
177 
178  for (int ii = 0; ii < sends_each_sender_; ++ii)
179  {
180  auto loop_start = std::chrono::steady_clock::now();
181  TLOG(TLVL_DEBUG + 34) << "sender rank " << my_rank << " #" << ii << " resized bytes=" << frag.sizeBytes();
182  totalSize += frag.sizeBytes();
183 
184  // unsigned sndDatSz = data_size_wrds;
185  frag.setSequenceID(ii * sending_threads_ + index);
186  frag.setFragmentID(my_rank);
187  frag.setSystemType(artdaq::Fragment::DataFragmentType);
188  /*
189  artdaq::Fragment::iterator it = frag.dataBegin();
190  *it = my_rank;
191  *++it = ii;
192  *++it = sndDatSz;*/
193 
194  auto send_start = std::chrono::steady_clock::now();
195  TLOG(TLVL_DEBUG + 32) << "Sender " << my_rank << " sending fragment " << ii;
196  auto stspair = sender.sendFragment(std::move(frag));
197  auto after_send = std::chrono::steady_clock::now();
198  TLOG(TLVL_DEBUG + 33) << "Sender " << my_rank << " sent fragment " << ii;
199  sender.RemoveRoutingTableEntry(ii * sending_threads_ + index);
200  // usleep( (data_size_wrds*sizeof(artdaq::RawDataType))/233 );
201 
203  {
204  error_count++;
205  if (error_count >= error_count_max_)
206  {
207  TLOG(TLVL_ERROR) << "Too many errors sending fragments! Aborting... (sent=" << ii << "/" << sends_each_sender_ << ")";
208  return_code_ = sends_each_sender_ - ii;
209  return std::make_pair(0, 0.0);
210  }
211  }
212 
213  frag = artdaq::Fragment(data_size_wrds); // replace/renew
214  if (validate_mode_)
215  {
216  artdaq::RawDataType gen_seed = ii + 1;
217 
218  std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
219  for (size_t jj = 0; jj < frag.dataSize(); ++jj)
220  {
221  if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1)
222  {
223  TLOG(TLVL_ERROR) << "Input Data corruption detected! (" << *(frag.dataBegin() + jj) << " != " << ii + jj + 2 << " at position " << ii << ") Aborting!";
224  return_code_ = 254;
225  return std::make_pair(0, 0.0);
226  }
227  }
228  }
229  TLOG(TLVL_DEBUG + 37) << "sender rank " << my_rank << " frag replaced";
230 
231  auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count();
232  totalTime += total_send_time;
233  send_time_metric += total_send_time;
234  send_size_metric += data_size_wrds * sizeof(artdaq::RawDataType);
235  after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count();
236  init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count();
237 
238  if (metricMan && ii % metric_send_interval == 0)
239  {
240  metricMan->sendMetric("send_init_time", init_time_metric, "seconds", 3, MetricMode::Accumulate);
241  metricMan->sendMetric("total_send_time", send_time_metric, "seconds", 3, MetricMode::Accumulate);
242  metricMan->sendMetric("after_send_time", after_time_metric, "seconds", 3, MetricMode::Accumulate);
243  metricMan->sendMetric("send_rate", send_size_metric / send_time_metric, "B/s", 3, MetricMode::Average);
244  init_time_metric = 0.0;
245  send_time_metric = 0.0;
246  after_time_metric = 0.0;
247  send_size_metric = 0.0;
248  }
249  usleep(0); // Yield execution
250  }
251 
252  return std::make_pair(totalSize, totalTime);
253 } // do_sending
254 
255 std::pair<size_t, double> artdaq::TransferTest::do_receiving()
256 {
257  TLOG(TLVL_DEBUG + 34) << "do_receiving entered";
258 
259  artdaq::FragmentReceiverManager receiver(ps_);
260  receiver.start_threads();
261  int counter = receives_each_receiver_;
262  size_t totalSize = 0;
263  double totalTime = 0;
264  bool first = true;
265  bool nonblocking_mode = ps_.get<bool>("nonblocking_sends", false);
266  std::atomic<int> activeSenders(senders_ * sending_threads_);
267  auto end_loop = std::chrono::steady_clock::now();
268 
269  auto recv_size_metric = 0.0;
270  auto recv_time_metric = 0.0;
271  auto input_wait_metric = 0.0;
272  auto init_wait_metric = 0.0;
273  int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1;
274 
275  // Only abort when there are no senders if were's > 90% done
276  while ((activeSenders > 0 || (counter > receives_each_receiver_ / 10 && !nonblocking_mode)) && counter > 0)
277  {
278  auto start_loop = std::chrono::steady_clock::now();
279  TLOG(TLVL_DEBUG + 34) << "do_receiving: Counter is " << counter << ", calling recvFragment (activeSenders=" << activeSenders << ")";
281  auto before_receive = std::chrono::steady_clock::now();
282 
283  auto ignoreFragPtr = receiver.recvFragment(senderSlot);
284  auto after_receive = std::chrono::steady_clock::now();
285  init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count();
286  size_t thisSize = 0;
287  if (senderSlot >= artdaq::TransferInterface::RECV_SUCCESS && ignoreFragPtr)
288  {
289  if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
290  {
291  TLOG(TLVL_INFO) << "Receiver " << my_rank << " received EndOfData Fragment from Sender " << senderSlot;
292  activeSenders--;
293  TLOG(TLVL_DEBUG + 32) << "Active Senders is now " << activeSenders;
294  }
295  else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType)
296  {
297  TLOG(TLVL_WARNING) << "Receiver " << my_rank << " received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) << " (Unexpected!)";
298  }
299  else
300  {
301  if (first)
302  {
303  start_time_ = std::chrono::steady_clock::now();
304  first = false;
305  }
306  counter--;
307  TLOG(TLVL_INFO) << "Receiver " << my_rank << " received fragment " << receives_each_receiver_ - counter
308  << " with seqID " << ignoreFragPtr->sequenceID() << " from Sender " << senderSlot << " (Expecting " << counter << " more)";
309  thisSize = ignoreFragPtr->size() * sizeof(artdaq::RawDataType);
310  totalSize += thisSize;
311  if (validate_mode_)
312  {
313  for (size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii)
314  {
315  if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1)
316  {
317  TLOG(TLVL_ERROR) << "Output Data corruption detected! (" << *(ignoreFragPtr->dataBegin() + ii) << " != " << (ignoreFragPtr->sequenceID() + ii + 1) << " at position " << ii << ") Aborting!";
318  return_code_ = -3;
319  return std::make_pair(0, 0.0);
320  }
321  }
322  }
323  }
324  input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count();
325  }
326  else if (senderSlot == artdaq::TransferInterface::DATA_END)
327  {
328  TLOG(TLVL_ERROR) << "Receiver " << my_rank << " detected fatal protocol error! Reducing active sender count by one!" << std::endl;
329  activeSenders--;
330  TLOG(TLVL_DEBUG + 32) << "Active Senders is now " << activeSenders;
331  }
332  TLOG(TLVL_DEBUG + 34) << "do_receiving: Recv Loop end, counter is " << counter;
333 
334  auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count();
335  recv_time_metric += total_recv_time;
336  totalTime += total_recv_time;
337  recv_size_metric += thisSize;
338 
339  if (metricMan && counter % metric_send_interval == 0)
340  {
341  metricMan->sendMetric("input_wait", input_wait_metric, "seconds", 3, MetricMode::Accumulate);
342  metricMan->sendMetric("recv_init_time", init_wait_metric, "seconds", 3, MetricMode::Accumulate);
343  metricMan->sendMetric("total_recv_time", recv_time_metric, "seconds", 3, MetricMode::Accumulate);
344  metricMan->sendMetric("recv_rate", recv_size_metric / recv_time_metric, "B/s", 3, MetricMode::Average);
345 
346  input_wait_metric = 0.0;
347  init_wait_metric = 0.0;
348  recv_time_metric = 0.0;
349  recv_size_metric = 0.0;
350  }
351  end_loop = std::chrono::steady_clock::now();
352  }
353 
354  if (counter != 0 && !nonblocking_mode)
355  {
356  TLOG(TLVL_ERROR) << "Did not receive all expected Fragments! Missing " << counter << " Fragments!";
357  return_code_ = counter;
358  return std::make_pair(0, 0.0);
359  }
360 
361  return std::make_pair(totalSize, totalTime);
362 }
int runTest()
Run the test as configured.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured, otherwise will Round-Robin Fragments to the destinations.
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
The send operation completed successfully.
TransferTest(fhicl::ParameterSet psi)
TransferTest Constructor.
Definition: TransferTest.cc:12
For code clarity, things checking for successful receive should check retval >= NO_RANK_INFO.
Value to be returned upon receive timeout.