artdaq  v3_02_00
TransferTest.cc
1 #include "artdaq/DAQrate/TransferTest.hh"
2 
3 #include "artdaq-core/Data/Fragment.hh"
4 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
5 #include "artdaq/DAQrate/DataSenderManager.hh"
6 
7 #define TRACE_NAME "TransferTest"
8 #include "artdaq/DAQdata/Globals.hh"
9 
10 #include "fhiclcpp/make_ParameterSet.h"
11 
12 #include <future>
13 
14 artdaq::TransferTest::TransferTest(fhicl::ParameterSet psi)
15  : senders_(psi.get<int>("num_senders"))
16  , receivers_(psi.get<int>("num_receivers"))
17  , sending_threads_(psi.get<int>("sending_threads", 1))
18  , sends_each_sender_(psi.get<int>("sends_per_sender"))
19  , receives_each_receiver_(senders_ * sending_threads_ * sends_each_sender_ / receivers_)
20  , buffer_count_(psi.get<int>("buffer_count", 10))
21  , error_count_max_(psi.get<int>("max_errors_before_abort", 3))
22  , fragment_size_(psi.get<size_t>("fragment_size", 0x100000))
23  , ps_()
24  , validate_mode_(psi.get<bool>("validate_data_mode", false))
25  , partition_number_(psi.get<int>("partition_number", rand() % 0x7F))
26 {
27  TLOG(10) << "CONSTRUCTOR";
28  metricMan = &metricMan_;
29 
30  if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType))
31  {
32  fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType);
33  }
34 
35  fhicl::ParameterSet metric_pset;
36 
37  try
38  {
39  metric_pset = psi.get<fhicl::ParameterSet>("metrics");
40  }
41  catch (...) {} // OK if there's no metrics table defined in the FHiCL
42 
43  try
44  {
45  std::string name = "TransferTest" + std::to_string(my_rank);
46  metricMan_.initialize(metric_pset, name);
47  metricMan_.do_start();
48  }
49  catch (...) {}
50 
51  std::string type(psi.get<std::string>("transfer_plugin_type", "Shmem"));
52 
53  if (receivers_ > 0)
54  {
55  if (senders_ * sends_each_sender_ % receivers_ != 0)
56  {
57  TLOG(TLVL_TRACE) << "Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl;
58  while (senders_ * sends_each_sender_ % receivers_ != 0)
59  {
60  sends_each_sender_++;
61  }
62  receives_each_receiver_ = senders_ * sends_each_sender_ / receivers_;
63  TLOG(TLVL_TRACE) << "sends_each_sender is now " << sends_each_sender_ << std::endl;
64  psi.put_or_replace("sends_per_sender", sends_each_sender_);
65  }
66  }
67 
68  std::string hostmap = "";
69  if (psi.has_key("hostmap"))
70  {
71  hostmap = " host_map: @local::hostmap";
72  }
73 
74  std::stringstream ss;
75  ss << psi.to_string() << std::endl;
76 
77  ss << " sources: {";
78  for (int ii = 0; ii < senders_; ++ii)
79  {
80  ss << "s" << ii << ": { transferPluginType: " << type << " source_rank: " << ii << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
81  }
82  ss << "}" << std::endl << " destinations: {";
83  for (int jj = senders_; jj < senders_ + receivers_; ++jj)
84  {
85  ss << "d" << jj << ": { transferPluginType: " << type << " destination_rank: " << jj << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
86  }
87  ss << "}" << std::endl;
88 
89  make_ParameterSet(ss.str(), ps_);
90 
91 
92  TLOG(TLVL_DEBUG) << "Going to configure with ParameterSet: " << ps_.to_string() << std::endl;
93 }
94 
96 {
97  TLOG(TLVL_INFO) << "runTest BEGIN: " << (my_rank < senders_ ? "sending" : "receiving");
98  start_time_ = std::chrono::steady_clock::now();
99  std::pair<size_t, double> result;
100  if (my_rank >= senders_ + receivers_) return 0;
101  if (my_rank < senders_)
102  {
103  std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_);
104  for (int ii = 0; ii < sending_threads_; ++ii)
105  {
106  results_futures[ii] = std::async(std::bind(&TransferTest::do_sending, this, ii));
107  }
108  for (auto& future : results_futures)
109  {
110  if (future.valid())
111  {
112  auto thisresult = future.get();
113  result.first += thisresult.first;
114  result.second += thisresult.second;
115  }
116  }
117  }
118  else
119  {
120  result = do_receiving();
121  }
122  auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count();
123  TLOG(TLVL_INFO) << (my_rank < senders_ ? "Sent " : "Received ") << result.first << " bytes in " << duration << " seconds ( " << formatBytes(result.first / duration) << "/s )." << std::endl;
124  TLOG(TLVL_INFO) << "Rate of " << (my_rank < senders_ ? "sending" : "receiving") << ": " << formatBytes(result.first / result.second) << "/s." << std::endl;
125  metricMan_.do_stop();
126  metricMan_.shutdown();
127  TLOG(11) << "runTest DONE";
128  return 0;
129 }
130 
131 std::pair<size_t, double> artdaq::TransferTest::do_sending(int index)
132 {
133  TLOG(7) << "do_sending entered RawFragmentHeader::num_words()=" << std::to_string(artdaq::detail::RawFragmentHeader::num_words());
134 
135  size_t totalSize = 0;
136  double totalTime = 0;
137  artdaq::DataSenderManager sender(ps_);
138 
139  unsigned data_size_wrds = (fragment_size_ / sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words();
140  artdaq::Fragment frag(data_size_wrds);
141 
142  if (validate_mode_)
143  {
144  artdaq::RawDataType gen_seed = 0;
145 
146  std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
147  for (size_t ii = 0; ii < frag.dataSize(); ++ii)
148  {
149  if (*(frag.dataBegin() + ii) != ii + 1)
150  {
151  TLOG(TLVL_ERROR) << "Data corruption detected! (" << std::to_string(*(frag.dataBegin() + ii)) << " != " << std::to_string(ii + 1) << ") Aborting!";
152  exit(1);
153  }
154  }
155  }
156 
157  int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1;
158  auto init_time_metric = 0.0;
159  auto send_time_metric = 0.0;
160  auto after_time_metric = 0.0;
161  auto send_size_metric = 0.0;
162  auto error_count = 0;
163 
164  for (int ii = 0; ii < sends_each_sender_; ++ii)
165  {
166  auto loop_start = std::chrono::steady_clock::now();
167  TLOG(7) << "sender rank " << my_rank << " #" << ii << " resized bytes=" << std::to_string(frag.sizeBytes());
168  totalSize += frag.sizeBytes();
169 
170  //unsigned sndDatSz = data_size_wrds;
171  frag.setSequenceID(ii * sending_threads_ + index);
172  frag.setFragmentID(my_rank);
173  frag.setSystemType(artdaq::Fragment::DataFragmentType);
174  /*
175  artdaq::Fragment::iterator it = frag.dataBegin();
176  *it = my_rank;
177  *++it = ii;
178  *++it = sndDatSz;*/
179 
180  auto send_start = std::chrono::steady_clock::now();
181  auto stspair = sender.sendFragment(std::move(frag));
182  TLOG(TLVL_DEBUG) << "Sender " << my_rank << " sending fragment " << ii;
183  auto after_send = std::chrono::steady_clock::now();
184  TLOG(TLVL_TRACE) << "Sender " << my_rank << " sent fragment " << ii;
185  //usleep( (data_size_wrds*sizeof(artdaq::RawDataType))/233 );
186 
188  {
189  error_count++;
190  if (error_count >= error_count_max_)
191  {
192  TLOG(TLVL_ERROR) << "Too many errors sending fragments! Aborting... (sent=" << ii << "/" << sends_each_sender_ << ")";
193  exit(sends_each_sender_ - ii);
194  }
195  }
196 
197  frag = artdaq::Fragment(data_size_wrds); // replace/renew
198  if (validate_mode_)
199  {
200  artdaq::RawDataType gen_seed = ii + 1;
201 
202  std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
203  for (size_t jj = 0; jj < frag.dataSize(); ++jj)
204  {
205  if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1)
206  {
207  TLOG(TLVL_ERROR) << "Input Data corruption detected! (" << std::to_string(*(frag.dataBegin() + jj)) << " != " << std::to_string(ii + jj + 2) << " at position " << ii << ") Aborting!";
208  exit(1);
209  }
210  }
211  }
212  TLOG(9) << "sender rank " << my_rank << " frag replaced";
213 
214  auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count();
215  totalTime += total_send_time;
216  send_time_metric += total_send_time;
217  send_size_metric += data_size_wrds * sizeof(artdaq::RawDataType);
218  after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count();
219  init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count();
220 
221  if (metricMan && ii % metric_send_interval == 0)
222  {
223  metricMan->sendMetric("send_init_time", init_time_metric, "seconds", 3, MetricMode::Accumulate);
224  metricMan->sendMetric("total_send_time", send_time_metric, "seconds", 3, MetricMode::Accumulate);
225  metricMan->sendMetric("after_send_time", after_time_metric, "seconds", 3, MetricMode::Accumulate);
226  metricMan->sendMetric("send_rate", send_size_metric / send_time_metric, "B/s", 3, MetricMode::Average);
227  init_time_metric = 0.0;
228  send_time_metric = 0.0;
229  after_time_metric = 0.0;
230  send_size_metric = 0.0;
231  }
232  usleep(0); // Yield execution
233  }
234 
235  return std::make_pair(totalSize, totalTime);
236 } // do_sending
237 
238 std::pair<size_t, double> artdaq::TransferTest::do_receiving()
239 {
240  TLOG(7) << "do_receiving entered";
241 
242  artdaq::FragmentReceiverManager receiver(ps_);
243  receiver.start_threads();
244  int counter = receives_each_receiver_;
245  size_t totalSize = 0;
246  double totalTime = 0;
247  bool first = true;
248  int activeSenders = senders_ * sending_threads_;
249  auto end_loop = std::chrono::steady_clock::now();
250 
251  auto recv_size_metric = 0.0;
252  auto recv_time_metric = 0.0;
253  auto input_wait_metric = 0.0;
254  auto init_wait_metric = 0.0;
255  int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1;
256 
257  // Only abort when there are no senders if were's > 90% done
258  while ((activeSenders > 0 || counter > receives_each_receiver_ / 10) && counter > 0)
259  {
260  auto start_loop = std::chrono::steady_clock::now();
261  TLOG(7) << "do_receiving: Counter is " << counter << ", calling recvFragment";
263  auto before_receive = std::chrono::steady_clock::now();
264  init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count();
265 
266  auto ignoreFragPtr = receiver.recvFragment(senderSlot);
267  auto after_receive = std::chrono::steady_clock::now();
268  size_t thisSize = 0;
269  if (senderSlot >= artdaq::TransferInterface::RECV_SUCCESS && ignoreFragPtr)
270  {
271  if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
272  {
273  TLOG(TLVL_INFO) << "Receiver " << my_rank << " received EndOfData Fragment from Sender " << senderSlot;
274  activeSenders--;
275  TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
276  }
277  else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType)
278  {
279  TLOG(TLVL_WARNING) << "Receiver " << my_rank << " received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) << " (Unexpected!)";
280  }
281  else
282  {
283  if (first)
284  {
285  start_time_ = std::chrono::steady_clock::now();
286  first = false;
287  }
288  counter--;
289  TLOG(TLVL_INFO) << "Receiver " << my_rank << " received fragment " << receives_each_receiver_ - counter
290  << " with seqID " << std::to_string(ignoreFragPtr->sequenceID()) << " from Sender " << senderSlot << " (Expecting " << counter << " more)";
291  thisSize = ignoreFragPtr->size() * sizeof(artdaq::RawDataType);
292  totalSize += thisSize;
293  if (validate_mode_)
294  {
295  for (size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii)
296  {
297  if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1)
298  {
299  TLOG(TLVL_ERROR) << "Output Data corruption detected! (" << std::to_string(*(ignoreFragPtr->dataBegin() + ii)) << " != " << std::to_string(ignoreFragPtr->sequenceID() + ii + 1) << " at position " << ii << ") Aborting!";
300  exit(1);
301  }
302  }
303  }
304  }
305  input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count();
306  }
307  else if (senderSlot == artdaq::TransferInterface::DATA_END)
308  {
309  TLOG(TLVL_ERROR) << "Receiver " << my_rank << " detected fatal protocol error! Reducing active sender count by one!" << std::endl;
310  activeSenders--;
311  TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
312  }
313  TLOG(7) << "do_receiving: Recv Loop end, counter is " << counter;
314 
315 
316  auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count();
317  recv_time_metric += total_recv_time;
318  totalTime += total_recv_time;
319  recv_size_metric += thisSize;
320 
321  if (metricMan && counter % metric_send_interval == 0)
322  {
323  metricMan->sendMetric("input_wait", input_wait_metric, "seconds", 3, MetricMode::Accumulate);
324  metricMan->sendMetric("recv_init_time", init_wait_metric, "seconds", 3, MetricMode::Accumulate);
325  metricMan->sendMetric("total_recv_time", recv_time_metric, "seconds", 3, MetricMode::Accumulate);
326  metricMan->sendMetric("recv_rate", recv_size_metric / recv_time_metric, "B/s", 3, MetricMode::Average);
327 
328  input_wait_metric = 0.0;
329  init_wait_metric = 0.0;
330  recv_time_metric = 0.0;
331  recv_size_metric = 0.0;
332  }
333  end_loop = std::chrono::steady_clock::now();
334  }
335 
336  if (counter != 0)
337  {
338  TLOG(TLVL_ERROR) << "Did not receive all expected Fragments! Missing " << counter << " Fragments!";
339  exit(counter);
340  }
341 
342  return std::make_pair(totalSize, totalTime);
343 }
int runTest()
Run the test as configured.
Definition: TransferTest.cc:95
Value to be returned upon receive timeout.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured, otherwise will Round-Robin Fragments to the destinations.
For code clarity, things checking for successful receive should check retval >= RECV_SUCCESS.
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
The send operation completed successfully.
TransferTest(fhicl::ParameterSet psi)
TransferTest Constructor.
Definition: TransferTest.cc:14