artdaq  v3_09_01
BrokenTransferTest.cc
1 #include "BrokenTransferTest.hh"
2 
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 
6 #include <memory>
7 #include <thread>
8 #include "artdaq/DAQdata/Globals.hh"
9 #define TRACE_NAME "BrokenTransferTest"
10 
12  : sender_ready_()
13  , receiver_ready_()
14  , sender_current_fragment_()
15  , ps_(ps)
16  , test_start_time_(std::chrono::steady_clock::now())
17  , test_end_time_(std::chrono::steady_clock::now())
18  , test_end_requested_(false)
19  , fragment_rate_hz_(ps.get<size_t>("fragment_rate_hz", 10))
20  , pause_first_sender_(false)
21  , pause_receiver_(false)
22  , kill_first_sender_(false)
23  , kill_receiver_(false)
24  , reliable_mode_(ps.get<bool>("reliable_mode", true))
25  , fragment_size_(ps.get<size_t>("fragment_size", 0x10000))
26  , send_timeout_us_(ps.get<size_t>("send_timeout_us", 100000))
27  , transfer_buffer_count_(ps.get<size_t>("transfer_buffer_count", 10))
28  , event_buffer_count_(ps.get<size_t>("event_buffer_count", 20))
29  , event_buffer_timeout_us_(ps.get<size_t>("event_buffer_timeout_us", 1000000))
30  , send_throttle_us_(0)
31 {
32  if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
33  {
34  TLOG(TLVL_WARNING) << "Invalid rate " << fragment_rate_hz_ << " Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) << " Hz";
35  fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
36  }
37 }
38 
40 {
41  TLOG(TLVL_INFO) << "TestSenderPause BEGIN";
42  start_test_();
43  usleep_for_n_buffer_epochs_(2);
44 
45  TLOG(TLVL_INFO) << "Pausing First Sender";
46  pause_first_sender_ = true;
47  usleep_for_n_buffer_epochs_(2);
48  usleep(2 * event_buffer_timeout_us_);
49 
50  TLOG(TLVL_INFO) << "Resuming First Sender";
51  pause_first_sender_ = false;
52  usleep_for_n_buffer_epochs_(2);
53 
54  stop_test_();
55  TLOG(TLVL_INFO) << "TestSenderPause END";
56 }
57 
59 {
60  TLOG(TLVL_INFO) << "TestReceiverPause BEGIN";
61  start_test_();
62  usleep_for_n_buffer_epochs_(2);
63 
64  TLOG(TLVL_INFO) << "Pausing Recevier";
65  pause_receiver_ = true;
66  usleep_for_n_buffer_epochs_(2);
67  usleep(2 * event_buffer_timeout_us_);
68 
69  TLOG(TLVL_INFO) << "Resuming Receiver";
70  pause_receiver_ = false;
71  usleep_for_n_buffer_epochs_(2);
72 
73  stop_test_();
74  TLOG(TLVL_INFO) << "TestReceiverPause END";
75 }
76 
78 {
79  TLOG(TLVL_INFO) << "TestSenderReconnect BEGIN";
80  start_test_();
81  usleep_for_n_buffer_epochs_(2);
82 
83  TLOG(TLVL_INFO) << "Killing first Sender";
84  kill_first_sender_ = true;
85  if (sender_threads_[0].joinable())
86  {
87  sender_threads_[0].join();
88  }
89  kill_first_sender_ = false;
90 
91  usleep_for_n_buffer_epochs_(2);
92  usleep(2 * event_buffer_timeout_us_);
93 
94  TLOG(TLVL_INFO) << "Restarting First Sender";
95  boost::thread::attributes attrs;
96  attrs.set_stack_size(4096 * 2000); // 2000 KB
97  try
98  {
99  sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_, this, 0));
100  }
101  catch (const boost::exception& e)
102  {
103  TLOG(TLVL_ERROR) << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
104  std::cerr << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
105  exit(5);
106  }
107 
108  usleep_for_n_buffer_epochs_(2);
109 
110  stop_test_();
111  TLOG(TLVL_INFO) << "TestSenderReconnect END";
112 }
113 
115 {
116  TLOG(TLVL_INFO) << "TestReceiverReconnect BEGIN";
117  send_throttle_us_ = send_throttle_us;
118  start_test_();
119  usleep_for_n_buffer_epochs_(2);
120 
121  TLOG(TLVL_INFO) << "Killing Receiver";
122  kill_receiver_ = true;
123  if (receiver_threads_[0].joinable())
124  {
125  receiver_threads_[0].join();
126  }
127  if (receiver_threads_[1].joinable())
128  {
129  receiver_threads_[1].join();
130  }
131  kill_receiver_ = false;
132 
133  usleep_for_n_buffer_epochs_(2);
134  usleep(2 * event_buffer_timeout_us_);
135 
136  TLOG(TLVL_INFO) << "Restarting Receiver";
137  boost::thread::attributes attrs;
138  attrs.set_stack_size(4096 * 2000); // 2000 KB
139  try
140  {
141  receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 0, 2));
142  receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 1, 2));
143  }
144  catch (const boost::exception& e)
145  {
146  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
147  std::cerr << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
148  exit(5);
149  }
150 
151  usleep_for_n_buffer_epochs_(2);
152 
153  stop_test_();
154  TLOG(TLVL_INFO) << "TestReceiverReconnect END";
155 }
156 
157 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(int sender_rank, int receiver_rank, const std::string& name)
158 {
159  auto thePs = ps_.get<fhicl::ParameterSet>("default_transfer_ps", fhicl::ParameterSet());
160 
161  thePs.put_or_replace("transferPluginType", ps_.get<std::string>("transfer_to_use", "Shmem"));
162  thePs.put_or_replace("destination_rank", receiver_rank);
163  thePs.put_or_replace("source_rank", sender_rank);
164  thePs.put_or_replace("buffer_count", transfer_buffer_count_);
165  if (!thePs.has_key("max_fragment_size_words"))
166  {
167  thePs.put("max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
168  }
169  fhicl::ParameterSet outputPs;
170 
171  TLOG(TLVL_INFO) << "Configuring transfer between " << sender_rank << " and " << receiver_rank << " with ParameterSet: " << thePs.to_string();
172 
173  outputPs.put(name, thePs);
174  return outputPs;
175 }
176 
177 void artdaqtest::BrokenTransferTest::start_test_()
178 {
179  TLOG(TLVL_DEBUG) << "start_test_ BEGIN";
180 
181  sender_ready_[0] = false;
182  sender_ready_[1] = false;
183 
184  receiver_ready_[0] = false;
185  receiver_ready_[1] = false;
186 
187  sender_current_fragment_[0] = 0;
188  sender_current_fragment_[1] = 0;
189 
190  test_start_time_ = std::chrono::steady_clock::now();
191  test_end_time_ = std::chrono::steady_clock::now();
192 
193  test_end_requested_ = false;
194  pause_first_sender_ = false;
195  pause_receiver_ = false;
196  kill_first_sender_ = false;
197  kill_receiver_ = false;
198 
199  event_buffer_.clear();
200  complete_events_.clear();
201  timeout_events_.clear();
202 
203  TLOG(TLVL_DEBUG) << "start_test_: Starting receiver threads";
204  boost::thread::attributes attrs;
205  attrs.set_stack_size(4096 * 2000); // 2000 KB
206  try
207  {
208  receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 0, 2));
209  receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 1, 2));
210  }
211  catch (const boost::exception& e)
212  {
213  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
214  std::cerr << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
215  exit(5);
216  }
217 
218  TLOG(TLVL_DEBUG) << "start_test_: Waiting for receiver_ready_";
219  while (!receiver_ready_[0] || !receiver_ready_[1])
220  {
221  usleep(10000);
222  }
223 
224  TLOG(TLVL_DEBUG) << "start_test_: Starting sender threads";
225  try
226  {
227  sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_, this, 0));
228  sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_, this, 1));
229  }
230  catch (const boost::exception& e)
231  {
232  TLOG(TLVL_ERROR) << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
233  std::cerr << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
234  exit(5);
235  }
236 
237  TLOG(TLVL_DEBUG) << "start_test_: Waiting for sender_ready_";
238  while (!sender_ready_[0] || !sender_ready_[1])
239  {
240  usleep(1000);
241  }
242 
243  TLOG(TLVL_DEBUG) << "start_test_ DONE";
244 }
245 
246 void artdaqtest::BrokenTransferTest::stop_test_()
247 {
248  TLOG(TLVL_DEBUG) << "stop_test_ BEGIN";
249  test_end_time_ = std::chrono::steady_clock::now();
250  test_end_requested_ = true;
251 
252  TLOG(TLVL_DEBUG) << "stop_test_: Waiting for sender threads to shut down";
253  while (sender_ready_[0] || sender_ready_[1])
254  {
255  usleep(1000);
256  }
257 
258  TLOG(TLVL_DEBUG) << "stop_test_: Joining sender threads";
259  if (sender_threads_[0].joinable())
260  {
261  sender_threads_[0].join();
262  }
263  if (sender_threads_[1].joinable())
264  {
265  sender_threads_[1].join();
266  }
267 
268  TLOG(TLVL_DEBUG) << "stop_test_: Waiting for receiver threads to shut down";
269  while (receiver_ready_[0] || receiver_ready_[1])
270  {
271  usleep(1000);
272  }
273 
274  TLOG(TLVL_DEBUG) << "stop_test_: Joining receiver threads";
275  if (receiver_threads_[0].joinable())
276  {
277  receiver_threads_[0].join();
278  }
279  if (receiver_threads_[1].joinable())
280  {
281  receiver_threads_[1].join();
282  }
283 
284  TLOG(TLVL_INFO) << "Sent " << sender_current_fragment_[0] << " events from rank 0 and " << sender_current_fragment_[1] << " events from rank 1.";
285 
286  artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
287  if (sender_current_fragment_[1] > expected_events)
288  {
289  expected_events = sender_current_fragment_[1];
290  }
291 
292  auto complete_events = complete_events_.size();
293  auto incomplete_events = timeout_events_.size();
294  auto missing_events = expected_events - complete_events - incomplete_events;
295 
296  TLOG(TLVL_INFO) << "Received " << complete_events << " complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_), "s")
297  << ", Incomplete: " << incomplete_events << ", Missing: " << missing_events;
298  TLOG(TLVL_DEBUG) << "stop_test_ END";
299 }
300 
301 void artdaqtest::BrokenTransferTest::do_sending_(int sender_rank)
302 {
303  std::unique_ptr<artdaq::TransferInterface> theTransfer = artdaq::MakeTransferPlugin(make_transfer_ps_(sender_rank, 2, "d2"),
305 
306  TLOG(TLVL_DEBUG) << "Sender " << sender_rank << " setting sender_ready_";
307  sender_ready_[sender_rank] = true;
308 
309  while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
310  {
311  if (sender_rank == 0 && kill_first_sender_)
312  {
313  break;
314  }
315  while (sender_rank == 0 && pause_first_sender_)
316  {
317  std::this_thread::yield();
318  usleep(10000);
319  }
320 
321  artdaq::Fragment frag(fragment_size_);
322  frag.setSequenceID(sender_current_fragment_[sender_rank]);
323  frag.setFragmentID(sender_rank);
324  frag.setSystemType(artdaq::Fragment::DataFragmentType);
325 
326  auto start_time = std::chrono::steady_clock::now();
328 
329  if (sender_tokens_[sender_rank].load() == 0)
330  {
331  TLOG(TLVL_INFO) << "Sender " << sender_rank << " waiting for token from receiver";
332  while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
333  if (test_end_requested_)
334  {
335  continue;
336  }
337  TLOG(TLVL_INFO) << "Sender " << sender_rank << " waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time), "s") << " for token from receiver";
338  }
339 
340  if (reliable_mode_)
341  {
342  sts = theTransfer->transfer_fragment_reliable_mode(std::move(frag));
343  }
344  else
345  {
346  sts = theTransfer->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
347  }
348 
350  {
351  TLOG(TLVL_ERROR) << "Error sending Fragment " << sender_current_fragment_[sender_rank] << " from sender rank " << sender_rank << ": "
353  }
354  auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
355  TLOG(TLVL_TRACE) << "Sender " << sender_rank << " Transferred Fragment " << sender_current_fragment_[sender_rank]
356  << " with size " << fragment_size_ << " words in " << fm_(duration, "s")
357  << " (approx " << fm_(static_cast<double>(fragment_size_ * sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration, "B/s")
358  << ") throttle " << send_throttle_us_;
359  ++sender_current_fragment_[sender_rank];
360  sender_tokens_[sender_rank]--;
361  if (send_throttle_us_ != 0)
362  {
363  usleep(send_throttle_us_);
364  }
365  }
366 
367  TLOG(TLVL_DEBUG) << "Sender " << sender_rank << " shutting down...";
368  theTransfer.reset(nullptr);
369  sender_ready_[sender_rank] = false;
370  TLOG(TLVL_DEBUG) << "Sender " << sender_rank << " DONE";
371 }
372 
373 void artdaqtest::BrokenTransferTest::do_receiving_(int sender_rank, int receiver_rank)
374 {
375  std::unique_ptr<artdaq::TransferInterface> theTransfer =
376  artdaq::MakeTransferPlugin(make_transfer_ps_(sender_rank, receiver_rank, "s" + std::to_string(sender_rank)),
377  "s" + std::to_string(sender_rank), artdaq::TransferInterface::Role::kReceive);
378  artdaq::FragmentPtr dropFrag = nullptr;
379 
380  TLOG(TLVL_DEBUG) << "Receiver " << sender_rank << "->" << receiver_rank << " setting receiver_ready_";
381  receiver_ready_[sender_rank] = true;
382  sender_tokens_[sender_rank] = event_buffer_count_;
383 
384  while (!event_buffer_.empty() || !test_end_requested_ || sender_ready_[0] || sender_ready_[1])
385  {
386  if (kill_receiver_)
387  {
388  break;
389  }
390  while (pause_receiver_)
391  {
392  std::this_thread::yield();
393  usleep(10000);
394  }
395 
396  artdaq::detail::RawFragmentHeader hdr;
397  auto rank = theTransfer->receiveFragmentHeader(hdr, 100000);
398 
399  if (rank == artdaq::TransferInterface::RECV_TIMEOUT || event_buffer_.count(hdr.sequence_id) == 0)
400  {
401  std::unique_lock<std::mutex> lk(event_buffer_mutex_);
402  do
403  {
404  event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
405 
406  auto it = event_buffer_.begin();
407  while (it != event_buffer_.end())
408  {
409  if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
410  {
411  TLOG(TLVL_WARNING) << "Receiver " << sender_rank << "->" << receiver_rank << ": Event " << it->first
412  << " has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) << " s, removing...";
413  timeout_events_.insert(it->first);
414  it = event_buffer_.erase(it);
415  sender_tokens_[0]++;
416  sender_tokens_[1]++;
417  }
418  else
419  {
420  ++it;
421  }
422  }
423  } while (event_buffer_.size() > event_buffer_count_);
424  }
425 
426  if (rank != sender_rank)
427  {
428  continue;
429  }
430 
431  artdaq::RawDataType* ptr = nullptr;
432  bool first = true;
433  {
434  std::unique_lock<std::mutex> lk(event_buffer_mutex_);
435  if (timeout_events_.count(hdr.sequence_id) != 0u)
436  {
437  TLOG(TLVL_WARNING) << "Event " << hdr.sequence_id << " has timed out, discarding";
438  if (!dropFrag || dropFrag->size() < hdr.word_count)
439  {
440  dropFrag = std::make_unique<artdaq::Fragment>(hdr.word_count - hdr.num_words());
441  }
442  ptr = dropFrag->headerAddress() + hdr.num_words(); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
443  }
444  else
445  {
446  if (event_buffer_.count(hdr.sequence_id) == 0u)
447  {
448  event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
449  event_buffer_[hdr.sequence_id].first_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
450  ptr = event_buffer_[hdr.sequence_id].first_frag.headerAddress() + hdr.num_words();// NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451  TLOG(TLVL_TRACE) << "Receiver " << sender_rank << "->" << receiver_rank << " opened event " << hdr.sequence_id
452  << " with Fragment from rank " << sender_rank;
453  }
454  else
455  {
456  event_buffer_[hdr.sequence_id].second_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
457  ptr = event_buffer_[hdr.sequence_id].second_frag.headerAddress() + hdr.num_words();// NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458  first = false;
459  }
460  }
461  }
462 
463  rank = theTransfer->receiveFragmentData(ptr, hdr.word_count - hdr.num_words());
464  if (rank != sender_rank)
465  {
466  TLOG(TLVL_ERROR) << "Error receiving Fragment data after header received successfully!";
467  exit(1);
468  }
469 
470  if (!first)
471  {
472  TLOG(TLVL_TRACE) << "Receiver " << sender_rank << "->" << receiver_rank << " completed event " << hdr.sequence_id
473  << " in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time), "s") << ".";
474 
475  std::unique_lock<std::mutex> lk(event_buffer_mutex_);
476  complete_events_.insert(hdr.sequence_id);
477  event_buffer_.erase(hdr.sequence_id);
478  event_buffer_cv_.notify_one();
479  sender_tokens_[0]++;
480  sender_tokens_[1]++;
481  }
482  }
483 
484  TLOG(TLVL_DEBUG) << "Receiver " << sender_rank << "->" << receiver_rank << " shutting down...";
485  theTransfer->flush_buffers();
486 
487  std::lock_guard<std::mutex> lk(event_buffer_mutex_);
488  theTransfer.reset(nullptr);
489  receiver_ready_[sender_rank] = false;
490  TLOG(TLVL_DEBUG) << "Receiver " << sender_rank << "->" << receiver_rank << " DONE";
491 }
492 
493 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
494 {
495  auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
496  if (test_end_requested_)
497  {
498  ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_end_time_) * fragment_rate_hz_ / 1000000);
499  }
500  //TLOG(TLVL_DEBUG) << "sequence_id_target_ is " << ret;
501  return ret;
502 }
503 
504 std::string artdaqtest::BrokenTransferTest::fm_(double data, const std::string& units, int logt)
505 {
506  if (data < 1 && logt > -3)
507  {
508  return fm_(data * 1000, units, logt - 1);
509  }
510  if (data > 1000 && logt < 3)
511  {
512  return fm_(data / 1000, units, logt + 1);
513  }
514 
515  std::stringstream o;
516  o << std::fixed << std::setprecision(2) << data << " ";
517  switch (logt)
518  {
519  case -3:
520  o << "n";
521  break;
522  case -2:
523  o << "u";
524  break;
525  case -1:
526  o << "m";
527  break;
528  case 0:
529  default:
530  break;
531  case 1:
532  o << "K";
533  break;
534  case 2:
535  o << "M";
536  break;
537  case 3:
538  o << "G";
539  break;
540  }
541  o << units;
542  return o.str();
543 }
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(const fhicl::ParameterSet &ps)
BrokenTransferTest Constructor
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
void TestReceiverReconnect(int send_throttle_us=0)
Run the "Receiver Reconnect" test
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test