artdaq  v3_07_02
BrokenTransferTest.cc
1 #include "BrokenTransferTest.hh"
2 
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 
6 #include <thread>
7 #include "artdaq/DAQdata/Globals.hh"
8 #define TRACE_NAME "BrokenTransferTest"
9 
11  : sender_ready_()
12  , receiver_ready_()
13  , sender_current_fragment_()
14  , ps_(ps)
15  , test_start_time_(std::chrono::steady_clock::now())
16  , test_end_time_(std::chrono::steady_clock::now())
17  , test_end_requested_(false)
18  , fragment_rate_hz_(ps.get<size_t>("fragment_rate_hz", 10))
19  , pause_first_sender_(false)
20  , pause_receiver_(false)
21  , kill_first_sender_(false)
22  , kill_receiver_(false)
23  , reliable_mode_(ps.get<bool>("reliable_mode", true))
24  , fragment_size_(ps.get<size_t>("fragment_size", 0x10000))
25  , send_timeout_us_(ps.get<size_t>("send_timeout_us", 100000))
26  , event_buffer_()
27  , timeout_events_()
28  , complete_events_()
29  , transfer_buffer_count_(ps.get<size_t>("transfer_buffer_count", 10))
30  , event_buffer_count_(ps.get<size_t>("event_buffer_count", 20))
31  , event_buffer_timeout_us_(ps.get<size_t>("event_buffer_timeout_us", 1000000))
32  , send_throttle_us_(0)
33 {
34  if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
35  {
36  TLOG(TLVL_WARNING) << "Invalid rate " << fragment_rate_hz_ << " Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) << " Hz";
37  fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
38  }
39 }
40 
42 {
43  TLOG(TLVL_INFO) << "TestSenderPause BEGIN";
44  start_test_();
45  usleep_for_n_buffer_epochs_(2);
46 
47  TLOG(TLVL_INFO) << "Pausing First Sender";
48  pause_first_sender_ = true;
49  usleep_for_n_buffer_epochs_(2);
50  usleep(2 * event_buffer_timeout_us_);
51 
52  TLOG(TLVL_INFO) << "Resuming First Sender";
53  pause_first_sender_ = false;
54  usleep_for_n_buffer_epochs_(2);
55 
56  stop_test_();
57  TLOG(TLVL_INFO) << "TestSenderPause END";
58 }
59 
61 {
62  TLOG(TLVL_INFO) << "TestReceiverPause BEGIN";
63  start_test_();
64  usleep_for_n_buffer_epochs_(2);
65 
66  TLOG(TLVL_INFO) << "Pausing Recevier";
67  pause_receiver_ = true;
68  usleep_for_n_buffer_epochs_(2);
69  usleep(2 * event_buffer_timeout_us_);
70 
71  TLOG(TLVL_INFO) << "Resuming Receiver";
72  pause_receiver_ = false;
73  usleep_for_n_buffer_epochs_(2);
74 
75  stop_test_();
76  TLOG(TLVL_INFO) << "TestReceiverPause END";
77 }
78 
80 {
81  TLOG(TLVL_INFO) << "TestSenderReconnect BEGIN";
82  start_test_();
83  usleep_for_n_buffer_epochs_(2);
84 
85  TLOG(TLVL_INFO) << "Killing first Sender";
86  kill_first_sender_ = true;
87  if (sender_threads_[0].joinable()) sender_threads_[0].join();
88  kill_first_sender_ = false;
89 
90  usleep_for_n_buffer_epochs_(2);
91  usleep(2 * event_buffer_timeout_us_);
92 
93  TLOG(TLVL_INFO) << "Restarting First Sender";
94  boost::thread::attributes attrs;
95  attrs.set_stack_size(4096 * 2000); // 2000 KB
96  try
97  {
98  sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_, this, 0));
99  }
100  catch (const boost::exception& e)
101  {
102  TLOG(TLVL_ERROR) << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
103  std::cerr << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
104  exit(5);
105  }
106 
107  usleep_for_n_buffer_epochs_(2);
108 
109  stop_test_();
110  TLOG(TLVL_INFO) << "TestSenderReconnect END";
111 }
112 
114 {
115  TLOG(TLVL_INFO) << "TestReceiverReconnect BEGIN";
116  send_throttle_us_ = send_throttle_us;
117  start_test_();
118  usleep_for_n_buffer_epochs_(2);
119 
120  TLOG(TLVL_INFO) << "Killing Receiver";
121  kill_receiver_ = true;
122  if (receiver_threads_[0].joinable()) receiver_threads_[0].join();
123  if (receiver_threads_[1].joinable()) receiver_threads_[1].join();
124  kill_receiver_ = false;
125 
126  usleep_for_n_buffer_epochs_(2);
127  usleep(2 * event_buffer_timeout_us_);
128 
129  TLOG(TLVL_INFO) << "Restarting Receiver";
130  boost::thread::attributes attrs;
131  attrs.set_stack_size(4096 * 2000); // 2000 KB
132  try
133  {
134  receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 0, 2));
135  receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 1, 2));
136  }
137  catch (const boost::exception& e)
138  {
139  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
140  std::cerr << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
141  exit(5);
142  }
143 
144  usleep_for_n_buffer_epochs_(2);
145 
146  stop_test_();
147  TLOG(TLVL_INFO) << "TestReceiverReconnect END";
148 }
149 
150 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(int sender_rank, int receiver_rank, std::string name)
151 {
152  fhicl::ParameterSet thePs = ps_.get<fhicl::ParameterSet>("default_transfer_ps", fhicl::ParameterSet());
153 
154  thePs.put_or_replace("transferPluginType", ps_.get<std::string>("transfer_to_use", "Shmem"));
155  thePs.put_or_replace("destination_rank", receiver_rank);
156  thePs.put_or_replace("source_rank", sender_rank);
157  thePs.put_or_replace("buffer_count", transfer_buffer_count_);
158  if (!thePs.has_key("max_fragment_size_words"))
159  {
160  thePs.put("max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
161  }
162  fhicl::ParameterSet outputPs;
163 
164  TLOG(TLVL_INFO) << "Configuring transfer between " << sender_rank << " and " << receiver_rank << " with ParameterSet: " << thePs.to_string();
165 
166  outputPs.put(name, thePs);
167  return outputPs;
168 }
169 
170 void artdaqtest::BrokenTransferTest::start_test_()
171 {
172  TLOG(TLVL_DEBUG) << "start_test_ BEGIN";
173 
174  sender_ready_[0] = false;
175  sender_ready_[1] = false;
176 
177  receiver_ready_[0] = false;
178  receiver_ready_[1] = false;
179 
180  sender_current_fragment_[0] = 0;
181  sender_current_fragment_[1] = 0;
182 
183  test_start_time_ = std::chrono::steady_clock::now();
184  test_end_time_ = std::chrono::steady_clock::now();
185 
186  test_end_requested_ = false;
187  pause_first_sender_ = false;
188  pause_receiver_ = false;
189  kill_first_sender_ = false;
190  kill_receiver_ = false;
191 
192  event_buffer_.clear();
193  complete_events_.clear();
194  timeout_events_.clear();
195 
196  TLOG(TLVL_DEBUG) << "start_test_: Starting receiver threads";
197  boost::thread::attributes attrs;
198  attrs.set_stack_size(4096 * 2000); // 2000 KB
199  try
200  {
201  receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 0, 2));
202  receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_, this, 1, 2));
203  }
204  catch (const boost::exception& e)
205  {
206  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
207  std::cerr << "Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
208  exit(5);
209  }
210 
211  TLOG(TLVL_DEBUG) << "start_test_: Waiting for receiver_ready_";
212  while (!receiver_ready_[0] || !receiver_ready_[1])
213  {
214  usleep(10000);
215  }
216 
217  TLOG(TLVL_DEBUG) << "start_test_: Starting sender threads";
218  try
219  {
220  sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_, this, 0));
221  sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_, this, 1));
222  }
223  catch (const boost::exception& e)
224  {
225  TLOG(TLVL_ERROR) << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
226  std::cerr << "Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
227  exit(5);
228  }
229 
230  TLOG(TLVL_DEBUG) << "start_test_: Waiting for sender_ready_";
231  while (!sender_ready_[0] || !sender_ready_[1])
232  {
233  usleep(1000);
234  }
235 
236  TLOG(TLVL_DEBUG) << "start_test_ DONE";
237 }
238 
239 void artdaqtest::BrokenTransferTest::stop_test_()
240 {
241  TLOG(TLVL_DEBUG) << "stop_test_ BEGIN";
242  test_end_time_ = std::chrono::steady_clock::now();
243  test_end_requested_ = true;
244 
245  TLOG(TLVL_DEBUG) << "stop_test_: Joining sender threads";
246  if (sender_threads_[0].joinable()) sender_threads_[0].join();
247  if (sender_threads_[1].joinable()) sender_threads_[1].join();
248 
249  TLOG(TLVL_DEBUG) << "stop_test_: Joining receiver threads";
250  if (receiver_threads_[0].joinable()) receiver_threads_[0].join();
251  if (receiver_threads_[1].joinable()) receiver_threads_[1].join();
252 
253  TLOG(TLVL_INFO) << "Sent " << sender_current_fragment_[0] << " events from rank 0 and " << sender_current_fragment_[1] << " events from rank 1.";
254 
255  artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
256  if (sender_current_fragment_[1] > expected_events)
257  expected_events = sender_current_fragment_[1];
258 
259  auto complete_events = complete_events_.size();
260  auto incomplete_events = timeout_events_.size();
261  auto missing_events = expected_events - complete_events - incomplete_events;
262 
263  TLOG(TLVL_INFO) << "Received " << complete_events << " complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_), "s")
264  << ", Incomplete: " << incomplete_events << ", Missing: " << missing_events;
265  TLOG(TLVL_DEBUG) << "stop_test_ END";
266 }
267 
268 void artdaqtest::BrokenTransferTest::do_sending_(int sender_rank)
269 {
270  std::unique_ptr<artdaq::TransferInterface> theTransfer = artdaq::MakeTransferPlugin(make_transfer_ps_(sender_rank, 2, "d2"),
272 
273  TLOG(TLVL_DEBUG) << "Sender " << sender_rank << " setting sender_ready_";
274  sender_ready_[sender_rank] = true;
275 
276  while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
277  {
278  if (sender_rank == 0 && kill_first_sender_) break;
279  while (sender_rank == 0 && pause_first_sender_)
280  {
281  std::this_thread::yield();
282  usleep(10000);
283  }
284 
285  artdaq::Fragment frag(fragment_size_);
286  frag.setSequenceID(sender_current_fragment_[sender_rank]);
287  frag.setFragmentID(sender_rank);
288  frag.setSystemType(artdaq::Fragment::DataFragmentType);
289 
290  auto start_time = std::chrono::steady_clock::now();
292 
293  if (sender_tokens_[sender_rank].load() == 0)
294  {
295  TLOG(TLVL_INFO) << "Sender " << sender_rank << " waiting for token from receiver";
296  while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
297  if (test_end_requested_) continue;
298  TLOG(TLVL_INFO) << "Sender " << sender_rank << " waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time), "s") << " for token from receiver";
299  }
300 
301  if (reliable_mode_)
302  {
303  sts = theTransfer->transfer_fragment_reliable_mode(std::move(frag));
304  }
305  else
306  {
307  sts = theTransfer->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
308  }
309 
311  {
312  TLOG(TLVL_ERROR) << "Error sending Fragment " << sender_current_fragment_[sender_rank] << " from sender rank " << sender_rank << ": "
314  }
315  auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
316  TLOG(TLVL_TRACE) << "Sender " << sender_rank << " Transferred Fragment " << sender_current_fragment_[sender_rank]
317  << " with size " << fragment_size_ << " words in " << fm_(duration, "s")
318  << " (approx " << fm_(static_cast<double>(fragment_size_ * sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration, "B/s")
319  << ") throttle " << send_throttle_us_;
320  ++sender_current_fragment_[sender_rank];
321  sender_tokens_[sender_rank]--;
322  if (send_throttle_us_)
323  usleep(send_throttle_us_);
324  }
325 
326  TLOG(TLVL_DEBUG) << "Sender " << sender_rank << " shutting down...";
327  theTransfer.reset(nullptr);
328  TLOG(TLVL_DEBUG) << "Sender " << sender_rank << " DONE";
329 }
330 
331 void artdaqtest::BrokenTransferTest::do_receiving_(int sender_rank, int receiver_rank)
332 {
333  std::unique_ptr<artdaq::TransferInterface> theTransfer =
334  artdaq::MakeTransferPlugin(make_transfer_ps_(sender_rank, receiver_rank, "s" + std::to_string(sender_rank)),
335  "s" + std::to_string(sender_rank), artdaq::TransferInterface::Role::kReceive);
336  artdaq::FragmentPtr dropFrag = nullptr;
337 
338  TLOG(TLVL_DEBUG) << "Receiver " << sender_rank << "->" << receiver_rank << " setting receiver_ready_";
339  receiver_ready_[sender_rank] = true;
340  sender_tokens_[sender_rank] = event_buffer_count_;
341 
342  while (event_buffer_.size() > 0 || !test_end_requested_)
343  {
344  if (kill_receiver_) break;
345  while (pause_receiver_)
346  {
347  std::this_thread::yield();
348  usleep(10000);
349  }
350 
351  artdaq::detail::RawFragmentHeader hdr;
352  auto rank = theTransfer->receiveFragmentHeader(hdr, 100000);
353 
354  if (rank == artdaq::TransferInterface::RECV_TIMEOUT || event_buffer_.count(hdr.sequence_id) == 0)
355  {
356  std::unique_lock<std::mutex> lk(event_buffer_mutex_);
357  do
358  {
359  event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
360 
361  auto it = event_buffer_.begin();
362  while (it != event_buffer_.end())
363  {
364  if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
365  {
366  TLOG(TLVL_WARNING) << "Receiver " << sender_rank << "->" << receiver_rank << ": Event " << it->first
367  << " has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) << " s, removing...";
368  timeout_events_.insert(it->first);
369  it = event_buffer_.erase(it);
370  sender_tokens_[0]++;
371  sender_tokens_[1]++;
372  }
373  else
374  {
375  ++it;
376  }
377  }
378  } while (event_buffer_.size() > event_buffer_count_);
379  }
380 
381  if (rank != sender_rank) continue;
382 
383  artdaq::RawDataType* ptr = nullptr;
384  bool first = true;
385  {
386  std::unique_lock<std::mutex> lk(event_buffer_mutex_);
387  if (timeout_events_.count(hdr.sequence_id))
388  {
389  TLOG(TLVL_WARNING) << "Event " << hdr.sequence_id << " has timed out, discarding";
390  if (!dropFrag || dropFrag->size() < hdr.word_count)
391  {
392  dropFrag.reset(new artdaq::Fragment(hdr.word_count - hdr.num_words()));
393  }
394  ptr = dropFrag->headerAddress() + hdr.num_words();
395  }
396  else
397  {
398  if (!event_buffer_.count(hdr.sequence_id))
399  {
400  event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
401  event_buffer_[hdr.sequence_id].first_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
402  ptr = event_buffer_[hdr.sequence_id].first_frag.headerAddress() + hdr.num_words();
403  TLOG(TLVL_TRACE) << "Receiver " << sender_rank << "->" << receiver_rank << " opened event " << hdr.sequence_id
404  << " with Fragment from rank " << sender_rank;
405  }
406  else
407  {
408  event_buffer_[hdr.sequence_id].second_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
409  ptr = event_buffer_[hdr.sequence_id].second_frag.headerAddress() + hdr.num_words();
410  first = false;
411  }
412  }
413  }
414 
415  rank = theTransfer->receiveFragmentData(ptr, hdr.word_count - hdr.num_words());
416  if (rank != sender_rank)
417  {
418  TLOG(TLVL_ERROR) << "Error receiving Fragment data after header received successfully!";
419  exit(1);
420  }
421 
422  if (!first)
423  {
424  TLOG(TLVL_TRACE) << "Receiver " << sender_rank << "->" << receiver_rank << " completed event " << hdr.sequence_id
425  << " in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time), "s") << ".";
426 
427  std::unique_lock<std::mutex> lk(event_buffer_mutex_);
428  complete_events_.insert(hdr.sequence_id);
429  event_buffer_.erase(hdr.sequence_id);
430  event_buffer_cv_.notify_one();
431  sender_tokens_[0]++;
432  sender_tokens_[1]++;
433  }
434  }
435 
436  TLOG(TLVL_DEBUG) << "Receiver " << sender_rank << "->" << receiver_rank << " shutting down...";
437  theTransfer->flush_buffers();
438 
439  std::lock_guard<std::mutex> lk(event_buffer_mutex_);
440  theTransfer.reset(nullptr);
441  TLOG(TLVL_DEBUG) << "Receiver " << sender_rank << "->" << receiver_rank << " DONE";
442 }
443 
444 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
445 {
446  auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
447  if (test_end_requested_)
448  ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_end_time_) * fragment_rate_hz_ / 1000000);
449  //TLOG(TLVL_DEBUG) << "sequence_id_target_ is " << ret;
450  return ret;
451 }
452 
453 std::string artdaqtest::BrokenTransferTest::fm_(double data, std::string units, int logt)
454 {
455  if (data < 1 && logt > -3)
456  {
457  return fm_(data * 1000, units, logt - 1);
458  }
459  else if (data > 1000 && logt < 3)
460  {
461  return fm_(data / 1000, units, logt + 1);
462  }
463 
464  std::stringstream o;
465  o << std::fixed << std::setprecision(2) << data << " ";
466  switch (logt)
467  {
468  case -3:
469  o << "n";
470  break;
471  case -2:
472  o << "u";
473  break;
474  case -1:
475  o << "m";
476  break;
477  case 0:
478  default:
479  break;
480  case 1:
481  o << "K";
482  break;
483  case 2:
484  o << "M";
485  break;
486  case 3:
487  o << "G";
488  break;
489  }
490  o << units;
491  return o.str();
492 }
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(fhicl::ParameterSet ps)
BrokenTransferTest Constructor
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
void TestReceiverReconnect(int send_throttle_us=0)
Run the "Receiver Reconnect" test
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test