1 #include "BrokenTransferTest.hh"
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
8 #include "artdaq/DAQdata/Globals.hh"
9 #define TRACE_NAME "BrokenTransferTest"
// TRACE levels used by the TLOG() calls in this file, each defined as an offset
// from TLVL_DEBUG: one level per helper (make_transfer_ps_, start_test_,
// stop_test_, do_sending_ and its token wait, do_receiving_).
// NOTE(review): the replacement text is not parenthesized, so using one of these
// macros inside a larger arithmetic expression would not group as (TLVL_DEBUG + N).
11 #define TLVL_MAKE_TRANSFER_PS TLVL_DEBUG + 5
12 #define TLVL_START_TEST TLVL_DEBUG + 6
13 #define TLVL_STOP_TEST TLVL_DEBUG + 7
14 #define TLVL_SENDER TLVL_DEBUG + 8
15 #define TLVL_SENDER_TOKEN_WAIT TLVL_DEBUG + 9
16 #define TLVL_RECEIVER TLVL_DEBUG + 10
// Member initializers and rate validation of the BrokenTransferTest constructor.
// Configurable values are read from the ParameterSet `ps` using the key and
// default given in each get<>() call; the pause/kill flags and the send throttle
// start cleared, and both time stamps start at "now".
21 , sender_current_fragment_()
23 , test_start_time_(std::chrono::steady_clock::now())
24 , test_end_time_(std::chrono::steady_clock::now())
25 , test_end_requested_(false)
26 , fragment_rate_hz_(ps.get<size_t>(
"fragment_rate_hz", 10))
27 , pause_first_sender_(false)
28 , pause_receiver_(false)
29 , kill_first_sender_(false)
30 , kill_receiver_(false)
31 , reliable_mode_(ps.get<bool>(
"reliable_mode", true))
32 , fragment_size_(ps.get<size_t>(
"fragment_size", 0x10000))
33 , send_timeout_us_(ps.get<size_t>(
"send_timeout_us", 100000))
34 , transfer_buffer_count_(ps.get<size_t>(
"transfer_buffer_count", 10))
35 , event_buffer_count_(ps.get<size_t>(
"event_buffer_count", 20))
36 , event_buffer_timeout_us_(ps.get<size_t>(
"event_buffer_timeout_us", 1000000))
37 , send_throttle_us_(0)
// A rate of 0 Hz or above 100000 Hz is rejected with a warning: 0 is replaced
// by 1 Hz and an over-limit value by 1000 Hz.
// NOTE(review): the over-limit replacement (1000) is lower than the accepted
// maximum (100000) -- confirm this is intended.
39 if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
41 TLOG(TLVL_WARNING) <<
"Invalid rate " << fragment_rate_hz_ <<
" Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) <<
" Hz";
42 fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
// Body of the "Sender Paused" test (TestSenderPause): runs normally for a while,
// sets pause_first_sender_, waits, clears the flag again, waits once more, and
// logs the total elapsed time.
48 TLOG(TLVL_INFO) <<
"TestSenderPause BEGIN";
49 auto start_time = std::chrono::steady_clock::now();
51 usleep_for_n_buffer_epochs_(2);
// While this flag is set, do_sending_() for rank 0 spins in a yield loop
// instead of sending.
53 TLOG(TLVL_INFO) <<
"Pausing First Sender";
54 pause_first_sender_ =
true;
55 usleep_for_n_buffer_epochs_(2);
// Stay paused for a further two event-buffer timeouts -- presumably so that
// events opened by the other sender have time to time out; confirm.
56 usleep(2 * event_buffer_timeout_us_);
58 TLOG(TLVL_INFO) <<
"Resuming First Sender";
59 pause_first_sender_ =
false;
60 usleep_for_n_buffer_epochs_(2);
63 TLOG(TLVL_INFO) <<
"TestSenderPause END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
// Body of the "Receiver Paused" test (TestReceiverPause): runs normally for a
// while, sets pause_receiver_, waits, clears the flag again, waits once more,
// and logs the total elapsed time.
68 TLOG(TLVL_INFO) <<
"TestReceiverPause BEGIN";
69 auto start_time = std::chrono::steady_clock::now();
71 usleep_for_n_buffer_epochs_(2);
// While this flag is set, do_receiving_() spins in a yield loop.
// NOTE(review): the log text below spells "Recevier"; it is runtime output and
// is left unchanged here.
73 TLOG(TLVL_INFO) <<
"Pausing Recevier";
74 pause_receiver_ =
true;
75 usleep_for_n_buffer_epochs_(2);
// Stay paused for a further two event-buffer timeouts before resuming.
76 usleep(2 * event_buffer_timeout_us_);
78 TLOG(TLVL_INFO) <<
"Resuming Receiver";
79 pause_receiver_ =
false;
80 usleep_for_n_buffer_epochs_(2);
83 TLOG(TLVL_INFO) <<
"TestReceiverPause END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
// Body of the "Sender Reconnect" test (TestSenderReconnect): asks the rank-0
// sender thread to exit, joins it, waits, then starts a fresh rank-0 sender
// thread and logs the total elapsed time.
88 TLOG(TLVL_INFO) <<
"TestSenderReconnect BEGIN";
89 auto start_time = std::chrono::steady_clock::now();
91 usleep_for_n_buffer_epochs_(2);
// do_sending_() checks kill_first_sender_ for rank 0 on every loop iteration.
93 TLOG(TLVL_INFO) <<
"Killing first Sender";
94 kill_first_sender_ =
true;
95 if (sender_threads_[0].joinable())
97 sender_threads_[0].join();
// Clear the request once the old thread is gone so the replacement thread
// does not see it.
99 kill_first_sender_ =
false;
101 usleep_for_n_buffer_epochs_(2);
102 usleep(2 * event_buffer_timeout_us_);
104 TLOG(TLVL_INFO) <<
"Restarting First Sender";
// The replacement thread is created with an explicit stack size of
// 4096 * 2000 and runs do_sending_(0).
105 boost::thread::attributes attrs;
106 attrs.set_stack_size(4096 * 2000);
109 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
// A thread-creation failure is reported both through TLOG and on std::cerr.
111 catch (
const boost::exception& e)
113 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
114 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
118 usleep_for_n_buffer_epochs_(2);
121 TLOG(TLVL_INFO) <<
"TestSenderReconnect END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
// Body of the "Receiver Reconnect" test (TestReceiverReconnect): optionally
// enables sender throttling, asks both receiver threads to exit, joins them,
// waits, then starts two fresh receiver threads and logs the elapsed time.
126 TLOG(TLVL_INFO) <<
"TestReceiverReconnect BEGIN";
127 auto start_time = std::chrono::steady_clock::now();
// Throttle delay = send_throttle_factor fragment periods, in microseconds.
// A result of 0 leaves throttling off (throttle_sender_() tests for non-zero).
// NOTE(review): if send_throttle_factor is an int, the multiplication by
// 1000000 is done in int before the division -- confirm large or negative
// factors cannot reach this line.
128 send_throttle_us_ = send_throttle_factor * 1000000 / fragment_rate_hz_;
130 usleep_for_n_buffer_epochs_(2);
132 TLOG(TLVL_INFO) <<
"Killing Receiver duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
134 kill_receiver_ =
true;
135 if (receiver_threads_[0].joinable())
137 receiver_threads_[0].join();
139 if (receiver_threads_[1].joinable())
141 receiver_threads_[1].join();
// Clear the request once both old threads are gone so the replacements do
// not see it.
143 kill_receiver_ =
false;
145 usleep_for_n_buffer_epochs_(2);
146 usleep(2 * event_buffer_timeout_us_);
148 TLOG(TLVL_INFO) <<
"Restarting Receiver duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
// Replacement threads run do_receiving_(0, 2) and do_receiving_(1, 2), i.e.
// one per sender rank, both with receiver rank 2.
149 boost::thread::attributes attrs;
150 attrs.set_stack_size(4096 * 2000);
153 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
154 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
// A thread-creation failure is reported both through TLOG and on std::cerr.
156 catch (
const boost::exception& e)
158 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
159 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
163 usleep_for_n_buffer_epochs_(2);
165 TLOG(TLVL_INFO) <<
"Stopping test, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
167 TLOG(TLVL_INFO) <<
"TestReceiverReconnect END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
// Build the ParameterSet describing one transfer between two ranks.
//   sender_rank   -- stored under "source_rank"
//   receiver_rank -- stored under "destination_rank"
//   name          -- key under which the transfer settings are placed in the
//                    outer ParameterSet (outputPs)
170 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(
int sender_rank,
int receiver_rank,
const std::string& name)
// Start from the optional "default_transfer_ps" table of the test
// configuration (empty if absent), then override the per-link settings.
172 auto thePs = ps_.get<fhicl::ParameterSet>(
"default_transfer_ps", fhicl::ParameterSet());
// Plugin type comes from "transfer_to_use", defaulting to "Shmem".
174 thePs.put_or_replace(
"transferPluginType", ps_.get<std::string>(
"transfer_to_use",
"Shmem"));
175 thePs.put_or_replace(
"destination_rank", receiver_rank);
176 thePs.put_or_replace(
"source_rank", sender_rank);
177 thePs.put_or_replace(
"buffer_count", transfer_buffer_count_);
// Only supply a maximum fragment size when the defaults did not: payload
// size plus the RawFragmentHeader word count plus one extra word.
178 if (!thePs.has_key(
"max_fragment_size_words"))
180 thePs.put(
"max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
182 fhicl::ParameterSet outputPs;
184 TLOG(TLVL_MAKE_TRANSFER_PS) <<
"Configuring transfer between " << sender_rank <<
" and " << receiver_rank <<
" with ParameterSet: " << thePs.to_string();
// Nest the transfer settings under `name`; outputPs is presumably the value
// returned to the caller -- confirm.
186 outputPs.put(name, thePs);
// Reset all per-test state, then start the two receiver threads followed by
// the two sender threads, waiting after each pair until both members have set
// their ready flag.
190 void artdaqtest::BrokenTransferTest::start_test_()
192 TLOG(TLVL_START_TEST) <<
"start_test_ BEGIN";
// Clear the ready flags and fragment counters for both sender ranks.
194 sender_ready_[0] =
false;
195 sender_ready_[1] =
false;
197 receiver_ready_[0] =
false;
198 receiver_ready_[1] =
false;
200 sender_current_fragment_[0] = 0;
201 sender_current_fragment_[1] = 0;
// sequence_id_target_() measures elapsed time from test_start_time_.
203 test_start_time_ = std::chrono::steady_clock::now();
204 test_end_time_ = std::chrono::steady_clock::now();
// Clear every control flag the Test* methods may have left set.
206 test_end_requested_ =
false;
207 pause_first_sender_ =
false;
208 pause_receiver_ =
false;
209 kill_first_sender_ =
false;
210 kill_receiver_ =
false;
// Discard event bookkeeping from any previous test.
212 event_buffer_.clear();
213 complete_events_.clear();
214 timeout_events_.clear();
216 TLOG(TLVL_START_TEST) <<
"start_test_: Starting receiver threads";
// All worker threads are created with an explicit stack size of 4096 * 2000.
217 boost::thread::attributes attrs;
218 attrs.set_stack_size(4096 * 2000);
// One receiver per sender rank (0 and 1), both with receiver rank 2.
221 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
222 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
// A thread-creation failure is reported both through TLOG and on std::cerr.
224 catch (
const boost::exception& e)
226 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
227 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Receivers must be ready before any sender starts.
231 TLOG(TLVL_START_TEST) <<
"start_test_: Waiting for receiver_ready_";
232 while (!receiver_ready_[0] || !receiver_ready_[1])
237 TLOG(TLVL_START_TEST) <<
"start_test_: Starting sender threads";
240 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
241 sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 1));
243 catch (
const boost::exception& e)
245 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
246 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
250 TLOG(TLVL_START_TEST) <<
"start_test_: Waiting for sender_ready_";
251 while (!sender_ready_[0] || !sender_ready_[1])
256 TLOG(TLVL_START_TEST) <<
"start_test_ DONE";
// Request the end of the test, wait for and join the sender threads and then
// the receiver threads, and log a summary of sent, complete, incomplete and
// missing events.
259 void artdaqtest::BrokenTransferTest::stop_test_()
261 TLOG(TLVL_STOP_TEST) <<
"stop_test_ BEGIN";
// Record the end time before raising the flag: once test_end_requested_ is
// set, sequence_id_target_() computes its value from test_end_time_.
262 test_end_time_ = std::chrono::steady_clock::now();
263 test_end_requested_ =
true;
// Senders clear their ready flag as the last step of do_sending_().
265 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Waiting for sender threads to shut down";
266 while (sender_ready_[0] || sender_ready_[1])
271 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Joining sender threads";
272 if (sender_threads_[0].joinable())
274 sender_threads_[0].join();
276 if (sender_threads_[1].joinable())
278 sender_threads_[1].join();
// Receivers keep running while either sender is still marked ready, so they
// are waited for only after the senders have gone.
281 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Waiting for receiver threads to shut down";
282 while (receiver_ready_[0] || receiver_ready_[1])
287 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Joining receiver threads";
288 if (receiver_threads_[0].joinable())
290 receiver_threads_[0].join();
292 if (receiver_threads_[1].joinable())
294 receiver_threads_[1].join();
297 TLOG(TLVL_INFO) <<
"Sent " << sender_current_fragment_[0] <<
" events from rank 0 and " << sender_current_fragment_[1] <<
" events from rank 1.";
// The expected event count is the larger of the two senders' fragment counts.
299 artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
300 if (sender_current_fragment_[1] > expected_events)
302 expected_events = sender_current_fragment_[1];
305 auto complete_events = complete_events_.size();
306 auto incomplete_events = timeout_events_.size();
// NOTE(review): these operands look unsigned; if complete + incomplete ever
// exceeded expected_events the difference would wrap -- confirm.
307 auto missing_events = expected_events - complete_events - incomplete_events;
309 TLOG(TLVL_INFO) <<
"Received " << complete_events <<
" complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_),
"s")
310 <<
", Incomplete: " << incomplete_events <<
", Missing: " << missing_events;
311 TLOG(TLVL_STOP_TEST) <<
"stop_test_ END";
// Sender thread body for one sender rank (0 or 1).
// Marks the rank ready, then produces Fragments with increasing sequence IDs
// until the end of the test has been requested AND this rank has reached
// sequence_id_target_(); finally releases the transfer object and clears the
// ready flag.
314 void artdaqtest::BrokenTransferTest::do_sending_(
int sender_rank)
319 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" setting sender_ready_";
320 sender_ready_[sender_rank] =
true;
// Keep going while behind the target or while no end has been requested.
322 while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
// Rank 0 only: honour the kill request set by TestSenderReconnect
// (presumably by leaving the loop -- confirm).
324 if (sender_rank == 0 && kill_first_sender_)
// Rank 0 only: spin without sending while TestSenderPause holds the flag.
328 while (sender_rank == 0 && pause_first_sender_)
330 std::this_thread::yield();
// Build the next data Fragment: payload of fragment_size_, sequence ID equal
// to this rank's running counter, fragment ID equal to the sender rank.
334 artdaq::Fragment frag(fragment_size_);
335 frag.setSequenceID(sender_current_fragment_[sender_rank]);
336 frag.setFragmentID(sender_rank);
337 frag.setSystemType(artdaq::Fragment::DataFragmentType);
339 auto start_time = std::chrono::steady_clock::now();
// Flow control: with no tokens left, poll every 10 ms until the token count
// for this rank becomes non-zero or the test is ending.
342 if (sender_tokens_[sender_rank].load() == 0)
344 TLOG(TLVL_SENDER_TOKEN_WAIT) <<
"Sender " << sender_rank <<
" waiting for token from receiver";
345 while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
346 if (test_end_requested_)
350 TLOG(TLVL_SENDER_TOKEN_WAIT) <<
"Sender " << sender_rank <<
" waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time),
"s") <<
" for token from receiver";
364 TLOG(TLVL_ERROR) <<
"Error sending Fragment " << sender_current_fragment_[sender_rank] <<
" from sender rank " << sender_rank <<
": "
// The logged rate is payload bytes (fragment_size_ words times the word
// size) divided by the time elapsed since start_time.
367 auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
368 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" Transferred Fragment " << sender_current_fragment_[sender_rank]
369 <<
" with size " << fragment_size_ <<
" words in " << fm_(duration,
"s")
370 <<
" (approx " << fm_(static_cast<double>(fragment_size_ *
sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration,
"B/s")
371 <<
") throttle " << send_throttle_us_;
// Advance the sequence counter, consume one token, then apply the optional
// throttle delay.
372 ++sender_current_fragment_[sender_rank];
373 sender_tokens_[sender_rank]--;
374 throttle_sender_(sender_rank);
377 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" shutting down...";
// Release the transfer object before clearing the ready flag that
// stop_test_() waits on.
378 theTransfer.reset(
nullptr);
379 sender_ready_[sender_rank] =
false;
380 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" DONE";
// Receiver thread body for the link sender_rank -> receiver_rank.
// Marks itself ready, sets the sender's token count to event_buffer_count_,
// and then loops until the test is ending, both senders have cleared their
// ready flags and event_buffer_ is empty. Finally releases the transfer object
// and clears its ready flag.
383 void artdaqtest::BrokenTransferTest::do_receiving_(
int sender_rank,
int receiver_rank)
385 std::unique_ptr<artdaq::TransferInterface> theTransfer =
// Scratch Fragment used as the destination for data belonging to events that
// have already timed out.
388 artdaq::FragmentPtr dropFrag =
nullptr;
390 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" setting receiver_ready_";
391 receiver_ready_[sender_rank] =
true;
392 sender_tokens_[sender_rank] = event_buffer_count_;
394 while (!event_buffer_.empty() || !test_end_requested_ || sender_ready_[0] || sender_ready_[1])
// Spin without receiving while TestReceiverPause holds the flag.
400 while (pause_receiver_)
402 std::this_thread::yield();
406 artdaq::detail::RawFragmentHeader hdr;
// Under the event-buffer lock: wait up to 10 ms on the condition variable,
// then move every event older than event_buffer_timeout_us_ from
// event_buffer_ to timeout_events_. The sweep repeats while the buffer holds
// more than event_buffer_count_ events.
411 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
414 event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
416 auto it = event_buffer_.begin();
417 while (it != event_buffer_.end())
419 if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
421 TLOG(TLVL_WARNING) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
": Event " << it->first
422 <<
" has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) <<
" s, removing...";
423 timeout_events_.insert(it->first);
// erase() returns the iterator to the next element, so the sweep continues
// safely after a removal.
424 it = event_buffer_.erase(it);
433 }
while (event_buffer_.size() > event_buffer_count_);
// `rank` is presumably the result of the header receive; a value other than
// sender_rank is treated as "nothing received" -- confirm.
436 if (rank != sender_rank)
// Choose where the Fragment payload should be written, under the lock.
441 artdaq::RawDataType* ptr =
nullptr;
444 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
// Case 1: the event already timed out -- receive into the scratch Fragment,
// (re)allocating it when it is missing or too small for this header.
445 if (timeout_events_.count(hdr.sequence_id) != 0u)
447 TLOG(TLVL_WARNING) <<
"Event " << hdr.sequence_id <<
" has timed out, discarding";
448 if (!dropFrag || dropFrag->size() < hdr.word_count)
450 dropFrag = std::make_unique<artdaq::Fragment>(hdr.word_count - hdr.num_words());
452 ptr = dropFrag->headerAddress() + hdr.num_words();
// Case 2: first Fragment of a new event -- open the event, stamp its open
// time and receive into first_frag.
456 if (event_buffer_.count(hdr.sequence_id) == 0u)
458 event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
459 event_buffer_[hdr.sequence_id].first_frag.reset(
new artdaq::Fragment(hdr.word_count - hdr.num_words()));
460 ptr = event_buffer_[hdr.sequence_id].first_frag->headerAddress() + hdr.num_words();
461 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" opened event " << hdr.sequence_id
462 <<
" with Fragment from rank " << sender_rank;
// Case 3 (presumably the event is already open): receive into second_frag.
466 event_buffer_[hdr.sequence_id].second_frag.reset(
new artdaq::Fragment(hdr.word_count - hdr.num_words()));
467 ptr = event_buffer_[hdr.sequence_id].second_frag->headerAddress() + hdr.num_words();
474 if (rank != sender_rank)
476 TLOG(TLVL_ERROR) <<
"Error receiving Fragment data after header received successfully!";
// NOTE(review): this log statement reads event_buffer_ before the lock on the
// following statement is taken -- confirm whether an enclosing lock covers it.
482 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" completed event " << hdr.sequence_id
483 <<
" in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time),
"s") <<
".";
// Record the completed event, drop it from the buffer and wake one waiter on
// the condition variable.
485 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
486 complete_events_.insert(hdr.sequence_id);
487 event_buffer_.erase(hdr.sequence_id);
488 event_buffer_cv_.notify_one();
494 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" shutting down...";
// The transfer object is released while holding the event-buffer lock.
497 std::lock_guard<std::mutex> lk(event_buffer_mutex_);
498 theTransfer.reset(
nullptr);
499 receiver_ready_[sender_rank] =
false;
500 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" DONE";
// Optionally delay a sender after it has sent a Fragment.
// Sleeps for send_throttle_us_ when throttling is enabled (non-zero) and the
// sender's fragment counter is within fragment_rate_hz_ of the current
// sequence_id_target_().
//   sender_rank -- index of the sender whose fragment counter is checked
503 void artdaqtest::BrokenTransferTest::throttle_sender_(
int sender_rank)
// NOTE(review): if both operands of the subtraction are unsigned, it wraps
// while the target is still smaller than fragment_rate_hz_, which would make
// the comparison false during that period -- confirm this is acceptable.
505 if (send_throttle_us_ != 0 && sender_current_fragment_[sender_rank] >= sequence_id_target_() - fragment_rate_hz_)
507 usleep(send_throttle_us_);
// Compute the sequence ID a sender should have reached by now:
// 1 + elapsed microseconds since test_start_time_ * fragment_rate_hz_ / 1000000.
511 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
513 auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
// Once the end has been requested the target is frozen: elapsed time is
// measured up to test_end_time_ instead of "now".
514 if (test_end_requested_)
516 ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_, test_end_time_) * fragment_rate_hz_ / 1000000);
// Format a value together with a unit string, rescaling it by powers of 1000.
//   data  -- value to format
//   units -- unit label (callers in this file pass "s" and "B/s")
//   logt  -- power of 1000 already applied to data; the recursion keeps it
//            within -3..3
522 std::string artdaqtest::BrokenTransferTest::fm_(
double data,
const std::string& units,
int logt)
// Too small: scale up by 1000 and step logt down.
524 if (data < 1 && logt > -3)
526 return fm_(data * 1000, units, logt - 1);
// Too large: scale down by 1000 and step logt up.
528 if (data > 1000 && logt < 3)
530 return fm_(data / 1000, units, logt + 1);
// Emit the scaled value with two fixed decimals followed by a space.
534 o << std::fixed << std::setprecision(2) << data <<
" ";
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(const fhicl::ParameterSet &ps)
BrokenTransferTest Constructor
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
void TestReceiverReconnect(int send_throttle_factor=0)
Run the "Receiver Reconnect" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test