1 #include "BrokenTransferTest.hh"
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
8 #include "artdaq/DAQdata/Globals.hh"
9 #define TRACE_NAME "BrokenTransferTest"
11 #define TLVL_MAKE_TRANSFER_PS TLVL_DEBUG + 5
12 #define TLVL_START_TEST TLVL_DEBUG + 6
13 #define TLVL_STOP_TEST TLVL_DEBUG + 7
14 #define TLVL_SENDER TLVL_DEBUG + 8
15 #define TLVL_SENDER_TOKEN_WAIT TLVL_DEBUG + 9
16 #define TLVL_RECEIVER TLVL_DEBUG + 10
21 , sender_current_fragment_()
23 , test_start_time_(std::chrono::steady_clock::now())
24 , test_end_time_(std::chrono::steady_clock::now())
25 , test_end_requested_(false)
26 , fragment_rate_hz_(ps.get<size_t>(
"fragment_rate_hz", 10))
27 , pause_first_sender_(false)
28 , pause_receiver_(false)
29 , kill_first_sender_(false)
30 , kill_receiver_(false)
31 , reliable_mode_(ps.get<bool>(
"reliable_mode", true))
32 , fragment_size_(ps.get<size_t>(
"fragment_size", 0x10000))
33 , send_timeout_us_(ps.get<size_t>(
"send_timeout_us", 100000))
34 , transfer_buffer_count_(ps.get<size_t>(
"transfer_buffer_count", 10))
35 , event_buffer_count_(ps.get<size_t>(
"event_buffer_count", 20))
36 , event_buffer_timeout_us_(ps.get<size_t>(
"event_buffer_timeout_us", 1000000))
37 , send_throttle_us_(0)
39 if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
41 TLOG(TLVL_WARNING) <<
"Invalid rate " << fragment_rate_hz_ <<
" Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) <<
" Hz";
42 fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
48 TLOG(TLVL_INFO) <<
"TestSenderPause BEGIN";
49 auto start_time = std::chrono::steady_clock::now();
51 usleep_for_n_buffer_epochs_(2);
53 TLOG(TLVL_INFO) <<
"Pausing First Sender";
54 pause_first_sender_ =
true;
55 usleep_for_n_buffer_epochs_(2);
56 usleep(2 * event_buffer_timeout_us_);
58 TLOG(TLVL_INFO) <<
"Resuming First Sender";
59 pause_first_sender_ =
false;
60 usleep_for_n_buffer_epochs_(2);
63 TLOG(TLVL_INFO) <<
"TestSenderPause END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
68 TLOG(TLVL_INFO) <<
"TestReceiverPause BEGIN";
69 auto start_time = std::chrono::steady_clock::now();
71 usleep_for_n_buffer_epochs_(2);
73 TLOG(TLVL_INFO) <<
"Pausing Recevier";
74 pause_receiver_ =
true;
75 usleep_for_n_buffer_epochs_(2);
76 usleep(2 * event_buffer_timeout_us_);
78 TLOG(TLVL_INFO) <<
"Resuming Receiver";
79 pause_receiver_ =
false;
80 usleep_for_n_buffer_epochs_(2);
83 TLOG(TLVL_INFO) <<
"TestReceiverPause END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
88 TLOG(TLVL_INFO) <<
"TestSenderReconnect BEGIN";
89 auto start_time = std::chrono::steady_clock::now();
91 usleep_for_n_buffer_epochs_(2);
93 TLOG(TLVL_INFO) <<
"Killing first Sender";
94 kill_first_sender_ =
true;
95 if (sender_threads_[0].joinable())
97 sender_threads_[0].join();
99 kill_first_sender_ =
false;
101 usleep_for_n_buffer_epochs_(2);
102 usleep(2 * event_buffer_timeout_us_);
104 TLOG(TLVL_INFO) <<
"Restarting First Sender";
105 boost::thread::attributes attrs;
106 attrs.set_stack_size(4096 * 2000);
109 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
111 catch (
const boost::exception& e)
113 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
114 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
118 usleep_for_n_buffer_epochs_(2);
121 TLOG(TLVL_INFO) <<
"TestSenderReconnect END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
126 TLOG(TLVL_INFO) <<
"TestReceiverReconnect BEGIN";
127 auto start_time = std::chrono::steady_clock::now();
128 send_throttle_us_ = send_throttle_factor * 1000000 / fragment_rate_hz_;
130 usleep_for_n_buffer_epochs_(2);
132 TLOG(TLVL_INFO) <<
"Killing Receiver duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);;
133 kill_receiver_ =
true;
134 if (receiver_threads_[0].joinable())
136 receiver_threads_[0].join();
138 if (receiver_threads_[1].joinable())
140 receiver_threads_[1].join();
142 kill_receiver_ =
false;
144 usleep_for_n_buffer_epochs_(2);
145 usleep(2 * event_buffer_timeout_us_);
147 TLOG(TLVL_INFO) <<
"Restarting Receiver duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
148 boost::thread::attributes attrs;
149 attrs.set_stack_size(4096 * 2000);
152 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
153 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
155 catch (
const boost::exception& e)
157 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
158 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
162 usleep_for_n_buffer_epochs_(2);
164 TLOG(TLVL_INFO) <<
"Stopping test, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
166 TLOG(TLVL_INFO) <<
"TestReceiverReconnect END, duration=" << artdaq::TimeUtils::GetElapsedTime(start_time);
// Builds the configuration for a single sender -> receiver transfer.
// Starts from the optional "default_transfer_ps" table of ps_ (empty if absent) and then
// forces the plugin type, the two ranks and the buffer count. The result is stored in
// outputPs under the key `name`.
// sender_rank   - value written to "source_rank"
// receiver_rank - value written to "destination_rank"
// name          - key under which the transfer configuration is placed in outputPs
// NOTE(review): the braces and the return statement are not visible in this listing;
// presumably outputPs is returned - confirm against the original file.
169 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(
int sender_rank,
int receiver_rank,
const std::string& name)
171 auto thePs = ps_.get<fhicl::ParameterSet>(
"default_transfer_ps", fhicl::ParameterSet());
// Plugin type comes from "transfer_to_use" and falls back to "Shmem".
173 thePs.put_or_replace(
"transferPluginType", ps_.get<std::string>(
"transfer_to_use",
"Shmem"));
174 thePs.put_or_replace(
"destination_rank", receiver_rank);
175 thePs.put_or_replace(
"source_rank", sender_rank);
176 thePs.put_or_replace(
"buffer_count", transfer_buffer_count_);
// Only supply a maximum fragment size when the default table did not already set one:
// fragment_size_ plus the header word count plus one.
177 if (!thePs.has_key(
"max_fragment_size_words"))
179 thePs.put(
"max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
181 fhicl::ParameterSet outputPs;
183 TLOG(TLVL_MAKE_TRANSFER_PS) <<
"Configuring transfer between " << sender_rank <<
" and " << receiver_rank <<
" with ParameterSet: " << thePs.to_string();
// Nest the transfer configuration under the caller-supplied key.
185 outputPs.put(name, thePs);
// Resets all per-test state and launches the two receiver threads followed by the two
// sender threads, waiting for each pair to report ready before continuing.
// NOTE(review): the `try` lines and the bodies of the two wait loops are not visible in
// this listing; only the matching `catch` handlers and loop conditions are.
189 void artdaqtest::BrokenTransferTest::start_test_()
191 TLOG(TLVL_START_TEST) <<
"start_test_ BEGIN";
// Clear the ready flags and per-sender fragment counters left over from any previous run.
193 sender_ready_[0] =
false;
194 sender_ready_[1] =
false;
196 receiver_ready_[0] =
false;
197 receiver_ready_[1] =
false;
199 sender_current_fragment_[0] = 0;
200 sender_current_fragment_[1] = 0;
// Both timestamps are set to "now"; stop_test_ overwrites test_end_time_ later.
202 test_start_time_ = std::chrono::steady_clock::now();
203 test_end_time_ = std::chrono::steady_clock::now();
// Reset every control flag the Test* methods may have toggled.
205 test_end_requested_ =
false;
206 pause_first_sender_ =
false;
207 pause_receiver_ =
false;
208 kill_first_sender_ =
false;
209 kill_receiver_ =
false;
// Drop bookkeeping of in-flight, completed and timed-out events.
211 event_buffer_.clear();
212 complete_events_.clear();
213 timeout_events_.clear();
215 TLOG(TLVL_START_TEST) <<
"start_test_: Starting receiver threads";
// The same thread attributes (explicit stack size) are reused for all four threads below.
216 boost::thread::attributes attrs;
217 attrs.set_stack_size(4096 * 2000);
// One receiver thread per sender rank (0 and 1), both receiving as rank 2.
220 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
221 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
// Thread start failures are logged and echoed to stderr; nothing visible here rethrows.
223 catch (
const boost::exception& e)
225 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
226 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
230 TLOG(TLVL_START_TEST) <<
"start_test_: Waiting for receiver_ready_";
// Receivers must both be ready before any sender is started.
231 while (!receiver_ready_[0] || !receiver_ready_[1])
236 TLOG(TLVL_START_TEST) <<
"start_test_: Starting sender threads";
239 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
240 sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 1));
242 catch (
const boost::exception& e)
244 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
245 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
249 TLOG(TLVL_START_TEST) <<
"start_test_: Waiting for sender_ready_";
// Return only once both senders have set their ready flag.
250 while (!sender_ready_[0] || !sender_ready_[1])
255 TLOG(TLVL_START_TEST) <<
"start_test_ DONE";
// Requests the end of the running test, waits for and joins the sender threads and then
// the receiver threads, and logs a summary of sent, complete, incomplete and missing events.
// NOTE(review): the bodies of the wait loops are not visible in this listing.
258 void artdaqtest::BrokenTransferTest::stop_test_()
260 TLOG(TLVL_STOP_TEST) <<
"stop_test_ BEGIN";
// Record the end time before raising the flag; sequence_id_target_ uses test_end_time_
// once test_end_requested_ is set.
261 test_end_time_ = std::chrono::steady_clock::now();
262 test_end_requested_ =
true;
264 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Waiting for sender threads to shut down";
// Senders clear their ready flag as the last step before logging DONE.
265 while (sender_ready_[0] || sender_ready_[1])
270 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Joining sender threads";
271 if (sender_threads_[0].joinable())
273 sender_threads_[0].join();
275 if (sender_threads_[1].joinable())
277 sender_threads_[1].join();
280 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Waiting for receiver threads to shut down";
281 while (receiver_ready_[0] || receiver_ready_[1])
286 TLOG(TLVL_STOP_TEST) <<
"stop_test_: Joining receiver threads";
287 if (receiver_threads_[0].joinable())
289 receiver_threads_[0].join();
291 if (receiver_threads_[1].joinable())
293 receiver_threads_[1].join();
296 TLOG(TLVL_INFO) <<
"Sent " << sender_current_fragment_[0] <<
" events from rank 0 and " << sender_current_fragment_[1] <<
" events from rank 1.";
// The expected event count is the larger of the two per-sender fragment counters.
298 artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
299 if (sender_current_fragment_[1] > expected_events)
301 expected_events = sender_current_fragment_[1];
304 auto complete_events = complete_events_.size();
305 auto incomplete_events = timeout_events_.size();
// NOTE(review): if these operands are unsigned, this subtraction wraps to a very large
// value whenever complete + incomplete exceeds expected - confirm that cannot happen.
306 auto missing_events = expected_events - complete_events - incomplete_events;
308 TLOG(TLVL_INFO) <<
"Received " << complete_events <<
" complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_),
"s")
309 <<
", Incomplete: " << incomplete_events <<
", Missing: " << missing_events;
310 TLOG(TLVL_STOP_TEST) <<
"stop_test_ END";
// Sender thread body: produces fragments for `sender_rank` until the test has been asked to
// end and the sender has reached sequence_id_target_().
// sender_rank - index into sender_ready_, sender_current_fragment_ and sender_tokens_;
//               also used as the fragment ID of every fragment created here.
// NOTE(review): creation of theTransfer, the actual send call, and the bodies of several
// branches are not visible in this listing; comments below describe only what is shown.
313 void artdaqtest::BrokenTransferTest::do_sending_(
int sender_rank)
318 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" setting sender_ready_";
// start_test_ waits on this flag before it returns.
319 sender_ready_[sender_rank] =
true;
// Keep going while behind the rate-derived target OR while no end has been requested.
321 while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
// Only sender 0 reacts to the kill flag (branch body not visible).
323 if (sender_rank == 0 && kill_first_sender_)
// Only sender 0 can be paused; it spins, yielding the CPU, until the flag clears.
327 while (sender_rank == 0 && pause_first_sender_)
329 std::this_thread::yield();
// Build the next fragment: sequence ID is this sender's counter, fragment ID is its rank.
333 artdaq::Fragment frag(fragment_size_);
334 frag.setSequenceID(sender_current_fragment_[sender_rank]);
335 frag.setFragmentID(sender_rank);
336 frag.setSystemType(artdaq::Fragment::DataFragmentType);
338 auto start_time = std::chrono::steady_clock::now();
// A token is needed before sending; with none available, poll every 10 ms until one
// arrives or the test end is requested.
341 if (sender_tokens_[sender_rank].load() == 0)
343 TLOG(TLVL_SENDER_TOKEN_WAIT) <<
"Sender " << sender_rank <<
" waiting for token from receiver";
344 while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
345 if (test_end_requested_)
349 TLOG(TLVL_SENDER_TOKEN_WAIT) <<
"Sender " << sender_rank <<
" waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time),
"s") <<
" for token from receiver";
363 TLOG(TLVL_ERROR) <<
"Error sending Fragment " << sender_current_fragment_[sender_rank] <<
" from sender rank " << sender_rank <<
": "
// Duration is measured from start_time, so it includes any time spent waiting for a token.
366 auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
367 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" Transferred Fragment " << sender_current_fragment_[sender_rank]
368 <<
" with size " << fragment_size_ <<
" words in " << fm_(duration,
"s")
369 <<
" (approx " << fm_(static_cast<double>(fragment_size_ *
sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration,
"B/s")
370 <<
") throttle " << send_throttle_us_;
// Advance to the next sequence ID, consume one token, then apply the optional throttle.
371 ++sender_current_fragment_[sender_rank];
372 sender_tokens_[sender_rank]--;
373 throttle_sender_(sender_rank);
376 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" shutting down...";
// Release the transfer object before clearing the ready flag that stop_test_ polls.
377 theTransfer.reset(
nullptr);
378 sender_ready_[sender_rank] =
false;
379 TLOG(TLVL_SENDER) <<
"Sender " << sender_rank <<
" DONE";
// Receiver thread body for fragments coming from `sender_rank`.
// sender_rank   - index into receiver_ready_ and sender_tokens_; compared against `rank`
// receiver_rank - used in the log messages shown here
// Visible behaviour: marks itself ready, sets the sender's token count to
// event_buffer_count_, expires events older than event_buffer_timeout_us_ into
// timeout_events_, allocates storage for incoming fragments in event_buffer_, and moves
// finished events into complete_events_.
// NOTE(review): creation of theTransfer, the header/data receive calls, the declaration of
// `rank`, and several branch bodies are not visible in this listing.
382 void artdaqtest::BrokenTransferTest::do_receiving_(
int sender_rank,
int receiver_rank)
384 std::unique_ptr<artdaq::TransferInterface> theTransfer =
// Scratch fragment used as the destination for data of events that already timed out.
387 artdaq::FragmentPtr dropFrag =
nullptr;
389 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" setting receiver_ready_";
390 receiver_ready_[sender_rank] =
true;
// Set the sender's token count to the event buffer depth.
391 sender_tokens_[sender_rank] = event_buffer_count_;
// Run until the buffer is empty, the test end was requested, and both senders are done.
393 while (!event_buffer_.empty() || !test_end_requested_ || sender_ready_[0] || sender_ready_[1])
// While paused, spin and yield without receiving anything.
399 while (pause_receiver_)
401 std::this_thread::yield();
405 artdaq::detail::RawFragmentHeader hdr;
410 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
// Wait up to 10 ms for a completion notification before scanning for timed-out events.
413 event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
// Sweep the buffer: events open longer than event_buffer_timeout_us_ are recorded in
// timeout_events_ and erased.
415 auto it = event_buffer_.begin();
416 while (it != event_buffer_.end())
418 if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
420 TLOG(TLVL_WARNING) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
": Event " << it->first
421 <<
" has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) <<
" s, removing...";
422 timeout_events_.insert(it->first);
423 it = event_buffer_.erase(it);
// The enclosing do/while repeats as long as the buffer holds more than event_buffer_count_.
432 }
while (event_buffer_.size() > event_buffer_count_);
435 if (rank != sender_rank)
440 artdaq::RawDataType* ptr =
nullptr;
443 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
// Fragments of an already timed-out event are received into dropFrag, which is
// (re)allocated only when absent or smaller than the incoming word count.
444 if (timeout_events_.count(hdr.sequence_id) != 0u)
446 TLOG(TLVL_WARNING) <<
"Event " << hdr.sequence_id <<
" has timed out, discarding";
447 if (!dropFrag || dropFrag->size() < hdr.word_count)
449 dropFrag = std::make_unique<artdaq::Fragment>(hdr.word_count - hdr.num_words());
451 ptr = dropFrag->headerAddress() + hdr.num_words();
// First fragment seen for this sequence ID: open the event and allocate first_frag.
455 if (event_buffer_.count(hdr.sequence_id) == 0u)
457 event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
458 event_buffer_[hdr.sequence_id].first_frag.reset(
new artdaq::Fragment(hdr.word_count - hdr.num_words()));
459 ptr = event_buffer_[hdr.sequence_id].first_frag->headerAddress() + hdr.num_words();
460 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" opened event " << hdr.sequence_id
461 <<
" with Fragment from rank " << sender_rank;
// Presumably the else branch (event already open): allocate second_frag - confirm.
465 event_buffer_[hdr.sequence_id].second_frag.reset(
new artdaq::Fragment(hdr.word_count - hdr.num_words()));
466 ptr = event_buffer_[hdr.sequence_id].second_frag->headerAddress() + hdr.num_words();
473 if (rank != sender_rank)
475 TLOG(TLVL_ERROR) <<
"Error receiving Fragment data after header received successfully!";
// NOTE(review): this log statement reads event_buffer_ and the lock is acquired on the next
// visible line; whether an outer lock is already held cannot be told from this listing.
481 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" completed event " << hdr.sequence_id
482 <<
" in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time),
"s") <<
".";
// Record the completed event, drop it from the buffer, and notify one waiter on
// event_buffer_cv_.
484 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
485 complete_events_.insert(hdr.sequence_id);
486 event_buffer_.erase(hdr.sequence_id);
487 event_buffer_cv_.notify_one();
493 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" shutting down...";
// The transfer object is destroyed while holding event_buffer_mutex_; the ready flag that
// stop_test_ polls is cleared afterwards.
496 std::lock_guard<std::mutex> lk(event_buffer_mutex_);
497 theTransfer.reset(
nullptr);
498 receiver_ready_[sender_rank] =
false;
499 TLOG(TLVL_RECEIVER) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" DONE";
502 void artdaqtest::BrokenTransferTest::throttle_sender_(
int sender_rank)
504 if (send_throttle_us_ != 0 && sender_current_fragment_[sender_rank] >= sequence_id_target_() - fragment_rate_hz_)
506 usleep(send_throttle_us_);
// Computes the sequence ID the senders should have reached by now:
// 1 + elapsed microseconds since test_start_time_ * fragment_rate_hz_ / 1000000.
// Once test_end_requested_ is set, the elapsed time is measured up to test_end_time_
// instead of "now", so the target stops advancing.
// NOTE(review): the return statement is not visible in this listing; presumably `ret` is
// returned - confirm against the original file.
510 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
512 auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
513 if (test_end_requested_)
515 ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_, test_end_time_) * fragment_rate_hz_ / 1000000);
// Formats `data` followed by `units`, rescaling by factors of 1000 so the printed number
// falls between 1 and 1000 where possible.
// data  - value to format
// units - unit string passed through unchanged to the recursive calls
// logt  - current power-of-1000 exponent; recursion stops once it reaches -3 or 3
// NOTE(review): the declaration of the stream `o`, the choice of unit prefix for each logt,
// and the return statement are not visible in this listing.
521 std::string artdaqtest::BrokenTransferTest::fm_(
double data,
const std::string& units,
int logt)
// Values below 1 (including zero and negatives) are scaled up until logt reaches -3.
523 if (data < 1 && logt > -3)
525 return fm_(data * 1000, units, logt - 1);
// Values above 1000 are scaled down until logt reaches 3.
527 if (data > 1000 && logt < 3)
529 return fm_(data / 1000, units, logt + 1);
// Fixed notation with two decimal places, followed by a space.
533 o << std::fixed << std::setprecision(2) << data <<
" ";
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(const fhicl::ParameterSet &ps)
BrokenTransferTest Constructor
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
void TestReceiverReconnect(int send_throttle_factor=0)
Run the "Receiver Reconnect" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test