1 #include "BrokenTransferTest.hh"
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
8 #include "artdaq/DAQdata/Globals.hh"
9 #define TRACE_NAME "BrokenTransferTest"
// Member-initializer list and rate sanity check of the BrokenTransferTest constructor.
// NOTE(review): the constructor's signature line and braces are not part of this listing.
// Values read from the ParameterSet `ps` (key -> default):
//   fragment_rate_hz -> 10, reliable_mode -> true, fragment_size -> 0x10000,
//   send_timeout_us -> 100000, transfer_buffer_count -> 10,
//   event_buffer_count -> 20, event_buffer_timeout_us -> 1000000.
// The pause/kill/end-requested flags start out false and send_throttle_us_ starts at 0.
14 , sender_current_fragment_()
16 , test_start_time_(std::chrono::steady_clock::now())
17 , test_end_time_(std::chrono::steady_clock::now())
18 , test_end_requested_(false)
19 , fragment_rate_hz_(ps.get<size_t>(
"fragment_rate_hz", 10))
20 , pause_first_sender_(false)
21 , pause_receiver_(false)
22 , kill_first_sender_(false)
23 , kill_receiver_(false)
24 , reliable_mode_(ps.get<bool>(
"reliable_mode", true))
25 , fragment_size_(ps.get<size_t>(
"fragment_size", 0x10000))
26 , send_timeout_us_(ps.get<size_t>(
"send_timeout_us", 100000))
27 , transfer_buffer_count_(ps.get<size_t>(
"transfer_buffer_count", 10))
28 , event_buffer_count_(ps.get<size_t>(
"event_buffer_count", 20))
29 , event_buffer_timeout_us_(ps.get<size_t>(
"event_buffer_timeout_us", 1000000))
30 , send_throttle_us_(0)
// Reject out-of-range rates: 0 Hz is replaced by 1 Hz, anything above 100000 Hz by 1000 Hz.
// The warning is logged before the assignment so it still shows the rejected value.
32 if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
34 TLOG(TLVL_WARNING) <<
"Invalid rate " << fragment_rate_hz_ <<
" Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) <<
" Hz";
35 fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
// Body of the "Sender Paused" test: raises pause_first_sender_ for a while, then clears it,
// sleeping between the phases.
// NOTE(review): the function signature and any test start/stop calls are not visible in this listing.
41 TLOG(TLVL_INFO) <<
"TestSenderPause BEGIN";
43 usleep_for_n_buffer_epochs_(2);
45 TLOG(TLVL_INFO) <<
"Pausing First Sender";
46 pause_first_sender_ =
true;
47 usleep_for_n_buffer_epochs_(2);
// Extra sleep of twice the event-buffer timeout while the sender is paused
// (presumably so that events already opened by the other sender can time out — confirm).
48 usleep(2 * event_buffer_timeout_us_);
50 TLOG(TLVL_INFO) <<
"Resuming First Sender";
51 pause_first_sender_ =
false;
52 usleep_for_n_buffer_epochs_(2);
55 TLOG(TLVL_INFO) <<
"TestSenderPause END";
// Body of the "Receiver Paused" test: raises pause_receiver_ for a while, then clears it,
// sleeping between the phases.
// NOTE(review): the function signature and any test start/stop calls are not visible in this listing.
60 TLOG(TLVL_INFO) <<
"TestReceiverPause BEGIN";
62 usleep_for_n_buffer_epochs_(2);
// NOTE(review): the log text below misspells "Receiver" as "Recevier".
64 TLOG(TLVL_INFO) <<
"Pausing Recevier";
65 pause_receiver_ =
true;
66 usleep_for_n_buffer_epochs_(2);
// Extra sleep of twice the event-buffer timeout while the receiver is paused.
67 usleep(2 * event_buffer_timeout_us_);
69 TLOG(TLVL_INFO) <<
"Resuming Receiver";
70 pause_receiver_ =
false;
71 usleep_for_n_buffer_epochs_(2);
74 TLOG(TLVL_INFO) <<
"TestReceiverPause END";
// Body of the "Sender Reconnect" test: stops sender thread 0 via kill_first_sender_,
// waits, then launches a fresh do_sending_(0) thread in the same slot.
// NOTE(review): the function signature, braces and the `try` keyword are not visible in this listing.
79 TLOG(TLVL_INFO) <<
"TestSenderReconnect BEGIN";
81 usleep_for_n_buffer_epochs_(2);
83 TLOG(TLVL_INFO) <<
"Killing first Sender";
// Raise the kill flag, wait for the thread to exit, then clear the flag again
// so that the replacement thread started below does not see it.
84 kill_first_sender_ =
true;
85 if (sender_threads_[0].joinable())
87 sender_threads_[0].join();
89 kill_first_sender_ =
false;
91 usleep_for_n_buffer_epochs_(2);
92 usleep(2 * event_buffer_timeout_us_);
94 TLOG(TLVL_INFO) <<
"Restarting First Sender";
// The replacement thread is created with an explicit stack size of 4096 * 2000.
95 boost::thread::attributes attrs;
96 attrs.set_stack_size(4096 * 2000);
99 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
// Thread-creation failures are reported both through TRACE and on std::cerr, together with errno.
101 catch (
const boost::exception& e)
103 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
104 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
108 usleep_for_n_buffer_epochs_(2);
111 TLOG(TLVL_INFO) <<
"TestSenderReconnect END";
// Body of the "Receiver Reconnect" test: stops both receiver threads via kill_receiver_,
// waits, then launches fresh do_receiving_ threads in the same slots.
// NOTE(review): the function signature, braces and the `try` keyword are not visible in this listing.
116 TLOG(TLVL_INFO) <<
"TestReceiverReconnect BEGIN";
// Store the caller-supplied throttle; do_sending_ sleeps this long after each fragment when it is non-zero.
117 send_throttle_us_ = send_throttle_us;
119 usleep_for_n_buffer_epochs_(2);
121 TLOG(TLVL_INFO) <<
"Killing Receiver";
// Raise the kill flag, join both receiver threads, then clear the flag for the replacements.
122 kill_receiver_ =
true;
123 if (receiver_threads_[0].joinable())
125 receiver_threads_[0].join();
127 if (receiver_threads_[1].joinable())
129 receiver_threads_[1].join();
131 kill_receiver_ =
false;
133 usleep_for_n_buffer_epochs_(2);
134 usleep(2 * event_buffer_timeout_us_);
136 TLOG(TLVL_INFO) <<
"Restarting Receiver";
// Replacement threads use an explicit stack size of 4096 * 2000; one receiver per
// sender rank (0 and 1), both with receiver rank 2.
137 boost::thread::attributes attrs;
138 attrs.set_stack_size(4096 * 2000);
141 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
142 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
// Thread-creation failures are reported both through TRACE and on std::cerr, together with errno.
144 catch (
const boost::exception& e)
146 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
147 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
151 usleep_for_n_buffer_epochs_(2);
154 TLOG(TLVL_INFO) <<
"TestReceiverReconnect END";
// Build the ParameterSet used to configure one transfer between sender_rank and receiver_rank.
// The per-transfer settings are nested under the key `name` inside a fresh outer ParameterSet.
// NOTE(review): the return statement is not visible in this listing; presumably outputPs is returned.
157 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(
int sender_rank,
int receiver_rank,
const std::string& name)
// Start from the optional "default_transfer_ps" table (empty if absent) ...
159 auto thePs = ps_.get<fhicl::ParameterSet>(
"default_transfer_ps", fhicl::ParameterSet());
// ... then force the plugin type ("transfer_to_use", default "Shmem"), ranks and buffer count.
161 thePs.put_or_replace(
"transferPluginType", ps_.get<std::string>(
"transfer_to_use",
"Shmem"));
162 thePs.put_or_replace(
"destination_rank", receiver_rank);
163 thePs.put_or_replace(
"source_rank", sender_rank);
164 thePs.put_or_replace(
"buffer_count", transfer_buffer_count_);
// Only supply a maximum fragment size when the defaults did not already specify one:
// configured fragment size plus the raw header word count plus one.
165 if (!thePs.has_key(
"max_fragment_size_words"))
167 thePs.put(
"max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
169 fhicl::ParameterSet outputPs;
171 TLOG(TLVL_INFO) <<
"Configuring transfer between " << sender_rank <<
" and " << receiver_rank <<
" with ParameterSet: " << thePs.to_string();
173 outputPs.put(name, thePs);
// Reset all per-test state, then start the two receiver threads followed by the two sender threads,
// waiting after each pair until both of its ready flags are set.
// NOTE(review): braces, the `try` keywords and the bodies of the two wait loops are not visible in this listing.
177 void artdaqtest::BrokenTransferTest::start_test_()
179 TLOG(TLVL_DEBUG) <<
"start_test_ BEGIN";
// Clear ready flags and fragment counters for both ranks.
181 sender_ready_[0] =
false;
182 sender_ready_[1] =
false;
184 receiver_ready_[0] =
false;
185 receiver_ready_[1] =
false;
187 sender_current_fragment_[0] = 0;
188 sender_current_fragment_[1] = 0;
// Both timestamps are set to "now"; test_end_time_ is overwritten again in stop_test_.
190 test_start_time_ = std::chrono::steady_clock::now();
191 test_end_time_ = std::chrono::steady_clock::now();
// Clear every control flag that a previous test may have left set.
193 test_end_requested_ =
false;
194 pause_first_sender_ =
false;
195 pause_receiver_ =
false;
196 kill_first_sender_ =
false;
197 kill_receiver_ =
false;
// Forget events recorded by a previous test.
199 event_buffer_.clear();
200 complete_events_.clear();
201 timeout_events_.clear();
203 TLOG(TLVL_DEBUG) <<
"start_test_: Starting receiver threads";
// All four threads share these attributes (explicit stack size of 4096 * 2000).
204 boost::thread::attributes attrs;
205 attrs.set_stack_size(4096 * 2000);
// One receiver thread per sender rank (0 and 1), both with receiver rank 2.
208 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
209 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
211 catch (
const boost::exception& e)
213 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
214 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
218 TLOG(TLVL_DEBUG) <<
"start_test_: Waiting for receiver_ready_";
// Senders are only started once both receivers have reported ready.
219 while (!receiver_ready_[0] || !receiver_ready_[1])
224 TLOG(TLVL_DEBUG) <<
"start_test_: Starting sender threads";
227 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
228 sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 1));
230 catch (
const boost::exception& e)
232 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
233 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
237 TLOG(TLVL_DEBUG) <<
"start_test_: Waiting for sender_ready_";
238 while (!sender_ready_[0] || !sender_ready_[1])
243 TLOG(TLVL_DEBUG) <<
"start_test_ DONE";
// Request the end of the test, wait for and join the sender threads and then the receiver threads,
// and log a summary of sent, complete, incomplete and missing events.
// NOTE(review): braces and the bodies of the two wait loops are not visible in this listing.
246 void artdaqtest::BrokenTransferTest::stop_test_()
248 TLOG(TLVL_DEBUG) <<
"stop_test_ BEGIN";
// Record the end time before raising the flag; sequence_id_target_ reads test_end_time_
// once test_end_requested_ is set.
249 test_end_time_ = std::chrono::steady_clock::now();
250 test_end_requested_ =
true;
252 TLOG(TLVL_DEBUG) <<
"stop_test_: Waiting for sender threads to shut down";
// Each sender clears its ready flag at the end of do_sending_.
253 while (sender_ready_[0] || sender_ready_[1])
258 TLOG(TLVL_DEBUG) <<
"stop_test_: Joining sender threads";
259 if (sender_threads_[0].joinable())
261 sender_threads_[0].join();
263 if (sender_threads_[1].joinable())
265 sender_threads_[1].join();
268 TLOG(TLVL_DEBUG) <<
"stop_test_: Waiting for receiver threads to shut down";
// Each receiver clears its ready flag at the end of do_receiving_.
269 while (receiver_ready_[0] || receiver_ready_[1])
274 TLOG(TLVL_DEBUG) <<
"stop_test_: Joining receiver threads";
275 if (receiver_threads_[0].joinable())
277 receiver_threads_[0].join();
279 if (receiver_threads_[1].joinable())
281 receiver_threads_[1].join();
284 TLOG(TLVL_INFO) <<
"Sent " << sender_current_fragment_[0] <<
" events from rank 0 and " << sender_current_fragment_[1] <<
" events from rank 1.";
// The expected event count is the larger of the two senders' fragment counters.
286 artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
287 if (sender_current_fragment_[1] > expected_events)
289 expected_events = sender_current_fragment_[1];
292 auto complete_events = complete_events_.size();
293 auto incomplete_events = timeout_events_.size();
// NOTE(review): if these operands are unsigned, this subtraction wraps around whenever
// complete + incomplete exceeds expected_events — confirm that cannot happen.
294 auto missing_events = expected_events - complete_events - incomplete_events;
296 TLOG(TLVL_INFO) <<
"Received " << complete_events <<
" complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_),
"s")
297 <<
", Incomplete: " << incomplete_events <<
", Missing: " << missing_events;
298 TLOG(TLVL_DEBUG) <<
"stop_test_ END";
// Sender thread body for the given sender_rank: marks itself ready, then repeatedly builds a
// Fragment, waits for a token if none are left, sends, and advances the fragment counter.
// NOTE(review): the creation of theTransfer, the actual transfer call, braces and the
// break/return statements are not visible in this listing.
301 void artdaqtest::BrokenTransferTest::do_sending_(
int sender_rank)
306 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" setting sender_ready_";
307 sender_ready_[sender_rank] =
true;
// Keep going while the test is running, or while this sender is still behind the target sequence ID.
309 while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
// Only rank 0 reacts to the kill and pause flags.
311 if (sender_rank == 0 && kill_first_sender_)
// While paused, spin and yield the CPU until the flag is cleared.
315 while (sender_rank == 0 && pause_first_sender_)
317 std::this_thread::yield();
// The sequence ID is this sender's fragment counter; the fragment ID is the sender rank.
321 artdaq::Fragment frag(fragment_size_);
322 frag.setSequenceID(sender_current_fragment_[sender_rank]);
323 frag.setFragmentID(sender_rank);
324 frag.setSystemType(artdaq::Fragment::DataFragmentType);
// start_time feeds both the token-wait message and the transfer-duration message below.
326 auto start_time = std::chrono::steady_clock::now();
// No tokens left: poll every 10000 us until the token count becomes non-zero or the test is ending.
329 if (sender_tokens_[sender_rank].load() == 0)
331 TLOG(TLVL_INFO) <<
"Sender " << sender_rank <<
" waiting for token from receiver";
332 while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
333 if (test_end_requested_)
337 TLOG(TLVL_INFO) <<
"Sender " << sender_rank <<
" waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time),
"s") <<
" for token from receiver";
351 TLOG(TLVL_ERROR) <<
"Error sending Fragment " << sender_current_fragment_[sender_rank] <<
" from sender rank " << sender_rank <<
": "
354 auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
// The reported rate is fragment_size_ words times the size of one raw data word, divided by the duration.
355 TLOG(TLVL_TRACE) <<
"Sender " << sender_rank <<
" Transferred Fragment " << sender_current_fragment_[sender_rank]
356 <<
" with size " << fragment_size_ <<
" words in " << fm_(duration,
"s")
357 <<
" (approx " << fm_(static_cast<double>(fragment_size_ *
sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration,
"B/s")
358 <<
") throttle " << send_throttle_us_;
// Advance to the next sequence ID and use up one token.
359 ++sender_current_fragment_[sender_rank];
360 sender_tokens_[sender_rank]--;
// Optional throttle, set by TestReceiverReconnect.
361 if (send_throttle_us_ != 0)
363 usleep(send_throttle_us_);
367 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" shutting down...";
// Release the transfer object before clearing the ready flag that stop_test_ polls.
368 theTransfer.reset(
nullptr);
369 sender_ready_[sender_rank] =
false;
370 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" DONE";
// Receiver thread body for one sender_rank -> receiver_rank link: marks itself ready, grants the
// sender its tokens, then loops expiring stale events and receiving fragments into event_buffer_.
// NOTE(review): the transfer-plugin creation, the receive calls, the `do` keyword, braces and
// several branch statements are not visible in this listing.
373 void artdaqtest::BrokenTransferTest::do_receiving_(
int sender_rank,
int receiver_rank)
375 std::unique_ptr<artdaq::TransferInterface> theTransfer =
// Scratch fragment that receives the data of events that have already timed out.
378 artdaq::FragmentPtr dropFrag =
nullptr;
380 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" setting receiver_ready_";
381 receiver_ready_[sender_rank] =
true;
// Give the matching sender one token per event-buffer slot.
382 sender_tokens_[sender_rank] = event_buffer_count_;
// Run until the test has ended, both senders have stopped and the event buffer has drained.
384 while (!event_buffer_.empty() || !test_end_requested_ || sender_ready_[0] || sender_ready_[1])
// While paused, spin and yield the CPU until the flag is cleared.
390 while (pause_receiver_)
392 std::this_thread::yield();
396 artdaq::detail::RawFragmentHeader hdr;
401 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
// Wait up to 10000 us for a notification, then sweep the buffer for events that are
// older than event_buffer_timeout_us_.
404 event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
406 auto it = event_buffer_.begin();
407 while (it != event_buffer_.end())
409 if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
411 TLOG(TLVL_WARNING) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
": Event " << it->first
412 <<
" has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) <<
" s, removing...";
// Remember the sequence ID so that late fragments for it can be discarded.
413 timeout_events_.insert(it->first);
414 it = event_buffer_.erase(it);
// The wait-and-sweep above repeats for as long as the buffer holds more than event_buffer_count_ events.
423 }
while (event_buffer_.size() > event_buffer_count_);
// NOTE(review): `rank` is assigned on a line not shown here; presumably it is the result of the header receive.
426 if (rank != sender_rank)
// ptr is set below to the location where this fragment's data is to be written.
431 artdaq::RawDataType* ptr =
nullptr;
434 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
// Fragment belongs to an event that already timed out: direct its data into dropFrag.
435 if (timeout_events_.count(hdr.sequence_id) != 0u)
437 TLOG(TLVL_WARNING) <<
"Event " << hdr.sequence_id <<
" has timed out, discarding";
// (Re)allocate dropFrag only when it is missing or too small for this fragment.
438 if (!dropFrag || dropFrag->size() < hdr.word_count)
440 dropFrag = std::make_unique<artdaq::Fragment>(hdr.word_count - hdr.num_words());
442 ptr = dropFrag->headerAddress() + hdr.num_words();
// First fragment seen for this sequence ID: open the event and store it as first_frag.
446 if (event_buffer_.count(hdr.sequence_id) == 0u)
448 event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
449 event_buffer_[hdr.sequence_id].first_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
450 ptr = event_buffer_[hdr.sequence_id].first_frag.headerAddress() + hdr.num_words();
451 TLOG(TLVL_TRACE) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" opened event " << hdr.sequence_id
452 <<
" with Fragment from rank " << sender_rank;
// Otherwise (branch keyword not shown) the fragment is stored as second_frag.
456 event_buffer_[hdr.sequence_id].second_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
457 ptr = event_buffer_[hdr.sequence_id].second_frag.headerAddress() + hdr.num_words();
464 if (rank != sender_rank)
466 TLOG(TLVL_ERROR) <<
"Error receiving Fragment data after header received successfully!";
// NOTE(review): this message reads event_buffer_ before the lock on the next statement is taken;
// confirm whether an enclosing lock (not shown) covers it, otherwise it races with the other receiver thread.
472 TLOG(TLVL_TRACE) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" completed event " << hdr.sequence_id
473 <<
" in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time),
"s") <<
".";
// Record the event as complete, remove it from the buffer and notify one waiter on event_buffer_cv_.
475 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
476 complete_events_.insert(hdr.sequence_id);
477 event_buffer_.erase(hdr.sequence_id);
478 event_buffer_cv_.notify_one();
484 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" shutting down...";
// Shutdown happens under the event-buffer mutex: release the transfer object, then clear the
// ready flag that stop_test_ polls.
487 std::lock_guard<std::mutex> lk(event_buffer_mutex_);
488 theTransfer.reset(
nullptr);
489 receiver_ready_[sender_rank] =
false;
490 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" DONE";
// Compute the sequence ID the senders should have reached by now:
// 1 + (elapsed microseconds * fragment_rate_hz_ / 1000000), using integer arithmetic.
// NOTE(review): once test_end_requested_ is set, the elapsed time is measured from test_end_time_
// rather than test_start_time_, so the target restarts near 1 after the test ends — confirm this is
// intended and not meant to be the start-to-end interval.
// NOTE(review): braces and the return statement are not visible in this listing.
493 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
495 auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
496 if (test_end_requested_)
498 ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_end_time_) * fragment_rate_hz_ / 1000000);
// Format `data` with two decimals, first rescaling it by factors of 1000 so the printed number
// falls between 1 and 1000 where possible. `logt` counts the rescaling steps (one per factor of
// 1000) and is limited to the range [-3, 3] by the two guards below.
// NOTE(review): the declaration of the stream `o`, the code that appends a prefix and `units`,
// and the return statement are not visible in this listing.
504 std::string artdaqtest::BrokenTransferTest::fm_(
double data,
const std::string& units,
int logt)
// Too small: scale up by 1000 and step the exponent down (stops at logt == -3).
506 if (data < 1 && logt > -3)
508 return fm_(data * 1000, units, logt - 1);
// Too large: scale down by 1000 and step the exponent up (stops at logt == 3).
510 if (data > 1000 && logt < 3)
512 return fm_(data / 1000, units, logt + 1);
516 o << std::fixed << std::setprecision(2) << data <<
" ";
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(const fhicl::ParameterSet &ps)
BrokenTransferTest Constructor
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
void TestReceiverReconnect(int send_throttle_us=0)
Run the "Receiver Reconnect" test
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test