1 #include "BrokenTransferTest.hh"
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "artdaq/DAQdata/Globals.hh"
8 #define TRACE_NAME "BrokenTransferTest"
// Constructor member-initializer list (the constructor header is not visible in
// this listing). Control flags start cleared, both timestamps start at "now",
// and the tunable parameters are read from `ps` with the defaults shown.
13 , sender_current_fragment_()
15 , test_start_time_(std::chrono::steady_clock::now())
16 , test_end_time_(std::chrono::steady_clock::now())
17 , test_end_requested_(false)
18 , fragment_rate_hz_(ps.get<size_t>(
"fragment_rate_hz", 10))
19 , pause_first_sender_(false)
20 , pause_receiver_(false)
21 , kill_first_sender_(false)
22 , kill_receiver_(false)
23 , reliable_mode_(ps.get<bool>(
"reliable_mode", true))
24 , fragment_size_(ps.get<size_t>(
"fragment_size", 0x10000))
25 , send_timeout_us_(ps.get<size_t>(
"send_timeout_us", 100000))
29 , transfer_buffer_count_(ps.get<size_t>(
"transfer_buffer_count", 10))
30 , event_buffer_count_(ps.get<size_t>(
"event_buffer_count", 20))
31 , event_buffer_timeout_us_(ps.get<size_t>(
"event_buffer_timeout_us", 1000000))
32 , send_throttle_us_(0)
// Reject out-of-range rates: 0 is replaced by 1 Hz and anything above
// 100000 Hz is replaced by 1000 Hz. The warning prints the same replacement
// value that is assigned afterwards.
34 if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
36 TLOG(TLVL_WARNING) <<
"Invalid rate " << fragment_rate_hz_ <<
" Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) <<
" Hz";
37 fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
// "Sender Paused" test body: let the system run, pause sender rank 0 for a
// while, resume it, and let the system run again.
43 TLOG(TLVL_INFO) <<
"TestSenderPause BEGIN";
// usleep_for_n_buffer_epochs_ is defined elsewhere; presumably it sleeps long
// enough for the given number of buffer cycles — TODO confirm.
45 usleep_for_n_buffer_epochs_(2);
// do_sending_ makes rank 0 yield in a loop for as long as this flag is set.
47 TLOG(TLVL_INFO) <<
"Pausing First Sender";
48 pause_first_sender_ =
true;
49 usleep_for_n_buffer_epochs_(2);
// Stay paused for an additional two event-buffer timeout periods.
50 usleep(2 * event_buffer_timeout_us_);
52 TLOG(TLVL_INFO) <<
"Resuming First Sender";
53 pause_first_sender_ =
false;
54 usleep_for_n_buffer_epochs_(2);
57 TLOG(TLVL_INFO) <<
"TestSenderPause END";
// "Receiver Paused" test body: let the system run, pause the receiver threads
// for a while, resume them, and let the system run again.
62 TLOG(TLVL_INFO) <<
"TestReceiverPause BEGIN";
// usleep_for_n_buffer_epochs_ is defined elsewhere; presumably it sleeps long
// enough for the given number of buffer cycles — TODO confirm.
64 usleep_for_n_buffer_epochs_(2);
// do_receiving_ yields in a loop for as long as this flag is set.
// NOTE(review): the log message below misspells "Receiver".
66 TLOG(TLVL_INFO) <<
"Pausing Recevier";
67 pause_receiver_ =
true;
68 usleep_for_n_buffer_epochs_(2);
// Stay paused for an additional two event-buffer timeout periods.
69 usleep(2 * event_buffer_timeout_us_);
71 TLOG(TLVL_INFO) <<
"Resuming Receiver";
72 pause_receiver_ =
false;
73 usleep_for_n_buffer_epochs_(2);
76 TLOG(TLVL_INFO) <<
"TestReceiverPause END";
// "Sender Reconnect" test body: stop the rank-0 sender thread, wait, then start
// a fresh rank-0 sender thread.
81 TLOG(TLVL_INFO) <<
"TestSenderReconnect BEGIN";
83 usleep_for_n_buffer_epochs_(2);
// do_sending_ breaks out of its loop for rank 0 when this flag is set; the
// flag is cleared again once the thread has been joined.
85 TLOG(TLVL_INFO) <<
"Killing first Sender";
86 kill_first_sender_ =
true;
87 if (sender_threads_[0].joinable()) sender_threads_[0].join();
88 kill_first_sender_ =
false;
// Leave the sender down for two buffer epochs plus two event-buffer timeouts.
90 usleep_for_n_buffer_epochs_(2);
91 usleep(2 * event_buffer_timeout_us_);
// Relaunch do_sending_(0) on a new thread with an explicit stack size of
// 4096 * 2000 bytes.
93 TLOG(TLVL_INFO) <<
"Restarting First Sender";
94 boost::thread::attributes attrs;
95 attrs.set_stack_size(4096 * 2000);
98 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
// Thread-start failures are reported both through TRACE and on stderr.
100 catch (
const boost::exception& e)
102 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
103 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
107 usleep_for_n_buffer_epochs_(2);
110 TLOG(TLVL_INFO) <<
"TestSenderReconnect END";
// "Receiver Reconnect" test body: stop both receiver threads, wait, then start
// fresh receiver threads.
115 TLOG(TLVL_INFO) <<
"TestReceiverReconnect BEGIN";
// Store the caller's throttle; do_sending_ sleeps this many microseconds after
// each fragment when the value is non-zero.
116 send_throttle_us_ = send_throttle_us;
118 usleep_for_n_buffer_epochs_(2);
// do_receiving_ breaks out of its loop when this flag is set; the flag is
// cleared again once both threads have been joined.
120 TLOG(TLVL_INFO) <<
"Killing Receiver";
121 kill_receiver_ =
true;
122 if (receiver_threads_[0].joinable()) receiver_threads_[0].join();
123 if (receiver_threads_[1].joinable()) receiver_threads_[1].join();
124 kill_receiver_ =
false;
// Leave the receivers down for two buffer epochs plus two event-buffer timeouts.
126 usleep_for_n_buffer_epochs_(2);
127 usleep(2 * event_buffer_timeout_us_);
// Relaunch do_receiving_ for sender ranks 0 and 1 (receiver rank 2) on new
// threads with an explicit stack size of 4096 * 2000 bytes.
129 TLOG(TLVL_INFO) <<
"Restarting Receiver";
130 boost::thread::attributes attrs;
131 attrs.set_stack_size(4096 * 2000);
134 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
135 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
// Thread-start failures are reported both through TRACE and on stderr.
137 catch (
const boost::exception& e)
139 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
140 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
144 usleep_for_n_buffer_epochs_(2);
147 TLOG(TLVL_INFO) <<
"TestReceiverReconnect END";
// Build the ParameterSet describing one transfer link.
//   sender_rank   - stored as "source_rank"
//   receiver_rank - stored as "destination_rank"
//   name          - key under which the link's settings are nested in outputPs
// The return statement is not visible in this listing; presumably outputPs is
// returned — TODO confirm.
150 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(
int sender_rank,
int receiver_rank, std::string name)
// Start from the optional "default_transfer_ps" table in ps_ (empty if absent).
152 fhicl::ParameterSet thePs = ps_.get<fhicl::ParameterSet>(
"default_transfer_ps", fhicl::ParameterSet());
// These four keys are always overwritten; the plugin type comes from
// "transfer_to_use" and defaults to "Shmem".
154 thePs.put_or_replace(
"transferPluginType", ps_.get<std::string>(
"transfer_to_use",
"Shmem"));
155 thePs.put_or_replace(
"destination_rank", receiver_rank);
156 thePs.put_or_replace(
"source_rank", sender_rank);
157 thePs.put_or_replace(
"buffer_count", transfer_buffer_count_);
// Only supply a maximum fragment size when the defaults did not provide one:
// payload words plus the fragment header words plus one.
158 if (!thePs.has_key(
"max_fragment_size_words"))
160 thePs.put(
"max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
162 fhicl::ParameterSet outputPs;
164 TLOG(TLVL_INFO) <<
"Configuring transfer between " << sender_rank <<
" and " << receiver_rank <<
" with ParameterSet: " << thePs.to_string();
// Nest the link settings under the caller-supplied name.
166 outputPs.put(name, thePs);
// Reset all per-test state, then start the two receiver threads followed by the
// two sender threads, waiting for each pair to report ready before continuing.
170 void artdaqtest::BrokenTransferTest::start_test_()
172 TLOG(TLVL_DEBUG) <<
"start_test_ BEGIN";
// Clear the ready flags and per-sender fragment counters.
174 sender_ready_[0] =
false;
175 sender_ready_[1] =
false;
177 receiver_ready_[0] =
false;
178 receiver_ready_[1] =
false;
180 sender_current_fragment_[0] = 0;
181 sender_current_fragment_[1] = 0;
// Restart the test clock and clear every control flag.
183 test_start_time_ = std::chrono::steady_clock::now();
184 test_end_time_ = std::chrono::steady_clock::now();
186 test_end_requested_ =
false;
187 pause_first_sender_ =
false;
188 pause_receiver_ =
false;
189 kill_first_sender_ =
false;
190 kill_receiver_ =
false;
// Drop any event bookkeeping left over from a previous test.
192 event_buffer_.clear();
193 complete_events_.clear();
194 timeout_events_.clear();
// Receivers are started first, one per sender rank (0 and 1), both using
// receiver rank 2 and a stack size of 4096 * 2000 bytes.
196 TLOG(TLVL_DEBUG) <<
"start_test_: Starting receiver threads";
197 boost::thread::attributes attrs;
198 attrs.set_stack_size(4096 * 2000);
201 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
202 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
// Thread-start failures are reported both through TRACE and on stderr.
204 catch (
const boost::exception& e)
206 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
207 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Block until both receivers have set their ready flag (loop body not visible
// in this listing).
211 TLOG(TLVL_DEBUG) <<
"start_test_: Waiting for receiver_ready_";
212 while (!receiver_ready_[0] || !receiver_ready_[1])
// Senders reuse the same thread attributes as the receivers.
217 TLOG(TLVL_DEBUG) <<
"start_test_: Starting sender threads";
220 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
221 sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 1));
223 catch (
const boost::exception& e)
225 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
226 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Block until both senders have set their ready flag (loop body not visible
// in this listing).
230 TLOG(TLVL_DEBUG) <<
"start_test_: Waiting for sender_ready_";
231 while (!sender_ready_[0] || !sender_ready_[1])
236 TLOG(TLVL_DEBUG) <<
"start_test_ DONE";
// Request the end of the test, wait for sender and receiver threads to finish,
// join them, and log how many events were complete, incomplete, or missing.
239 void artdaqtest::BrokenTransferTest::stop_test_()
241 TLOG(TLVL_DEBUG) <<
"stop_test_ BEGIN";
// Record the end time before raising the flag; sequence_id_target_ reads
// test_end_time_ once test_end_requested_ is set.
242 test_end_time_ = std::chrono::steady_clock::now();
243 test_end_requested_ =
true;
// Senders clear sender_ready_ as they shut down (loop body not visible in
// this listing).
245 TLOG(TLVL_DEBUG) <<
"stop_test_: Waiting for sender threads to shut down";
246 while (sender_ready_[0] || sender_ready_[1])
251 TLOG(TLVL_DEBUG) <<
"stop_test_: Joining sender threads";
252 if (sender_threads_[0].joinable()) sender_threads_[0].join();
253 if (sender_threads_[1].joinable()) sender_threads_[1].join();
// Receivers clear receiver_ready_ as they shut down.
255 TLOG(TLVL_DEBUG) <<
"stop_test_: Waiting for receiver threads to shut down";
256 while (receiver_ready_[0] || receiver_ready_[1]) {
260 TLOG(TLVL_DEBUG) <<
"stop_test_: Joining receiver threads";
261 if (receiver_threads_[0].joinable()) receiver_threads_[0].join();
262 if (receiver_threads_[1].joinable()) receiver_threads_[1].join();
264 TLOG(TLVL_INFO) <<
"Sent " << sender_current_fragment_[0] <<
" events from rank 0 and " << sender_current_fragment_[1] <<
" events from rank 1.";
// The expected event count is the larger of the two per-sender counters.
266 artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
267 if (sender_current_fragment_[1] > expected_events)
268 expected_events = sender_current_fragment_[1];
// NOTE(review): if these operands are unsigned, missing_events wraps to a
// huge value whenever complete + incomplete exceeds expected_events — confirm.
270 auto complete_events = complete_events_.size();
271 auto incomplete_events = timeout_events_.size();
272 auto missing_events = expected_events - complete_events - incomplete_events;
274 TLOG(TLVL_INFO) <<
"Received " << complete_events <<
" complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_),
"s")
275 <<
", Incomplete: " << incomplete_events <<
", Missing: " << missing_events;
276 TLOG(TLVL_DEBUG) <<
"stop_test_ END";
// Sender thread body for the given sender_rank. Builds fragments and sends them
// until the test ends, honouring the kill/pause flags (rank 0 only), the
// receiver-granted tokens, and the optional send throttle.
// The lines that create theTransfer and perform the send are not visible in
// this listing.
279 void artdaqtest::BrokenTransferTest::do_sending_(
int sender_rank)
284 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" setting sender_ready_";
285 sender_ready_[sender_rank] =
true;
// Keep going while the test is still running, or while this sender is behind
// the value returned by sequence_id_target_().
287 while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
// Only rank 0 reacts to the kill and pause flags.
289 if (sender_rank == 0 && kill_first_sender_)
break;
290 while (sender_rank == 0 && pause_first_sender_)
292 std::this_thread::yield();
// The fragment carries the current counter as its sequence ID and the sender
// rank as its fragment ID.
296 artdaq::Fragment frag(fragment_size_);
297 frag.setSequenceID(sender_current_fragment_[sender_rank]);
298 frag.setFragmentID(sender_rank);
299 frag.setSystemType(artdaq::Fragment::DataFragmentType);
301 auto start_time = std::chrono::steady_clock::now();
// With no tokens left, poll every 10 ms until one arrives or the test ends;
// if the test ended meanwhile, skip this iteration without sending.
304 if (sender_tokens_[sender_rank].load() == 0)
306 TLOG(TLVL_INFO) <<
"Sender " << sender_rank <<
" waiting for token from receiver";
307 while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
308 if (test_end_requested_)
continue;
309 TLOG(TLVL_INFO) <<
"Sender " << sender_rank <<
" waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time),
"s") <<
" for token from receiver";
323 TLOG(TLVL_ERROR) <<
"Error sending Fragment " << sender_current_fragment_[sender_rank] <<
" from sender rank " << sender_rank <<
": "
// Report elapsed time since start_time and an approximate rate computed as
// payload bytes (fragment_size_ words * word size) divided by that time.
326 auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
327 TLOG(TLVL_TRACE) <<
"Sender " << sender_rank <<
" Transferred Fragment " << sender_current_fragment_[sender_rank]
328 <<
" with size " << fragment_size_ <<
" words in " << fm_(duration,
"s")
329 <<
" (approx " << fm_(static_cast<double>(fragment_size_ *
sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration,
"B/s")
330 <<
") throttle " << send_throttle_us_;
// Advance to the next sequence ID, consume one token, and apply the throttle
// sleep when one is configured.
331 ++sender_current_fragment_[sender_rank];
332 sender_tokens_[sender_rank]--;
333 if (send_throttle_us_)
334 usleep(send_throttle_us_);
// Shutdown: release the transfer before clearing the ready flag that
// stop_test_ waits on.
337 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" shutting down...";
338 theTransfer.reset(
nullptr);
339 sender_ready_[sender_rank] =
false;
340 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" DONE";
// Receiver thread body for the link sender_rank -> receiver_rank. Receives
// fragments, pairs them into events in event_buffer_, and records each
// sequence ID as complete or timed out.
// The calls that create theTransfer and that receive the header and data are
// not visible in this listing.
343 void artdaqtest::BrokenTransferTest::do_receiving_(
int sender_rank,
int receiver_rank)
345 std::unique_ptr<artdaq::TransferInterface> theTransfer =
// Scratch fragment used as the destination for data of timed-out events.
348 artdaq::FragmentPtr dropFrag =
nullptr;
350 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" setting receiver_ready_";
351 receiver_ready_[sender_rank] =
true;
// Grant the matching sender one token per event-buffer slot.
352 sender_tokens_[sender_rank] = event_buffer_count_;
// Keep running while events are still buffered, the test has not ended, or
// either sender is still marked ready.
354 while (event_buffer_.size() > 0 || !test_end_requested_ || sender_ready_[0] || sender_ready_[1])
356 if (kill_receiver_)
break;
357 while (pause_receiver_)
359 std::this_thread::yield();
363 artdaq::detail::RawFragmentHeader hdr;
// Under the event-buffer lock: wait up to 10000 us on the condition variable,
// then move every event older than event_buffer_timeout_us_ from
// event_buffer_ into timeout_events_. The trailing `while` repeats this for
// as long as the buffer holds more than event_buffer_count_ events.
368 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
371 event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
373 auto it = event_buffer_.begin();
374 while (it != event_buffer_.end())
376 if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
378 TLOG(TLVL_WARNING) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
": Event " << it->first
379 <<
" has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) <<
" s, removing...";
380 timeout_events_.insert(it->first);
381 it = event_buffer_.erase(it);
390 }
while (event_buffer_.size() > event_buffer_count_);
// `rank` is assigned on a line not visible here; presumably it is the result
// of the header receive — TODO confirm. Any value other than sender_rank
// restarts the loop.
393 if (rank != sender_rank)
continue;
// Choose where the fragment payload will be written.
395 artdaq::RawDataType* ptr =
nullptr;
398 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
// Data for an already timed-out event goes into dropFrag, which is
// (re)allocated when absent or smaller than the incoming word count.
399 if (timeout_events_.count(hdr.sequence_id))
401 TLOG(TLVL_WARNING) <<
"Event " << hdr.sequence_id <<
" has timed out, discarding";
402 if (!dropFrag || dropFrag->size() < hdr.word_count)
404 dropFrag.reset(
new artdaq::Fragment(hdr.word_count - hdr.num_words()));
406 ptr = dropFrag->headerAddress() + hdr.num_words();
// The first fragment seen for a sequence ID opens the event and stamps its
// open_time.
410 if (!event_buffer_.count(hdr.sequence_id))
412 event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
413 event_buffer_[hdr.sequence_id].first_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
414 ptr = event_buffer_[hdr.sequence_id].first_frag.headerAddress() + hdr.num_words();
415 TLOG(TLVL_TRACE) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" opened event " << hdr.sequence_id
416 <<
" with Fragment from rank " << sender_rank;
// A fragment for an already-open event is stored as second_frag.
420 event_buffer_[hdr.sequence_id].second_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
421 ptr = event_buffer_[hdr.sequence_id].second_frag.headerAddress() + hdr.num_words();
428 if (rank != sender_rank)
430 TLOG(TLVL_ERROR) <<
"Error receiving Fragment data after header received successfully!";
// Completed event: record it, remove it from the buffer, and wake one
// waiter on the condition variable.
436 TLOG(TLVL_TRACE) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" completed event " << hdr.sequence_id
437 <<
" in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time),
"s") <<
".";
439 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
440 complete_events_.insert(hdr.sequence_id);
441 event_buffer_.erase(hdr.sequence_id);
442 event_buffer_cv_.notify_one();
// Shutdown: the transfer is released while holding the event-buffer lock,
// then the ready flag that stop_test_ waits on is cleared.
448 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" shutting down...";
451 std::lock_guard<std::mutex> lk(event_buffer_mutex_);
452 theTransfer.reset(
nullptr);
453 receiver_ready_[sender_rank] =
false;
454 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" DONE";
// Compute the sequence ID senders should have reached by now:
// 1 + elapsed microseconds * fragment_rate_hz_ / 1000000.
// The return statement is not visible in this listing; presumably `ret` is
// returned — TODO confirm.
457 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
// Default case: elapsed time is measured from test_start_time_.
459 auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
// Once the end has been requested, elapsed time is measured from
// test_end_time_ instead.
// NOTE(review): this measures time *since* the end request rather than test
// duration — confirm this is intended.
460 if (test_end_requested_)
461 ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_end_time_) * fragment_rate_hz_ / 1000000);
// Format `data` with two decimal places followed by a space, rescaling by
// factors of 1000 so the printed number falls between 1 and 1000 where possible.
//   data  - value to format
//   units - unit label, passed through unchanged on each recursive call
//   logt  - number of 1000x steps applied so far (negative after scaling up,
//           positive after scaling down); recursion stops at -3 and +3
// The code that turns logt into a unit prefix and appends `units` is not
// visible in this listing.
466 std::string artdaqtest::BrokenTransferTest::fm_(
double data, std::string units,
int logt)
// Too small: scale up by 1000 and step logt down.
468 if (data < 1 && logt > -3)
470 return fm_(data * 1000, units, logt - 1);
// Too large: scale down by 1000 and step logt up.
472 else if (data > 1000 && logt < 3)
474 return fm_(data / 1000, units, logt + 1);
478 o << std::fixed << std::setprecision(2) << data <<
" ";
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(fhicl::ParameterSet ps)
BrokenTransferTest Constructor
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
void TestReceiverReconnect(int send_throttle_us=0)
Run the "Receiver Reconnect" test
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test