1 #include "BrokenTransferTest.hh"
3 #include "artdaq-core/Data/detail/RawFragmentHeader.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "artdaq/DAQdata/Globals.hh"
8 #define TRACE_NAME "BrokenTransferTest"
13 , sender_current_fragment_()
15 , test_start_time_(std::chrono::steady_clock::now())
16 , test_end_time_(std::chrono::steady_clock::now())
17 , test_end_requested_(false)
18 , fragment_rate_hz_(ps.get<size_t>(
"fragment_rate_hz", 10))
19 , pause_first_sender_(false)
20 , pause_receiver_(false)
21 , kill_first_sender_(false)
22 , kill_receiver_(false)
23 , reliable_mode_(ps.get<bool>(
"reliable_mode", true))
24 , fragment_size_(ps.get<size_t>(
"fragment_size", 0x10000))
25 , send_timeout_us_(ps.get<size_t>(
"send_timeout_us", 100000))
29 , transfer_buffer_count_(ps.get<size_t>(
"transfer_buffer_count", 10))
30 , event_buffer_count_(ps.get<size_t>(
"event_buffer_count", 20))
31 , event_buffer_timeout_us_(ps.get<size_t>(
"event_buffer_timeout_us", 1000000))
32 , send_throttle_us_(0)
34 if (fragment_rate_hz_ == 0 || fragment_rate_hz_ > 100000)
36 TLOG(TLVL_WARNING) <<
"Invalid rate " << fragment_rate_hz_ <<
" Hz specified, setting to " << (fragment_rate_hz_ == 0 ? 1 : 1000) <<
" Hz";
37 fragment_rate_hz_ = (fragment_rate_hz_ == 0 ? 1 : 1000);
43 TLOG(TLVL_INFO) <<
"TestSenderPause BEGIN";
45 usleep_for_n_buffer_epochs_(2);
47 TLOG(TLVL_INFO) <<
"Pausing First Sender";
48 pause_first_sender_ =
true;
49 usleep_for_n_buffer_epochs_(2);
50 usleep(2 * event_buffer_timeout_us_);
52 TLOG(TLVL_INFO) <<
"Resuming First Sender";
53 pause_first_sender_ =
false;
54 usleep_for_n_buffer_epochs_(2);
57 TLOG(TLVL_INFO) <<
"TestSenderPause END";
62 TLOG(TLVL_INFO) <<
"TestReceiverPause BEGIN";
64 usleep_for_n_buffer_epochs_(2);
66 TLOG(TLVL_INFO) <<
"Pausing Recevier";
67 pause_receiver_ =
true;
68 usleep_for_n_buffer_epochs_(2);
69 usleep(2 * event_buffer_timeout_us_);
71 TLOG(TLVL_INFO) <<
"Resuming Receiver";
72 pause_receiver_ =
false;
73 usleep_for_n_buffer_epochs_(2);
76 TLOG(TLVL_INFO) <<
"TestReceiverPause END";
81 TLOG(TLVL_INFO) <<
"TestSenderReconnect BEGIN";
83 usleep_for_n_buffer_epochs_(2);
85 TLOG(TLVL_INFO) <<
"Killing first Sender";
86 kill_first_sender_ =
true;
87 if (sender_threads_[0].joinable()) sender_threads_[0].join();
88 kill_first_sender_ =
false;
90 usleep_for_n_buffer_epochs_(2);
91 usleep(2 * event_buffer_timeout_us_);
93 TLOG(TLVL_INFO) <<
"Restarting First Sender";
94 boost::thread::attributes attrs;
95 attrs.set_stack_size(4096 * 2000);
98 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
100 catch (
const boost::exception& e)
102 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
103 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
107 usleep_for_n_buffer_epochs_(2);
110 TLOG(TLVL_INFO) <<
"TestSenderReconnect END";
115 TLOG(TLVL_INFO) <<
"TestReceiverReconnect BEGIN";
116 send_throttle_us_ = send_throttle_us;
118 usleep_for_n_buffer_epochs_(2);
120 TLOG(TLVL_INFO) <<
"Killing Receiver";
121 kill_receiver_ =
true;
122 if (receiver_threads_[0].joinable()) receiver_threads_[0].join();
123 if (receiver_threads_[1].joinable()) receiver_threads_[1].join();
124 kill_receiver_ =
false;
126 usleep_for_n_buffer_epochs_(2);
127 usleep(2 * event_buffer_timeout_us_);
129 TLOG(TLVL_INFO) <<
"Restarting Receiver";
130 boost::thread::attributes attrs;
131 attrs.set_stack_size(4096 * 2000);
134 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
135 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
137 catch (
const boost::exception& e)
139 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
140 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
144 usleep_for_n_buffer_epochs_(2);
147 TLOG(TLVL_INFO) <<
"TestReceiverReconnect END";
150 fhicl::ParameterSet artdaqtest::BrokenTransferTest::make_transfer_ps_(
int sender_rank,
int receiver_rank, std::string name)
152 fhicl::ParameterSet thePs = ps_.get<fhicl::ParameterSet>(
"default_transfer_ps", fhicl::ParameterSet());
154 thePs.put_or_replace(
"transferPluginType", ps_.get<std::string>(
"transfer_to_use",
"Shmem"));
155 thePs.put_or_replace(
"destination_rank", receiver_rank);
156 thePs.put_or_replace(
"source_rank", sender_rank);
157 thePs.put_or_replace(
"buffer_count", transfer_buffer_count_);
158 if (!thePs.has_key(
"max_fragment_size_words"))
160 thePs.put(
"max_fragment_size_words", fragment_size_ + artdaq::detail::RawFragmentHeader::num_words() + 1);
162 fhicl::ParameterSet outputPs;
164 TLOG(TLVL_INFO) <<
"Configuring transfer between " << sender_rank <<
" and " << receiver_rank <<
" with ParameterSet: " << thePs.to_string();
166 outputPs.put(name, thePs);
170 void artdaqtest::BrokenTransferTest::start_test_()
172 TLOG(TLVL_DEBUG) <<
"start_test_ BEGIN";
174 sender_ready_[0] =
false;
175 sender_ready_[1] =
false;
177 receiver_ready_[0] =
false;
178 receiver_ready_[1] =
false;
180 sender_current_fragment_[0] = 0;
181 sender_current_fragment_[1] = 0;
183 test_start_time_ = std::chrono::steady_clock::now();
184 test_end_time_ = std::chrono::steady_clock::now();
186 test_end_requested_ =
false;
187 pause_first_sender_ =
false;
188 pause_receiver_ =
false;
189 kill_first_sender_ =
false;
190 kill_receiver_ =
false;
192 event_buffer_.clear();
193 complete_events_.clear();
194 timeout_events_.clear();
196 TLOG(TLVL_DEBUG) <<
"start_test_: Starting receiver threads";
197 boost::thread::attributes attrs;
198 attrs.set_stack_size(4096 * 2000);
201 receiver_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 0, 2));
202 receiver_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_receiving_,
this, 1, 2));
204 catch (
const boost::exception& e)
206 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
207 std::cerr <<
"Caught boost::exception starting Receiver thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
211 TLOG(TLVL_DEBUG) <<
"start_test_: Waiting for receiver_ready_";
212 while (!receiver_ready_[0] || !receiver_ready_[1])
217 TLOG(TLVL_DEBUG) <<
"start_test_: Starting sender threads";
220 sender_threads_[0] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 0));
221 sender_threads_[1] = boost::thread(attrs, boost::bind(&BrokenTransferTest::do_sending_,
this, 1));
223 catch (
const boost::exception& e)
225 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
226 std::cerr <<
"Caught boost::exception starting Sender thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
230 TLOG(TLVL_DEBUG) <<
"start_test_: Waiting for sender_ready_";
231 while (!sender_ready_[0] || !sender_ready_[1])
236 TLOG(TLVL_DEBUG) <<
"start_test_ DONE";
239 void artdaqtest::BrokenTransferTest::stop_test_()
241 TLOG(TLVL_DEBUG) <<
"stop_test_ BEGIN";
242 test_end_time_ = std::chrono::steady_clock::now();
243 test_end_requested_ =
true;
245 TLOG(TLVL_DEBUG) <<
"stop_test_: Joining sender threads";
246 if (sender_threads_[0].joinable()) sender_threads_[0].join();
247 if (sender_threads_[1].joinable()) sender_threads_[1].join();
249 TLOG(TLVL_DEBUG) <<
"stop_test_: Joining receiver threads";
250 if (receiver_threads_[0].joinable()) receiver_threads_[0].join();
251 if (receiver_threads_[1].joinable()) receiver_threads_[1].join();
253 TLOG(TLVL_INFO) <<
"Sent " << sender_current_fragment_[0] <<
" events from rank 0 and " << sender_current_fragment_[1] <<
" events from rank 1.";
255 artdaq::Fragment::sequence_id_t expected_events = sender_current_fragment_[0];
256 if (sender_current_fragment_[1] > expected_events)
257 expected_events = sender_current_fragment_[1];
259 auto complete_events = complete_events_.size();
260 auto incomplete_events = timeout_events_.size();
261 auto missing_events = expected_events - complete_events - incomplete_events;
263 TLOG(TLVL_INFO) <<
"Received " << complete_events <<
" complete events in " << fm_(artdaq::TimeUtils::GetElapsedTime(test_start_time_),
"s")
264 <<
", Incomplete: " << incomplete_events <<
", Missing: " << missing_events;
265 TLOG(TLVL_DEBUG) <<
"stop_test_ END";
268 void artdaqtest::BrokenTransferTest::do_sending_(
int sender_rank)
273 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" setting sender_ready_";
274 sender_ready_[sender_rank] =
true;
276 while (sender_current_fragment_[sender_rank] < sequence_id_target_() || !test_end_requested_)
278 if (sender_rank == 0 && kill_first_sender_)
break;
279 while (sender_rank == 0 && pause_first_sender_)
281 std::this_thread::yield();
285 artdaq::Fragment frag(fragment_size_);
286 frag.setSequenceID(sender_current_fragment_[sender_rank]);
287 frag.setFragmentID(sender_rank);
288 frag.setSystemType(artdaq::Fragment::DataFragmentType);
290 auto start_time = std::chrono::steady_clock::now();
293 if (sender_tokens_[sender_rank].load() == 0)
295 TLOG(TLVL_INFO) <<
"Sender " << sender_rank <<
" waiting for token from receiver";
296 while (sender_tokens_[sender_rank].load() == 0 && !test_end_requested_) { usleep(10000); }
297 if (test_end_requested_)
continue;
298 TLOG(TLVL_INFO) <<
"Sender " << sender_rank <<
" waited " << fm_(artdaq::TimeUtils::GetElapsedTime(start_time),
"s") <<
" for token from receiver";
312 TLOG(TLVL_ERROR) <<
"Error sending Fragment " << sender_current_fragment_[sender_rank] <<
" from sender rank " << sender_rank <<
": "
315 auto duration = artdaq::TimeUtils::GetElapsedTime(start_time);
316 TLOG(TLVL_TRACE) <<
"Sender " << sender_rank <<
" Transferred Fragment " << sender_current_fragment_[sender_rank]
317 <<
" with size " << fragment_size_ <<
" words in " << fm_(duration,
"s")
318 <<
" (approx " << fm_(static_cast<double>(fragment_size_ *
sizeof(artdaq::detail::RawFragmentHeader::RawDataType)) / duration,
"B/s")
319 <<
") throttle " << send_throttle_us_;
320 ++sender_current_fragment_[sender_rank];
321 sender_tokens_[sender_rank]--;
322 if (send_throttle_us_)
323 usleep(send_throttle_us_);
326 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" shutting down...";
327 theTransfer.reset(
nullptr);
328 TLOG(TLVL_DEBUG) <<
"Sender " << sender_rank <<
" DONE";
331 void artdaqtest::BrokenTransferTest::do_receiving_(
int sender_rank,
int receiver_rank)
333 std::unique_ptr<artdaq::TransferInterface> theTransfer =
336 artdaq::FragmentPtr dropFrag =
nullptr;
338 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" setting receiver_ready_";
339 receiver_ready_[sender_rank] =
true;
340 sender_tokens_[sender_rank] = event_buffer_count_;
342 while (event_buffer_.size() > 0 || !test_end_requested_)
344 if (kill_receiver_)
break;
345 while (pause_receiver_)
347 std::this_thread::yield();
351 artdaq::detail::RawFragmentHeader hdr;
356 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
359 event_buffer_cv_.wait_for(lk, std::chrono::microseconds(10000));
361 auto it = event_buffer_.begin();
362 while (it != event_buffer_.end())
364 if (artdaq::TimeUtils::GetElapsedTimeMicroseconds(it->second.open_time) > event_buffer_timeout_us_)
366 TLOG(TLVL_WARNING) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
": Event " << it->first
367 <<
" has timed out after " << artdaq::TimeUtils::GetElapsedTime(it->second.open_time) <<
" s, removing...";
368 timeout_events_.insert(it->first);
369 it = event_buffer_.erase(it);
378 }
while (event_buffer_.size() > event_buffer_count_);
381 if (rank != sender_rank)
continue;
383 artdaq::RawDataType* ptr =
nullptr;
386 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
387 if (timeout_events_.count(hdr.sequence_id))
389 TLOG(TLVL_WARNING) <<
"Event " << hdr.sequence_id <<
" has timed out, discarding";
390 if (!dropFrag || dropFrag->size() < hdr.word_count)
392 dropFrag.reset(
new artdaq::Fragment(hdr.word_count - hdr.num_words()));
394 ptr = dropFrag->headerAddress() + hdr.num_words();
398 if (!event_buffer_.count(hdr.sequence_id))
400 event_buffer_[hdr.sequence_id].open_time = std::chrono::steady_clock::now();
401 event_buffer_[hdr.sequence_id].first_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
402 ptr = event_buffer_[hdr.sequence_id].first_frag.headerAddress() + hdr.num_words();
403 TLOG(TLVL_TRACE) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" opened event " << hdr.sequence_id
404 <<
" with Fragment from rank " << sender_rank;
408 event_buffer_[hdr.sequence_id].second_frag = artdaq::Fragment(hdr.word_count - hdr.num_words());
409 ptr = event_buffer_[hdr.sequence_id].second_frag.headerAddress() + hdr.num_words();
416 if (rank != sender_rank)
418 TLOG(TLVL_ERROR) <<
"Error receiving Fragment data after header received successfully!";
424 TLOG(TLVL_TRACE) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" completed event " << hdr.sequence_id
425 <<
" in " << fm_(artdaq::TimeUtils::GetElapsedTime(event_buffer_[hdr.sequence_id].open_time),
"s") <<
".";
427 std::unique_lock<std::mutex> lk(event_buffer_mutex_);
428 complete_events_.insert(hdr.sequence_id);
429 event_buffer_.erase(hdr.sequence_id);
430 event_buffer_cv_.notify_one();
436 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" shutting down...";
439 std::lock_guard<std::mutex> lk(event_buffer_mutex_);
440 theTransfer.reset(
nullptr);
441 TLOG(TLVL_DEBUG) <<
"Receiver " << sender_rank <<
"->" << receiver_rank <<
" DONE";
444 artdaq::Fragment::sequence_id_t artdaqtest::BrokenTransferTest::sequence_id_target_()
446 auto ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_start_time_) * fragment_rate_hz_ / 1000000);
447 if (test_end_requested_)
448 ret = 1 + (artdaq::TimeUtils::GetElapsedTimeMicroseconds(test_end_time_) * fragment_rate_hz_ / 1000000);
453 std::string artdaqtest::BrokenTransferTest::fm_(
double data, std::string units,
int logt)
455 if (data < 1 && logt > -3)
457 return fm_(data * 1000, units, logt - 1);
459 else if (data > 1000 && logt < 3)
461 return fm_(data / 1000, units, logt + 1);
465 o << std::fixed << std::setprecision(2) << data <<
" ";
virtual int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout)=0
Receive a Fragment Header from the transport mechanism.
BrokenTransferTest(fhicl::ParameterSet ps)
BrokenTransferTest Constructor
This TransferInterface is a Receiver.
virtual void flush_buffers()=0
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
static std::string CopyStatusToString(CopyStatus in)
Convert a CopyStatus variable to its string representation
virtual int receiveFragmentData(RawDataType *destination, size_t wordCount)=0
Receive the body of a Fragment to the given destination pointer.
void TestReceiverReconnect(int send_throttle_us=0)
Run the "Receiver Reconnect" test
This TransferInterface is a Sender.
virtual CopyStatus transfer_fragment_reliable_mode(artdaq::Fragment &&fragment)=0
Transfer a Fragment to the destination. This should be reliable, if the underlying transport mechanis...
Some error occurred, but no exception was thrown.
The send operation completed successfully.
virtual CopyStatus transfer_fragment_min_blocking_mode(artdaq::Fragment const &fragment, size_t send_timeout_usec)=0
Transfer a Fragment to the destination. May not necessarily be reliable, but will not block longer th...
void TestSenderReconnect()
Run the "Sender Reconnect" test
void TestSenderPause()
Run the "Sender Paused" test
Value to be returned upon receive timeout.
void TestReceiverPause()
Run the "Receiver Paused" test