1 #include "artdaq/DAQrate/TransferTest.hh"
3 #include "artdaq-core/Data/Fragment.hh"
4 #include "artdaq/DAQrate/DataSenderManager.hh"
5 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
7 #define TRACE_NAME "TransferTest"
8 #include "artdaq/DAQdata/Globals.hh"
13 : senders_(psi.get<int>(
"num_senders"))
14 , receivers_(psi.get<int>(
"num_receivers"))
15 , sending_threads_(psi.get<int>(
"sending_threads", 1))
16 , sends_each_sender_(psi.get<int>(
"sends_per_sender"))
17 , receives_each_receiver_(0)
18 , buffer_count_(psi.get<int>(
"buffer_count", 10))
19 , error_count_max_(psi.get<int>(
"max_errors_before_abort", 3))
20 , fragment_size_(psi.get<size_t>(
"fragment_size", 0x100000))
21 , validate_mode_(psi.get<bool>(
"validate_data_mode", false))
22 , partition_number_(psi.get<int>(
"partition_number", rand() % 0x7F))
24 TLOG(TLVL_DEBUG + 35) <<
"CONSTRUCTOR";
26 if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() *
sizeof(artdaq::RawDataType))
28 fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() *
sizeof(artdaq::RawDataType);
31 fhicl::ParameterSet metric_pset;
35 metric_pset = psi.get<fhicl::ParameterSet>(
"metrics");
42 std::string name =
"TransferTest" + std::to_string(my_rank);
43 metricMan->initialize(metric_pset, name);
44 metricMan->do_start();
49 auto type(psi.get<std::string>(
"transfer_plugin_type",
"Shmem"));
51 bool broadcast_mode = psi.get<
bool>(
"broadcast_sends",
false);
54 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_;
60 if (senders_ * sending_threads_ * sends_each_sender_ % receivers_ != 0)
62 TLOG(TLVL_DEBUG + 33) <<
"Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl;
63 while (senders_ * sends_each_sender_ % receivers_ != 0)
67 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_;
68 TLOG(TLVL_DEBUG + 33) <<
"sends_each_sender is now " << sends_each_sender_ << std::endl;
69 psi.put_or_replace(
"sends_per_sender", sends_each_sender_);
73 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_;
79 if (psi.has_key(
"hostmap"))
81 hostmap =
" host_map: @local::hostmap";
85 ss << psi.to_string() << std::endl;
88 for (
int ii = 0; ii < senders_; ++ii)
90 ss <<
"s" << ii <<
": { transferPluginType: " << type <<
" source_rank: " << ii <<
" max_fragment_size_words : " << fragment_size_ <<
" buffer_count : " << buffer_count_ <<
" partition_number : " << partition_number_ << hostmap <<
" }" << std::endl;
92 ss <<
"}" << std::endl
93 <<
" destinations: {";
94 for (
int jj = senders_; jj < senders_ + receivers_; ++jj)
96 ss <<
"d" << jj <<
": { transferPluginType: " << type <<
" destination_rank: " << jj <<
" max_fragment_size_words : " << fragment_size_ <<
" buffer_count : " << buffer_count_ <<
" partition_number : " << partition_number_ << hostmap <<
" }" << std::endl;
98 ss <<
"}" << std::endl;
100 ps_ = fhicl::ParameterSet::make(ss.str());
102 TLOG(TLVL_DEBUG + 32) <<
"Going to configure with ParameterSet: " << ps_.to_string() << std::endl;
107 TLOG(TLVL_INFO) <<
"runTest BEGIN: " << (my_rank < senders_ ?
"sending" :
"receiving");
108 start_time_ = std::chrono::steady_clock::now();
109 std::pair<size_t, double> result;
110 if (my_rank >= senders_ + receivers_)
114 if (my_rank < senders_)
116 std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_);
117 for (
int ii = 0; ii < sending_threads_; ++ii)
119 results_futures[ii] = std::async(std::bind(&TransferTest::do_sending,
this, ii));
121 for (
auto& future : results_futures)
125 auto thisresult = future.get();
126 result.first += thisresult.first;
127 result.second += thisresult.second;
133 result = do_receiving();
135 auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count();
136 TLOG(TLVL_INFO) << (my_rank < senders_ ?
"Sent " :
"Received ") << result.first <<
" bytes in " << duration <<
" seconds ( " << formatBytes(result.first / duration) <<
"/s )." << std::endl;
137 TLOG(TLVL_INFO) <<
"Rate of " << (my_rank < senders_ ?
"sending" :
"receiving") <<
": " << formatBytes(result.first / result.second) <<
"/s." << std::endl;
138 metricMan->do_stop();
139 metricMan->shutdown();
140 TLOG(TLVL_DEBUG + 36) <<
"runTest DONE";
// Sending loop for one sending thread: builds a Fragment, sends it sends_each_sender_ times
// through `sender`, and accumulates byte and timing totals.
// NOTE(review): this listing omits some original lines (braces, the declaration of `sender`,
// the update of error_count and any guards around the validation code); the comments below
// describe only the statements that are visible.
// @param index Offset added to ii * sending_threads_ to form each Fragment's sequence ID.
// @return (total bytes from frag.sizeBytes(), summed send duration as counted by
//         artdaq::TimeUtils::seconds); (0, 0.0) on every visible abort path.
144 std::pair<size_t, double> artdaq::TransferTest::do_sending(
int index)
146 TLOG(TLVL_DEBUG + 34) <<
"do_sending entered RawFragmentHeader::num_words()=" << artdaq::detail::RawFragmentHeader::num_words();
148 size_t totalSize = 0;
149 double totalTime = 0;
// Payload length in RawDataType words: configured fragment size minus the header words.
// The constructor raises fragment_size_ to at least the header size before this runs.
152 unsigned data_size_wrds = (fragment_size_ /
sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words();
153 artdaq::Fragment frag(data_size_wrds);
// Fill the first payload with 1, 2, 3, ... and immediately read it back to verify.
157 artdaq::RawDataType gen_seed = 0;
159 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() {
return ++gen_seed; });
160 for (
size_t ii = 0; ii < frag.dataSize(); ++ii)
162 if (*(frag.dataBegin() + ii) != ii + 1)
164 TLOG(TLVL_ERROR) <<
"Data corruption detected! (" << (*(frag.dataBegin() + ii)) <<
" != " << (ii + 1) <<
") Aborting!";
166 return std::make_pair(0, 0.0);
// Metrics are reported every sends_each_sender_ / 1000 iterations, but at least every iteration.
171 int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1;
// Per-interval accumulators; reset to zero each time metrics are sent.
172 auto init_time_metric = 0.0;
173 auto send_time_metric = 0.0;
174 auto after_time_metric = 0.0;
175 auto send_size_metric = 0.0;
176 auto error_count = 0;
178 for (
int ii = 0; ii < sends_each_sender_; ++ii)
180 auto loop_start = std::chrono::steady_clock::now();
181 TLOG(TLVL_DEBUG + 34) <<
"sender rank " << my_rank <<
" #" << ii <<
" resized bytes=" << frag.sizeBytes();
182 totalSize += frag.sizeBytes();
// Sequence IDs interleave the sending threads: iteration ii of thread `index`.
185 frag.setSequenceID(ii * sending_threads_ + index);
186 frag.setFragmentID(my_rank);
187 frag.setSystemType(artdaq::Fragment::DataFragmentType);
194 auto send_start = std::chrono::steady_clock::now();
195 TLOG(TLVL_DEBUG + 32) <<
"Sender " << my_rank <<
" sending fragment " << ii;
// frag is moved into sendFragment here; a fresh Fragment is assigned to it further down.
196 auto stspair = sender.sendFragment(std::move(frag));
197 auto after_send = std::chrono::steady_clock::now();
198 TLOG(TLVL_DEBUG + 33) <<
"Sender " << my_rank <<
" sent fragment " << ii;
// Drop the routing table entry for the sequence ID that was just sent.
199 sender.RemoveRoutingTableEntry(ii * sending_threads_ + index);
// Abort once error_count reaches the configured limit; return_code_ records how many
// sends were not completed. (The code that increments error_count is not visible here.)
205 if (error_count >= error_count_max_)
207 TLOG(TLVL_ERROR) <<
"Too many errors sending fragments! Aborting... (sent=" << ii <<
"/" << sends_each_sender_ <<
")";
208 return_code_ = sends_each_sender_ - ii;
209 return std::make_pair(0, 0.0);
// Replace the moved-from Fragment with a new one of the same payload size.
213 frag = artdaq::Fragment(data_size_wrds);
// Payload word jj of the next Fragment becomes (ii + 1) + jj + 1.
// NOTE(review): do_receiving checks each word against sequenceID() + position + 1, while the
// sequence ID set above is ii * sending_threads_ + index; these only agree with this seed when
// sending_threads_ == 1 and index == 0 - confirm how validation is meant to work otherwise.
216 artdaq::RawDataType gen_seed = ii + 1;
218 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() {
return ++gen_seed; });
219 for (
size_t jj = 0; jj < frag.dataSize(); ++jj)
221 if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1)
// NOTE(review): the " at position " field below prints ii (the send index), not jj.
223 TLOG(TLVL_ERROR) <<
"Input Data corruption detected! (" << *(frag.dataBegin() + jj) <<
" != " << ii + jj + 2 <<
" at position " << ii <<
") Aborting!";
225 return std::make_pair(0, 0.0);
229 TLOG(TLVL_DEBUG + 37) <<
"sender rank " << my_rank <<
" frag replaced";
// Timing breakdown for this iteration: setup before the send, the send itself,
// and the work done after the send returned.
231 auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count();
232 totalTime += total_send_time;
233 send_time_metric += total_send_time;
234 send_size_metric += data_size_wrds *
sizeof(artdaq::RawDataType);
235 after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count();
236 init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count();
// Publish the accumulated values every metric_send_interval iterations when metricMan is set.
238 if (metricMan && ii % metric_send_interval == 0)
240 metricMan->sendMetric(
"send_init_time", init_time_metric,
"seconds", 3, MetricMode::Accumulate);
241 metricMan->sendMetric(
"total_send_time", send_time_metric,
"seconds", 3, MetricMode::Accumulate);
242 metricMan->sendMetric(
"after_send_time", after_time_metric,
"seconds", 3, MetricMode::Accumulate);
243 metricMan->sendMetric(
"send_rate", send_size_metric / send_time_metric,
"B/s", 3, MetricMode::Average);
244 init_time_metric = 0.0;
245 send_time_metric = 0.0;
246 after_time_metric = 0.0;
247 send_size_metric = 0.0;
252 return std::make_pair(totalSize, totalTime);
// Receiving loop: pulls Fragments from `receiver` until the loop condition below fails,
// accumulating received bytes and time spent in recvFragment.
// NOTE(review): this listing omits some original lines (braces, the declarations of `receiver`,
// `senderSlot` and `thisSize`, and the updates of `counter` and `activeSenders`); the comments
// below describe only the statements that are visible.
// @return (total bytes received, summed receive duration as counted by
//         artdaq::TimeUtils::seconds); (0, 0.0) on the visible abort paths.
255 std::pair<size_t, double> artdaq::TransferTest::do_receiving()
257 TLOG(TLVL_DEBUG + 34) <<
"do_receiving entered";
260 receiver.start_threads();
// counter starts at the number of Fragments this receiver expects.
261 int counter = receives_each_receiver_;
262 size_t totalSize = 0;
263 double totalTime = 0;
// "nonblocking_sends" is read from the generated ps_, defaulting to false.
265 bool nonblocking_mode = ps_.get<
bool>(
"nonblocking_sends",
false);
// One active sender is counted per sending thread on every sender rank.
266 std::atomic<int> activeSenders(senders_ * sending_threads_);
267 auto end_loop = std::chrono::steady_clock::now();
// Per-interval accumulators; reset to zero each time metrics are sent.
269 auto recv_size_metric = 0.0;
270 auto recv_time_metric = 0.0;
271 auto input_wait_metric = 0.0;
272 auto init_wait_metric = 0.0;
// NOTE(review): the true branch yields receives_each_receiver_ itself, whereas the matching
// expression in do_sending yields sends_each_sender_ / 1000 - confirm whether
// receives_each_receiver_ / 1000 was intended here.
273 int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1;
// Keep receiving while Fragments are still expected (counter > 0) and either senders are still
// active or, in blocking mode, more than a tenth of the expected Fragments are outstanding.
276 while ((activeSenders > 0 || (counter > receives_each_receiver_ / 10 && !nonblocking_mode)) && counter > 0)
278 auto start_loop = std::chrono::steady_clock::now();
279 TLOG(TLVL_DEBUG + 34) <<
"do_receiving: Counter is " << counter <<
", calling recvFragment (activeSenders=" << activeSenders <<
")";
281 auto before_receive = std::chrono::steady_clock::now();
283 auto ignoreFragPtr = receiver.recvFragment(senderSlot);
284 auto after_receive = std::chrono::steady_clock::now();
285 init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count();
// Dispatch on the Fragment type: EndOfData, unexpected non-data type, or (in the lines that
// follow) the data case.
289 if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
291 TLOG(TLVL_INFO) <<
"Receiver " << my_rank <<
" received EndOfData Fragment from Sender " << senderSlot;
293 TLOG(TLVL_DEBUG + 32) <<
"Active Senders is now " << activeSenders;
295 else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType)
297 TLOG(TLVL_WARNING) <<
"Receiver " << my_rank <<
" received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) <<
" (Unexpected!)";
// Restarts the timer that runTest uses for its overall duration; the condition guarding this
// reset is not visible in this listing.
303 start_time_ = std::chrono::steady_clock::now();
307 TLOG(TLVL_INFO) <<
"Receiver " << my_rank <<
" received fragment " << receives_each_receiver_ - counter
308 <<
" with seqID " << ignoreFragPtr->sequenceID() <<
" from Sender " << senderSlot <<
" (Expecting " << counter <<
" more)";
// Byte size of this Fragment: size() multiplied by sizeof(RawDataType).
309 thisSize = ignoreFragPtr->size() *
sizeof(artdaq::RawDataType);
310 totalSize += thisSize;
// Payload check: word ii must equal sequenceID() + ii + 1; any mismatch aborts with (0, 0.0).
313 for (
size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii)
315 if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1)
317 TLOG(TLVL_ERROR) <<
"Output Data corruption detected! (" << *(ignoreFragPtr->dataBegin() + ii) <<
" != " << (ignoreFragPtr->sequenceID() + ii + 1) <<
" at position " << ii <<
") Aborting!";
319 return std::make_pair(0, 0.0);
// Time between the end of the previous iteration and the completion of this receive.
324 input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count();
// Error path; the condition that selects it is not visible in this listing.
328 TLOG(TLVL_ERROR) <<
"Receiver " << my_rank <<
" detected fatal protocol error! Reducing active sender count by one!" << std::endl;
330 TLOG(TLVL_DEBUG + 32) <<
"Active Senders is now " << activeSenders;
332 TLOG(TLVL_DEBUG + 34) <<
"do_receiving: Recv Loop end, counter is " << counter;
// Time spent inside recvFragment for this iteration.
334 auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count();
335 recv_time_metric += total_recv_time;
336 totalTime += total_recv_time;
337 recv_size_metric += thisSize;
// Publish the accumulated values when counter is a multiple of metric_send_interval and
// metricMan is set.
339 if (metricMan && counter % metric_send_interval == 0)
341 metricMan->sendMetric(
"input_wait", input_wait_metric,
"seconds", 3, MetricMode::Accumulate);
342 metricMan->sendMetric(
"recv_init_time", init_wait_metric,
"seconds", 3, MetricMode::Accumulate);
343 metricMan->sendMetric(
"total_recv_time", recv_time_metric,
"seconds", 3, MetricMode::Accumulate);
344 metricMan->sendMetric(
"recv_rate", recv_size_metric / recv_time_metric,
"B/s", 3, MetricMode::Average);
346 input_wait_metric = 0.0;
347 init_wait_metric = 0.0;
348 recv_time_metric = 0.0;
349 recv_size_metric = 0.0;
351 end_loop = std::chrono::steady_clock::now();
// After the loop: in blocking mode, Fragments still outstanding are an error; return_code_
// records how many are missing.
354 if (counter != 0 && !nonblocking_mode)
356 TLOG(TLVL_ERROR) <<
"Did not receive all expected Fragments! Missing " << counter <<
" Fragments!";
357 return_code_ = counter;
358 return std::make_pair(0, 0.0);
361 return std::make_pair(totalSize, totalTime);
int runTest()
Run the test as configured.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured, otherwise will Round-Robin Fragments to the destinations.
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
The send operation completed successfully.
TransferTest(fhicl::ParameterSet psi)
TransferTest Constructor.
For code clarity, things checking for successful receive should check retval >= NO_RANK_INFO.
Value to be returned upon receive timeout.