00001 #include "artdaq/DAQrate/TransferTest.hh"
00002
00003 #include "artdaq-core/Data/Fragment.hh"
00004 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
00005 #include "artdaq/DAQrate/DataSenderManager.hh"
00006
00007 #define TRACE_NAME "TransferTest"
00008 #include "artdaq/DAQdata/Globals.hh"
00009
00010 #include "fhiclcpp/make_ParameterSet.h"
00011
00012 #include <future>
00013
00014 artdaq::TransferTest::TransferTest(fhicl::ParameterSet psi)
00015 : senders_(psi.get<int>("num_senders"))
00016 , receivers_(psi.get<int>("num_receivers"))
00017 , sending_threads_(psi.get<int>("sending_threads", 1))
00018 , sends_each_sender_(psi.get<int>("sends_per_sender"))
00019 , receives_each_receiver_(0)
00020 , buffer_count_(psi.get<int>("buffer_count", 10))
00021 , error_count_max_(psi.get<int>("max_errors_before_abort", 3))
00022 , fragment_size_(psi.get<size_t>("fragment_size", 0x100000))
00023 , ps_()
00024 , validate_mode_(psi.get<bool>("validate_data_mode", false))
00025 , partition_number_(psi.get<int>("partition_number", rand() % 0x7F))
00026 {
00027 TLOG(10) << "CONSTRUCTOR";
00028
00029 if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType))
00030 {
00031 fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType);
00032 }
00033
00034 fhicl::ParameterSet metric_pset;
00035
00036 try
00037 {
00038 metric_pset = psi.get<fhicl::ParameterSet>("metrics");
00039 }
00040 catch (...) {}
00041
00042 try
00043 {
00044 std::string name = "TransferTest" + std::to_string(my_rank);
00045 metricMan->initialize(metric_pset, name);
00046 metricMan->do_start();
00047 }
00048 catch (...) {}
00049
00050 std::string type(psi.get<std::string>("transfer_plugin_type", "Shmem"));
00051
00052 bool broadcast_mode = psi.get<bool>("broadcast_sends", false);
00053 if (broadcast_mode)
00054 {
00055 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_;
00056 }
00057 else
00058 {
00059 if (receivers_ > 0)
00060 {
00061 if (senders_ * sending_threads_ * sends_each_sender_ % receivers_ != 0)
00062 {
00063 TLOG(TLVL_TRACE) << "Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl;
00064 while (senders_ * sends_each_sender_ % receivers_ != 0)
00065 {
00066 sends_each_sender_++;
00067 }
00068 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_;
00069 TLOG(TLVL_TRACE) << "sends_each_sender is now " << sends_each_sender_ << std::endl;
00070 psi.put_or_replace("sends_per_sender", sends_each_sender_);
00071 }
00072 else
00073 {
00074 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_;
00075 }
00076 }
00077 }
00078
00079 std::string hostmap = "";
00080 if (psi.has_key("hostmap"))
00081 {
00082 hostmap = " host_map: @local::hostmap";
00083 }
00084
00085 std::stringstream ss;
00086 ss << psi.to_string() << std::endl;
00087
00088 ss << " sources: {";
00089 for (int ii = 0; ii < senders_; ++ii)
00090 {
00091 ss << "s" << ii << ": { transferPluginType: " << type << " source_rank: " << ii << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
00092 }
00093 ss << "}" << std::endl << " destinations: {";
00094 for (int jj = senders_; jj < senders_ + receivers_; ++jj)
00095 {
00096 ss << "d" << jj << ": { transferPluginType: " << type << " destination_rank: " << jj << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
00097 }
00098 ss << "}" << std::endl;
00099
00100 make_ParameterSet(ss.str(), ps_);
00101
00102
00103 TLOG(TLVL_DEBUG) << "Going to configure with ParameterSet: " << ps_.to_string() << std::endl;
00104 }
00105
00106 int artdaq::TransferTest::runTest()
00107 {
00108 TLOG(TLVL_INFO) << "runTest BEGIN: " << (my_rank < senders_ ? "sending" : "receiving");
00109 start_time_ = std::chrono::steady_clock::now();
00110 std::pair<size_t, double> result;
00111 if (my_rank >= senders_ + receivers_) return 0;
00112 if (my_rank < senders_)
00113 {
00114 std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_);
00115 for (int ii = 0; ii < sending_threads_; ++ii)
00116 {
00117 results_futures[ii] = std::async(std::bind(&TransferTest::do_sending, this, ii));
00118 }
00119 for (auto& future : results_futures)
00120 {
00121 if (future.valid())
00122 {
00123 auto thisresult = future.get();
00124 result.first += thisresult.first;
00125 result.second += thisresult.second;
00126 }
00127 }
00128 }
00129 else
00130 {
00131 result = do_receiving();
00132 }
00133 auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count();
00134 TLOG(TLVL_INFO) << (my_rank < senders_ ? "Sent " : "Received ") << result.first << " bytes in " << duration << " seconds ( " << formatBytes(result.first / duration) << "/s )." << std::endl;
00135 TLOG(TLVL_INFO) << "Rate of " << (my_rank < senders_ ? "sending" : "receiving") << ": " << formatBytes(result.first / result.second) << "/s." << std::endl;
00136 metricMan->do_stop();
00137 metricMan->shutdown();
00138 TLOG(11) << "runTest DONE";
00139 return 0;
00140 }
00141
00142 std::pair<size_t, double> artdaq::TransferTest::do_sending(int index)
00143 {
00144 TLOG(7) << "do_sending entered RawFragmentHeader::num_words()=" << artdaq::detail::RawFragmentHeader::num_words();
00145
00146 size_t totalSize = 0;
00147 double totalTime = 0;
00148 artdaq::DataSenderManager sender(ps_);
00149
00150 unsigned data_size_wrds = (fragment_size_ / sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words();
00151 artdaq::Fragment frag(data_size_wrds);
00152
00153 if (validate_mode_)
00154 {
00155 artdaq::RawDataType gen_seed = 0;
00156
00157 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
00158 for (size_t ii = 0; ii < frag.dataSize(); ++ii)
00159 {
00160 if (*(frag.dataBegin() + ii) != ii + 1)
00161 {
00162 TLOG(TLVL_ERROR) << "Data corruption detected! (" << (*(frag.dataBegin() + ii)) << " != " << (ii + 1) << ") Aborting!";
00163 exit(1);
00164 }
00165 }
00166 }
00167
00168 int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1;
00169 auto init_time_metric = 0.0;
00170 auto send_time_metric = 0.0;
00171 auto after_time_metric = 0.0;
00172 auto send_size_metric = 0.0;
00173 auto error_count = 0;
00174
00175 for (int ii = 0; ii < sends_each_sender_; ++ii)
00176 {
00177 auto loop_start = std::chrono::steady_clock::now();
00178 TLOG(7) << "sender rank " << my_rank << " #" << ii << " resized bytes=" << frag.sizeBytes();
00179 totalSize += frag.sizeBytes();
00180
00181
00182 frag.setSequenceID(ii * sending_threads_ + index);
00183 frag.setFragmentID(my_rank);
00184 frag.setSystemType(artdaq::Fragment::DataFragmentType);
00185
00186
00187
00188
00189
00190
00191 auto send_start = std::chrono::steady_clock::now();
00192 TLOG(TLVL_DEBUG) << "Sender " << my_rank << " sending fragment " << ii;
00193 auto stspair = sender.sendFragment(std::move(frag));
00194 auto after_send = std::chrono::steady_clock::now();
00195 TLOG(TLVL_TRACE) << "Sender " << my_rank << " sent fragment " << ii;
00196
00197
00198 if (stspair.second != artdaq::TransferInterface::CopyStatus::kSuccess)
00199 {
00200 error_count++;
00201 if (error_count >= error_count_max_)
00202 {
00203 TLOG(TLVL_ERROR) << "Too many errors sending fragments! Aborting... (sent=" << ii << "/" << sends_each_sender_ << ")";
00204 exit(sends_each_sender_ - ii);
00205 }
00206 }
00207
00208 frag = artdaq::Fragment(data_size_wrds);
00209 if (validate_mode_)
00210 {
00211 artdaq::RawDataType gen_seed = ii + 1;
00212
00213 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
00214 for (size_t jj = 0; jj < frag.dataSize(); ++jj)
00215 {
00216 if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1)
00217 {
00218 TLOG(TLVL_ERROR) << "Input Data corruption detected! (" << *(frag.dataBegin() + jj) << " != " << ii + jj + 2 << " at position " << ii << ") Aborting!";
00219 exit(1);
00220 }
00221 }
00222 }
00223 TLOG(9) << "sender rank " << my_rank << " frag replaced";
00224
00225 auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count();
00226 totalTime += total_send_time;
00227 send_time_metric += total_send_time;
00228 send_size_metric += data_size_wrds * sizeof(artdaq::RawDataType);
00229 after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count();
00230 init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count();
00231
00232 if (metricMan && ii % metric_send_interval == 0)
00233 {
00234 metricMan->sendMetric("send_init_time", init_time_metric, "seconds", 3, MetricMode::Accumulate);
00235 metricMan->sendMetric("total_send_time", send_time_metric, "seconds", 3, MetricMode::Accumulate);
00236 metricMan->sendMetric("after_send_time", after_time_metric, "seconds", 3, MetricMode::Accumulate);
00237 metricMan->sendMetric("send_rate", send_size_metric / send_time_metric, "B/s", 3, MetricMode::Average);
00238 init_time_metric = 0.0;
00239 send_time_metric = 0.0;
00240 after_time_metric = 0.0;
00241 send_size_metric = 0.0;
00242 }
00243 usleep(0);
00244 }
00245
00246 return std::make_pair(totalSize, totalTime);
00247 }
00248
00249 std::pair<size_t, double> artdaq::TransferTest::do_receiving()
00250 {
00251 TLOG(7) << "do_receiving entered";
00252
00253 artdaq::FragmentReceiverManager receiver(ps_);
00254 receiver.start_threads();
00255 int counter = receives_each_receiver_;
00256 size_t totalSize = 0;
00257 double totalTime = 0;
00258 bool first = true;
00259 bool nonblocking_mode = ps_.get<bool>("nonblocking_sends", false);
00260 std::atomic<int> activeSenders(senders_ * sending_threads_);
00261 auto end_loop = std::chrono::steady_clock::now();
00262
00263 auto recv_size_metric = 0.0;
00264 auto recv_time_metric = 0.0;
00265 auto input_wait_metric = 0.0;
00266 auto init_wait_metric = 0.0;
00267 int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1;
00268
00269
00270 while ((activeSenders > 0 || (counter > receives_each_receiver_ / 10 && !nonblocking_mode)) && counter > 0)
00271 {
00272 auto start_loop = std::chrono::steady_clock::now();
00273 TLOG(7) << "do_receiving: Counter is " << counter << ", calling recvFragment (activeSenders=" << activeSenders << ")";
00274 int senderSlot = artdaq::TransferInterface::RECV_TIMEOUT;
00275 auto before_receive = std::chrono::steady_clock::now();
00276
00277 auto ignoreFragPtr = receiver.recvFragment(senderSlot);
00278 auto after_receive = std::chrono::steady_clock::now();
00279 init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count();
00280 size_t thisSize = 0;
00281 if (senderSlot >= artdaq::TransferInterface::RECV_SUCCESS && ignoreFragPtr)
00282 {
00283 if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
00284 {
00285 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received EndOfData Fragment from Sender " << senderSlot;
00286 activeSenders--;
00287 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
00288 }
00289 else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType)
00290 {
00291 TLOG(TLVL_WARNING) << "Receiver " << my_rank << " received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) << " (Unexpected!)";
00292 }
00293 else
00294 {
00295 if (first)
00296 {
00297 start_time_ = std::chrono::steady_clock::now();
00298 first = false;
00299 }
00300 counter--;
00301 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received fragment " << receives_each_receiver_ - counter
00302 << " with seqID " << ignoreFragPtr->sequenceID() << " from Sender " << senderSlot << " (Expecting " << counter << " more)";
00303 thisSize = ignoreFragPtr->size() * sizeof(artdaq::RawDataType);
00304 totalSize += thisSize;
00305 if (validate_mode_)
00306 {
00307 for (size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii)
00308 {
00309 if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1)
00310 {
00311 TLOG(TLVL_ERROR) << "Output Data corruption detected! (" << *(ignoreFragPtr->dataBegin() + ii) << " != " << (ignoreFragPtr->sequenceID() + ii + 1) << " at position " << ii << ") Aborting!";
00312 exit(1);
00313 }
00314 }
00315 }
00316 }
00317 input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count();
00318 }
00319 else if (senderSlot == artdaq::TransferInterface::DATA_END)
00320 {
00321 TLOG(TLVL_ERROR) << "Receiver " << my_rank << " detected fatal protocol error! Reducing active sender count by one!" << std::endl;
00322 activeSenders--;
00323 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
00324 }
00325 TLOG(7) << "do_receiving: Recv Loop end, counter is " << counter;
00326
00327
00328 auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count();
00329 recv_time_metric += total_recv_time;
00330 totalTime += total_recv_time;
00331 recv_size_metric += thisSize;
00332
00333 if (metricMan && counter % metric_send_interval == 0)
00334 {
00335 metricMan->sendMetric("input_wait", input_wait_metric, "seconds", 3, MetricMode::Accumulate);
00336 metricMan->sendMetric("recv_init_time", init_wait_metric, "seconds", 3, MetricMode::Accumulate);
00337 metricMan->sendMetric("total_recv_time", recv_time_metric, "seconds", 3, MetricMode::Accumulate);
00338 metricMan->sendMetric("recv_rate", recv_size_metric / recv_time_metric, "B/s", 3, MetricMode::Average);
00339
00340 input_wait_metric = 0.0;
00341 init_wait_metric = 0.0;
00342 recv_time_metric = 0.0;
00343 recv_size_metric = 0.0;
00344 }
00345 end_loop = std::chrono::steady_clock::now();
00346 }
00347
00348 if (counter != 0 && !nonblocking_mode)
00349 {
00350 TLOG(TLVL_ERROR) << "Did not receive all expected Fragments! Missing " << counter << " Fragments!";
00351 exit(counter);
00352 }
00353
00354 return std::make_pair(totalSize, totalTime);
00355 }