00001 #include "artdaq/DAQrate/TransferTest.hh"
00002
00003 #include "artdaq-core/Data/Fragment.hh"
00004 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
00005 #include "artdaq/DAQrate/DataSenderManager.hh"
00006
00007 #define TRACE_NAME "TransferTest"
00008 #include "artdaq/DAQdata/Globals.hh"
00009
00010 #include "fhiclcpp/make_ParameterSet.h"
00011
00012 #include <future>
00013
00014 artdaq::TransferTest::TransferTest(fhicl::ParameterSet psi)
00015 : senders_(psi.get<int>("num_senders"))
00016 , receivers_(psi.get<int>("num_receivers"))
00017 , sending_threads_(psi.get<int>("sending_threads", 1))
00018 , sends_each_sender_(psi.get<int>("sends_per_sender"))
00019 , receives_each_receiver_(senders_ * sending_threads_ * sends_each_sender_ / receivers_)
00020 , buffer_count_(psi.get<int>("buffer_count", 10))
00021 , error_count_max_(psi.get<int>("max_errors_before_abort", 3))
00022 , fragment_size_(psi.get<size_t>("fragment_size", 0x100000))
00023 , ps_()
00024 , validate_mode_(psi.get<bool>("validate_data_mode", false))
00025 , partition_number_(psi.get<int>("partition_number", rand() % 0x7F))
00026 {
00027 TLOG(10) << "CONSTRUCTOR";
00028 metricMan = &metricMan_;
00029
00030 if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType))
00031 {
00032 fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType);
00033 }
00034
00035 fhicl::ParameterSet metric_pset;
00036
00037 try
00038 {
00039 metric_pset = psi.get<fhicl::ParameterSet>("metrics");
00040 }
00041 catch (...) {}
00042
00043 try
00044 {
00045 std::string name = "TransferTest" + std::to_string(my_rank);
00046 metricMan_.initialize(metric_pset, name);
00047 metricMan_.do_start();
00048 }
00049 catch (...) {}
00050
00051 std::string type(psi.get<std::string>("transfer_plugin_type", "Shmem"));
00052
00053 if (receivers_ > 0)
00054 {
00055 if (senders_ * sends_each_sender_ % receivers_ != 0)
00056 {
00057 TLOG(TLVL_TRACE) << "Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl;
00058 while (senders_ * sends_each_sender_ % receivers_ != 0)
00059 {
00060 sends_each_sender_++;
00061 }
00062 receives_each_receiver_ = senders_ * sends_each_sender_ / receivers_;
00063 TLOG(TLVL_TRACE) << "sends_each_sender is now " << sends_each_sender_ << std::endl;
00064 psi.put_or_replace("sends_per_sender", sends_each_sender_);
00065 }
00066 }
00067
00068 std::string hostmap = "";
00069 if (psi.has_key("hostmap"))
00070 {
00071 hostmap = " host_map: @local::hostmap";
00072 }
00073
00074 std::stringstream ss;
00075 ss << psi.to_string() << std::endl;
00076
00077 ss << " sources: {";
00078 for (int ii = 0; ii < senders_; ++ii)
00079 {
00080 ss << "s" << ii << ": { transferPluginType: " << type << " source_rank: " << ii << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
00081 }
00082 ss << "}" << std::endl << " destinations: {";
00083 for (int jj = senders_; jj < senders_ + receivers_; ++jj)
00084 {
00085 ss << "d" << jj << ": { transferPluginType: " << type << " destination_rank: " << jj << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
00086 }
00087 ss << "}" << std::endl;
00088
00089 make_ParameterSet(ss.str(), ps_);
00090
00091
00092 TLOG(TLVL_DEBUG) << "Going to configure with ParameterSet: " << ps_.to_string() << std::endl;
00093 }
00094
00095 int artdaq::TransferTest::runTest()
00096 {
00097 TLOG(TLVL_INFO) << "runTest BEGIN: " << (my_rank < senders_ ? "sending" : "receiving");
00098 start_time_ = std::chrono::steady_clock::now();
00099 std::pair<size_t, double> result;
00100 if (my_rank >= senders_ + receivers_) return 0;
00101 if (my_rank < senders_)
00102 {
00103 std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_);
00104 for (int ii = 0; ii < sending_threads_; ++ii)
00105 {
00106 results_futures[ii] = std::async(std::bind(&TransferTest::do_sending, this, ii));
00107 }
00108 for (auto& future : results_futures)
00109 {
00110 if (future.valid())
00111 {
00112 auto thisresult = future.get();
00113 result.first += thisresult.first;
00114 result.second += thisresult.second;
00115 }
00116 }
00117 }
00118 else
00119 {
00120 result = do_receiving();
00121 }
00122 auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count();
00123 TLOG(TLVL_INFO) << (my_rank < senders_ ? "Sent " : "Received ") << result.first << " bytes in " << duration << " seconds ( " << formatBytes(result.first / duration) << "/s )." << std::endl;
00124 TLOG(TLVL_INFO) << "Rate of " << (my_rank < senders_ ? "sending" : "receiving") << ": " << formatBytes(result.first / result.second) << "/s." << std::endl;
00125 metricMan_.do_stop();
00126 metricMan_.shutdown();
00127 TLOG(11) << "runTest DONE";
00128 return 0;
00129 }
00130
00131 std::pair<size_t, double> artdaq::TransferTest::do_sending(int index)
00132 {
00133 TLOG(7) << "do_sending entered RawFragmentHeader::num_words()=" << std::to_string(artdaq::detail::RawFragmentHeader::num_words());
00134
00135 size_t totalSize = 0;
00136 double totalTime = 0;
00137 artdaq::DataSenderManager sender(ps_);
00138
00139 unsigned data_size_wrds = (fragment_size_ / sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words();
00140 artdaq::Fragment frag(data_size_wrds);
00141
00142 if (validate_mode_)
00143 {
00144 artdaq::RawDataType gen_seed = 0;
00145
00146 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
00147 for (size_t ii = 0; ii < frag.dataSize(); ++ii)
00148 {
00149 if (*(frag.dataBegin() + ii) != ii + 1)
00150 {
00151 TLOG(TLVL_ERROR) << "Data corruption detected! (" << std::to_string(*(frag.dataBegin() + ii)) << " != " << std::to_string(ii + 1) << ") Aborting!";
00152 exit(1);
00153 }
00154 }
00155 }
00156
00157 int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1;
00158 auto init_time_metric = 0.0;
00159 auto send_time_metric = 0.0;
00160 auto after_time_metric = 0.0;
00161 auto send_size_metric = 0.0;
00162 auto error_count = 0;
00163
00164 for (int ii = 0; ii < sends_each_sender_; ++ii)
00165 {
00166 auto loop_start = std::chrono::steady_clock::now();
00167 TLOG(7) << "sender rank " << my_rank << " #" << ii << " resized bytes=" << std::to_string(frag.sizeBytes());
00168 totalSize += frag.sizeBytes();
00169
00170
00171 frag.setSequenceID(ii * sending_threads_ + index);
00172 frag.setFragmentID(my_rank);
00173 frag.setSystemType(artdaq::Fragment::DataFragmentType);
00174
00175
00176
00177
00178
00179
00180 auto send_start = std::chrono::steady_clock::now();
00181 auto stspair = sender.sendFragment(std::move(frag));
00182 TLOG(TLVL_DEBUG) << "Sender " << my_rank << " sending fragment " << ii;
00183 auto after_send = std::chrono::steady_clock::now();
00184 TLOG(TLVL_TRACE) << "Sender " << my_rank << " sent fragment " << ii;
00185
00186
00187 if (stspair.second != artdaq::TransferInterface::CopyStatus::kSuccess)
00188 {
00189 error_count++;
00190 if (error_count >= error_count_max_)
00191 {
00192 TLOG(TLVL_ERROR) << "Too many errors sending fragments! Aborting... (sent=" << ii << "/" << sends_each_sender_ << ")";
00193 exit(sends_each_sender_ - ii);
00194 }
00195 }
00196
00197 frag = artdaq::Fragment(data_size_wrds);
00198 if (validate_mode_)
00199 {
00200 artdaq::RawDataType gen_seed = ii + 1;
00201
00202 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
00203 for (size_t jj = 0; jj < frag.dataSize(); ++jj)
00204 {
00205 if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1)
00206 {
00207 TLOG(TLVL_ERROR) << "Input Data corruption detected! (" << std::to_string(*(frag.dataBegin() + jj)) << " != " << std::to_string(ii + jj + 2) << " at position " << ii << ") Aborting!";
00208 exit(1);
00209 }
00210 }
00211 }
00212 TLOG(9) << "sender rank " << my_rank << " frag replaced";
00213
00214 auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count();
00215 totalTime += total_send_time;
00216 send_time_metric += total_send_time;
00217 send_size_metric += data_size_wrds * sizeof(artdaq::RawDataType);
00218 after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count();
00219 init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count();
00220
00221 if (metricMan && ii % metric_send_interval == 0)
00222 {
00223 metricMan->sendMetric("send_init_time", init_time_metric, "seconds", 3, MetricMode::Accumulate);
00224 metricMan->sendMetric("total_send_time", send_time_metric, "seconds", 3, MetricMode::Accumulate);
00225 metricMan->sendMetric("after_send_time", after_time_metric, "seconds", 3, MetricMode::Accumulate);
00226 metricMan->sendMetric("send_rate", send_size_metric / send_time_metric, "B/s", 3, MetricMode::Average);
00227 init_time_metric = 0.0;
00228 send_time_metric = 0.0;
00229 after_time_metric = 0.0;
00230 send_size_metric = 0.0;
00231 }
00232 usleep(0);
00233 }
00234
00235 return std::make_pair(totalSize, totalTime);
00236 }
00237
00238 std::pair<size_t, double> artdaq::TransferTest::do_receiving()
00239 {
00240 TLOG(7) << "do_receiving entered";
00241
00242 artdaq::FragmentReceiverManager receiver(ps_);
00243 receiver.start_threads();
00244 int counter = receives_each_receiver_;
00245 size_t totalSize = 0;
00246 double totalTime = 0;
00247 bool first = true;
00248 int activeSenders = senders_ * sending_threads_;
00249 auto end_loop = std::chrono::steady_clock::now();
00250
00251 auto recv_size_metric = 0.0;
00252 auto recv_time_metric = 0.0;
00253 auto input_wait_metric = 0.0;
00254 auto init_wait_metric = 0.0;
00255 int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1;
00256
00257
00258 while ((activeSenders > 0 || counter > receives_each_receiver_ / 10) && counter > 0)
00259 {
00260 auto start_loop = std::chrono::steady_clock::now();
00261 TLOG(7) << "do_receiving: Counter is " << counter << ", calling recvFragment";
00262 int senderSlot = artdaq::TransferInterface::RECV_TIMEOUT;
00263 auto before_receive = std::chrono::steady_clock::now();
00264 init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count();
00265
00266 auto ignoreFragPtr = receiver.recvFragment(senderSlot);
00267 auto after_receive = std::chrono::steady_clock::now();
00268 size_t thisSize = 0;
00269 if (senderSlot >= artdaq::TransferInterface::RECV_SUCCESS && ignoreFragPtr)
00270 {
00271 if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
00272 {
00273 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received EndOfData Fragment from Sender " << senderSlot;
00274 activeSenders--;
00275 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
00276 }
00277 else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType)
00278 {
00279 TLOG(TLVL_WARNING) << "Receiver " << my_rank << " received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) << " (Unexpected!)";
00280 }
00281 else
00282 {
00283 if (first)
00284 {
00285 start_time_ = std::chrono::steady_clock::now();
00286 first = false;
00287 }
00288 counter--;
00289 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received fragment " << receives_each_receiver_ - counter
00290 << " with seqID " << std::to_string(ignoreFragPtr->sequenceID()) << " from Sender " << senderSlot << " (Expecting " << counter << " more)";
00291 thisSize = ignoreFragPtr->size() * sizeof(artdaq::RawDataType);
00292 totalSize += thisSize;
00293 if (validate_mode_)
00294 {
00295 for (size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii)
00296 {
00297 if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1)
00298 {
00299 TLOG(TLVL_ERROR) << "Output Data corruption detected! (" << std::to_string(*(ignoreFragPtr->dataBegin() + ii)) << " != " << std::to_string(ignoreFragPtr->sequenceID() + ii + 1) << " at position " << ii << ") Aborting!";
00300 exit(1);
00301 }
00302 }
00303 }
00304 }
00305 input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count();
00306 }
00307 else if (senderSlot == artdaq::TransferInterface::DATA_END)
00308 {
00309 TLOG(TLVL_ERROR) << "Receiver " << my_rank << " detected fatal protocol error! Reducing active sender count by one!" << std::endl;
00310 activeSenders--;
00311 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
00312 }
00313 TLOG(7) << "do_receiving: Recv Loop end, counter is " << counter;
00314
00315
00316 auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count();
00317 recv_time_metric += total_recv_time;
00318 totalTime += total_recv_time;
00319 recv_size_metric += thisSize;
00320
00321 if (metricMan && counter % metric_send_interval == 0)
00322 {
00323 metricMan->sendMetric("input_wait", input_wait_metric, "seconds", 3, MetricMode::Accumulate);
00324 metricMan->sendMetric("recv_init_time", init_wait_metric, "seconds", 3, MetricMode::Accumulate);
00325 metricMan->sendMetric("total_recv_time", recv_time_metric, "seconds", 3, MetricMode::Accumulate);
00326 metricMan->sendMetric("recv_rate", recv_size_metric / recv_time_metric, "B/s", 3, MetricMode::Average);
00327
00328 input_wait_metric = 0.0;
00329 init_wait_metric = 0.0;
00330 recv_time_metric = 0.0;
00331 recv_size_metric = 0.0;
00332 }
00333 end_loop = std::chrono::steady_clock::now();
00334 }
00335
00336 if (counter != 0)
00337 {
00338 TLOG(TLVL_ERROR) << "Did not receive all expected Fragments! Missing " << counter << " Fragments!";
00339 exit(counter);
00340 }
00341
00342 return std::make_pair(totalSize, totalTime);
00343 }