00001 #include "artdaq/DAQrate/TransferTest.hh"
00002
00003 #include "artdaq-core/Data/Fragment.hh"
00004 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
00005 #include "artdaq/DAQrate/DataSenderManager.hh"
00006
00007 #define TRACE_NAME "TransferTest"
00008 #include "artdaq/DAQdata/Globals.hh"
00009
00010 #include "fhiclcpp/make_ParameterSet.h"
00011
00012 #include <future>
00013
00014 artdaq::TransferTest::TransferTest(fhicl::ParameterSet psi)
00015 : senders_(psi.get<int>("num_senders"))
00016 , receivers_(psi.get<int>("num_receivers"))
00017 , sending_threads_(psi.get<int>("sending_threads", 1))
00018 , sends_each_sender_(psi.get<int>("sends_per_sender"))
00019 , receives_each_receiver_(senders_ * sending_threads_ * sends_each_sender_ / receivers_)
00020 , buffer_count_(psi.get<int>("buffer_count", 10))
00021 , error_count_max_(psi.get<int>("max_errors_before_abort", 3))
00022 , fragment_size_(psi.get<size_t>("fragment_size", 0x100000))
00023 , ps_()
00024 , validate_mode_(psi.get<bool>("validate_data_mode", false))
00025 , partition_number_(psi.get<int>("partition_number", rand() % 0x7F))
00026 {
00027 TLOG(10) << "CONSTRUCTOR";
00028
00029 if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType))
00030 {
00031 fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType);
00032 }
00033
00034 fhicl::ParameterSet metric_pset;
00035
00036 try
00037 {
00038 metric_pset = psi.get<fhicl::ParameterSet>("metrics");
00039 }
00040 catch (...) {}
00041
00042 try
00043 {
00044 std::string name = "TransferTest" + std::to_string(my_rank);
00045 metricMan->initialize(metric_pset, name);
00046 metricMan->do_start();
00047 }
00048 catch (...) {}
00049
00050 std::string type(psi.get<std::string>("transfer_plugin_type", "Shmem"));
00051
00052 if (receivers_ > 0)
00053 {
00054 if (senders_ * sends_each_sender_ % receivers_ != 0)
00055 {
00056 TLOG(TLVL_TRACE) << "Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl;
00057 while (senders_ * sends_each_sender_ % receivers_ != 0)
00058 {
00059 sends_each_sender_++;
00060 }
00061 receives_each_receiver_ = senders_ * sends_each_sender_ / receivers_;
00062 TLOG(TLVL_TRACE) << "sends_each_sender is now " << sends_each_sender_ << std::endl;
00063 psi.put_or_replace("sends_per_sender", sends_each_sender_);
00064 }
00065 }
00066
00067 std::string hostmap = "";
00068 if (psi.has_key("hostmap"))
00069 {
00070 hostmap = " host_map: @local::hostmap";
00071 }
00072
00073 std::stringstream ss;
00074 ss << psi.to_string() << std::endl;
00075
00076 ss << " sources: {";
00077 for (int ii = 0; ii < senders_; ++ii)
00078 {
00079 ss << "s" << ii << ": { transferPluginType: " << type << " source_rank: " << ii << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
00080 }
00081 ss << "}" << std::endl << " destinations: {";
00082 for (int jj = senders_; jj < senders_ + receivers_; ++jj)
00083 {
00084 ss << "d" << jj << ": { transferPluginType: " << type << " destination_rank: " << jj << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl;
00085 }
00086 ss << "}" << std::endl;
00087
00088 make_ParameterSet(ss.str(), ps_);
00089
00090
00091 TLOG(TLVL_DEBUG) << "Going to configure with ParameterSet: " << ps_.to_string() << std::endl;
00092 }
00093
00094 int artdaq::TransferTest::runTest()
00095 {
00096 TLOG(TLVL_INFO) << "runTest BEGIN: " << (my_rank < senders_ ? "sending" : "receiving");
00097 start_time_ = std::chrono::steady_clock::now();
00098 std::pair<size_t, double> result;
00099 if (my_rank >= senders_ + receivers_) return 0;
00100 if (my_rank < senders_)
00101 {
00102 std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_);
00103 for (int ii = 0; ii < sending_threads_; ++ii)
00104 {
00105 results_futures[ii] = std::async(std::bind(&TransferTest::do_sending, this, ii));
00106 }
00107 for (auto& future : results_futures)
00108 {
00109 if (future.valid())
00110 {
00111 auto thisresult = future.get();
00112 result.first += thisresult.first;
00113 result.second += thisresult.second;
00114 }
00115 }
00116 }
00117 else
00118 {
00119 result = do_receiving();
00120 }
00121 auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count();
00122 TLOG(TLVL_INFO) << (my_rank < senders_ ? "Sent " : "Received ") << result.first << " bytes in " << duration << " seconds ( " << formatBytes(result.first / duration) << "/s )." << std::endl;
00123 TLOG(TLVL_INFO) << "Rate of " << (my_rank < senders_ ? "sending" : "receiving") << ": " << formatBytes(result.first / result.second) << "/s." << std::endl;
00124 metricMan->do_stop();
00125 metricMan->shutdown();
00126 TLOG(11) << "runTest DONE";
00127 return 0;
00128 }
00129
00130 std::pair<size_t, double> artdaq::TransferTest::do_sending(int index)
00131 {
00132 TLOG(7) << "do_sending entered RawFragmentHeader::num_words()=" << artdaq::detail::RawFragmentHeader::num_words();
00133
00134 size_t totalSize = 0;
00135 double totalTime = 0;
00136 artdaq::DataSenderManager sender(ps_);
00137
00138 unsigned data_size_wrds = (fragment_size_ / sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words();
00139 artdaq::Fragment frag(data_size_wrds);
00140
00141 if (validate_mode_)
00142 {
00143 artdaq::RawDataType gen_seed = 0;
00144
00145 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
00146 for (size_t ii = 0; ii < frag.dataSize(); ++ii)
00147 {
00148 if (*(frag.dataBegin() + ii) != ii + 1)
00149 {
00150 TLOG(TLVL_ERROR) << "Data corruption detected! (" << (*(frag.dataBegin() + ii)) << " != " << (ii + 1) << ") Aborting!";
00151 exit(1);
00152 }
00153 }
00154 }
00155
00156 int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1;
00157 auto init_time_metric = 0.0;
00158 auto send_time_metric = 0.0;
00159 auto after_time_metric = 0.0;
00160 auto send_size_metric = 0.0;
00161 auto error_count = 0;
00162
00163 for (int ii = 0; ii < sends_each_sender_; ++ii)
00164 {
00165 auto loop_start = std::chrono::steady_clock::now();
00166 TLOG(7) << "sender rank " << my_rank << " #" << ii << " resized bytes=" << frag.sizeBytes();
00167 totalSize += frag.sizeBytes();
00168
00169
00170 frag.setSequenceID(ii * sending_threads_ + index);
00171 frag.setFragmentID(my_rank);
00172 frag.setSystemType(artdaq::Fragment::DataFragmentType);
00173
00174
00175
00176
00177
00178
00179 auto send_start = std::chrono::steady_clock::now();
00180 TLOG(TLVL_DEBUG) << "Sender " << my_rank << " sending fragment " << ii;
00181 auto stspair = sender.sendFragment(std::move(frag));
00182 auto after_send = std::chrono::steady_clock::now();
00183 TLOG(TLVL_TRACE) << "Sender " << my_rank << " sent fragment " << ii;
00184
00185
00186 if (stspair.second != artdaq::TransferInterface::CopyStatus::kSuccess)
00187 {
00188 error_count++;
00189 if (error_count >= error_count_max_)
00190 {
00191 TLOG(TLVL_ERROR) << "Too many errors sending fragments! Aborting... (sent=" << ii << "/" << sends_each_sender_ << ")";
00192 exit(sends_each_sender_ - ii);
00193 }
00194 }
00195
00196 frag = artdaq::Fragment(data_size_wrds);
00197 if (validate_mode_)
00198 {
00199 artdaq::RawDataType gen_seed = ii + 1;
00200
00201 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; });
00202 for (size_t jj = 0; jj < frag.dataSize(); ++jj)
00203 {
00204 if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1)
00205 {
00206 TLOG(TLVL_ERROR) << "Input Data corruption detected! (" << *(frag.dataBegin() + jj) << " != " << ii + jj + 2 << " at position " << ii << ") Aborting!";
00207 exit(1);
00208 }
00209 }
00210 }
00211 TLOG(9) << "sender rank " << my_rank << " frag replaced";
00212
00213 auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count();
00214 totalTime += total_send_time;
00215 send_time_metric += total_send_time;
00216 send_size_metric += data_size_wrds * sizeof(artdaq::RawDataType);
00217 after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count();
00218 init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count();
00219
00220 if (metricMan && ii % metric_send_interval == 0)
00221 {
00222 metricMan->sendMetric("send_init_time", init_time_metric, "seconds", 3, MetricMode::Accumulate);
00223 metricMan->sendMetric("total_send_time", send_time_metric, "seconds", 3, MetricMode::Accumulate);
00224 metricMan->sendMetric("after_send_time", after_time_metric, "seconds", 3, MetricMode::Accumulate);
00225 metricMan->sendMetric("send_rate", send_size_metric / send_time_metric, "B/s", 3, MetricMode::Average);
00226 init_time_metric = 0.0;
00227 send_time_metric = 0.0;
00228 after_time_metric = 0.0;
00229 send_size_metric = 0.0;
00230 }
00231 usleep(0);
00232 }
00233
00234 return std::make_pair(totalSize, totalTime);
00235 }
00236
00237 std::pair<size_t, double> artdaq::TransferTest::do_receiving()
00238 {
00239 TLOG(7) << "do_receiving entered";
00240
00241 artdaq::FragmentReceiverManager receiver(ps_);
00242 receiver.start_threads();
00243 int counter = receives_each_receiver_;
00244 size_t totalSize = 0;
00245 double totalTime = 0;
00246 bool first = true;
00247 int activeSenders = senders_ * sending_threads_;
00248 auto end_loop = std::chrono::steady_clock::now();
00249
00250 auto recv_size_metric = 0.0;
00251 auto recv_time_metric = 0.0;
00252 auto input_wait_metric = 0.0;
00253 auto init_wait_metric = 0.0;
00254 int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1;
00255
00256
00257 while ((activeSenders > 0 || counter > receives_each_receiver_ / 10) && counter > 0)
00258 {
00259 auto start_loop = std::chrono::steady_clock::now();
00260 TLOG(7) << "do_receiving: Counter is " << counter << ", calling recvFragment";
00261 int senderSlot = artdaq::TransferInterface::RECV_TIMEOUT;
00262 auto before_receive = std::chrono::steady_clock::now();
00263
00264 auto ignoreFragPtr = receiver.recvFragment(senderSlot);
00265 auto after_receive = std::chrono::steady_clock::now();
00266 init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count();
00267 size_t thisSize = 0;
00268 if (senderSlot >= artdaq::TransferInterface::RECV_SUCCESS && ignoreFragPtr)
00269 {
00270 if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
00271 {
00272 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received EndOfData Fragment from Sender " << senderSlot;
00273 activeSenders--;
00274 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
00275 }
00276 else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType)
00277 {
00278 TLOG(TLVL_WARNING) << "Receiver " << my_rank << " received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) << " (Unexpected!)";
00279 }
00280 else
00281 {
00282 if (first)
00283 {
00284 start_time_ = std::chrono::steady_clock::now();
00285 first = false;
00286 }
00287 counter--;
00288 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received fragment " << receives_each_receiver_ - counter
00289 << " with seqID " << ignoreFragPtr->sequenceID() << " from Sender " << senderSlot << " (Expecting " << counter << " more)";
00290 thisSize = ignoreFragPtr->size() * sizeof(artdaq::RawDataType);
00291 totalSize += thisSize;
00292 if (validate_mode_)
00293 {
00294 for (size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii)
00295 {
00296 if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1)
00297 {
00298 TLOG(TLVL_ERROR) << "Output Data corruption detected! (" << *(ignoreFragPtr->dataBegin() + ii) << " != " << (ignoreFragPtr->sequenceID() + ii + 1) << " at position " << ii << ") Aborting!";
00299 exit(1);
00300 }
00301 }
00302 }
00303 }
00304 input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count();
00305 }
00306 else if (senderSlot == artdaq::TransferInterface::DATA_END)
00307 {
00308 TLOG(TLVL_ERROR) << "Receiver " << my_rank << " detected fatal protocol error! Reducing active sender count by one!" << std::endl;
00309 activeSenders--;
00310 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders;
00311 }
00312 TLOG(7) << "do_receiving: Recv Loop end, counter is " << counter;
00313
00314
00315 auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count();
00316 recv_time_metric += total_recv_time;
00317 totalTime += total_recv_time;
00318 recv_size_metric += thisSize;
00319
00320 if (metricMan && counter % metric_send_interval == 0)
00321 {
00322 metricMan->sendMetric("input_wait", input_wait_metric, "seconds", 3, MetricMode::Accumulate);
00323 metricMan->sendMetric("recv_init_time", init_wait_metric, "seconds", 3, MetricMode::Accumulate);
00324 metricMan->sendMetric("total_recv_time", recv_time_metric, "seconds", 3, MetricMode::Accumulate);
00325 metricMan->sendMetric("recv_rate", recv_size_metric / recv_time_metric, "B/s", 3, MetricMode::Average);
00326
00327 input_wait_metric = 0.0;
00328 init_wait_metric = 0.0;
00329 recv_time_metric = 0.0;
00330 recv_size_metric = 0.0;
00331 }
00332 end_loop = std::chrono::steady_clock::now();
00333 }
00334
00335 if (counter != 0)
00336 {
00337 TLOG(TLVL_ERROR) << "Did not receive all expected Fragments! Missing " << counter << " Fragments!";
00338 exit(counter);
00339 }
00340
00341 return std::make_pair(totalSize, totalTime);
00342 }