$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_00
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #include "artdaq/DAQrate/TransferTest.hh" 00002 00003 #include "artdaq-core/Data/Fragment.hh" 00004 #include "artdaq/DAQrate/FragmentReceiverManager.hh" 00005 #include "artdaq/DAQrate/DataSenderManager.hh" 00006 00007 #define TRACE_NAME "TransferTest" 00008 #include "artdaq/DAQdata/Globals.hh" 00009 00010 #include "fhiclcpp/make_ParameterSet.h" 00011 00012 #include <future> 00013 00014 artdaq::TransferTest::TransferTest(fhicl::ParameterSet psi) 00015 : senders_(psi.get<int>("num_senders")) 00016 , receivers_(psi.get<int>("num_receivers")) 00017 , sending_threads_(psi.get<int>("sending_threads", 1)) 00018 , sends_each_sender_(psi.get<int>("sends_per_sender")) 00019 , receives_each_receiver_(0) 00020 , buffer_count_(psi.get<int>("buffer_count", 10)) 00021 , error_count_max_(psi.get<int>("max_errors_before_abort", 3)) 00022 , fragment_size_(psi.get<size_t>("fragment_size", 0x100000)) 00023 , ps_() 00024 , validate_mode_(psi.get<bool>("validate_data_mode", false)) 00025 , partition_number_(psi.get<int>("partition_number", rand() % 0x7F)) 00026 { 00027 TLOG(10) << "CONSTRUCTOR"; 00028 00029 if (fragment_size_ < artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType)) 00030 { 00031 fragment_size_ = artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType); 00032 } 00033 00034 fhicl::ParameterSet metric_pset; 00035 00036 try 00037 { 00038 metric_pset = psi.get<fhicl::ParameterSet>("metrics"); 00039 } 00040 catch (...) {} // OK if there's no metrics table defined in the FHiCL 00041 00042 try 00043 { 00044 std::string name = "TransferTest" + std::to_string(my_rank); 00045 metricMan->initialize(metric_pset, name); 00046 metricMan->do_start(); 00047 } 00048 catch (...) {} 00049 00050 std::string type(psi.get<std::string>("transfer_plugin_type", "Shmem")); 00051 00052 bool broadcast_mode = psi.get<bool>("broadcast_sends", false); 00053 if (broadcast_mode) 00054 { 00055 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_; 00056 } 00057 else 00058 { 00059 if (receivers_ > 0) 00060 { 00061 if (senders_ * sending_threads_ * sends_each_sender_ % receivers_ != 0) 00062 { 00063 TLOG(TLVL_TRACE) << "Adding sends so that sends_each_sender * num_sending_ranks is a multiple of num_receiving_ranks" << std::endl; 00064 while (senders_ * sends_each_sender_ % receivers_ != 0) 00065 { 00066 sends_each_sender_++; 00067 } 00068 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_; 00069 TLOG(TLVL_TRACE) << "sends_each_sender is now " << sends_each_sender_ << std::endl; 00070 psi.put_or_replace("sends_per_sender", sends_each_sender_); 00071 } 00072 else 00073 { 00074 receives_each_receiver_ = senders_ * sending_threads_ * sends_each_sender_ / receivers_; 00075 } 00076 } 00077 } 00078 00079 std::string hostmap = ""; 00080 if (psi.has_key("hostmap")) 00081 { 00082 hostmap = " host_map: @local::hostmap"; 00083 } 00084 00085 std::stringstream ss; 00086 ss << psi.to_string() << std::endl; 00087 00088 ss << " sources: {"; 00089 for (int ii = 0; ii < senders_; ++ii) 00090 { 00091 ss << "s" << ii << ": { transferPluginType: " << type << " source_rank: " << ii << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl; 00092 } 00093 ss << "}" << std::endl << " destinations: {"; 00094 for (int jj = senders_; jj < senders_ + receivers_; ++jj) 00095 { 00096 ss << "d" << jj << ": { transferPluginType: " << type << " destination_rank: " << jj << " max_fragment_size_words : " << fragment_size_ << " buffer_count : " << buffer_count_ << " partition_number : " << partition_number_ << hostmap << " }" << std::endl; 00097 } 00098 ss << "}" << std::endl; 00099 00100 make_ParameterSet(ss.str(), ps_); 00101 00102 00103 TLOG(TLVL_DEBUG) << "Going to configure with ParameterSet: " << ps_.to_string() << std::endl; 00104 } 00105 00106 int artdaq::TransferTest::runTest() 00107 { 00108 TLOG(TLVL_INFO) << "runTest BEGIN: " << (my_rank < senders_ ? "sending" : "receiving"); 00109 start_time_ = std::chrono::steady_clock::now(); 00110 std::pair<size_t, double> result; 00111 if (my_rank >= senders_ + receivers_) return 0; 00112 if (my_rank < senders_) 00113 { 00114 std::vector<std::future<std::pair<size_t, double>>> results_futures(sending_threads_); 00115 for (int ii = 0; ii < sending_threads_; ++ii) 00116 { 00117 results_futures[ii] = std::async(std::bind(&TransferTest::do_sending, this, ii)); 00118 } 00119 for (auto& future : results_futures) 00120 { 00121 if (future.valid()) 00122 { 00123 auto thisresult = future.get(); 00124 result.first += thisresult.first; 00125 result.second += thisresult.second; 00126 } 00127 } 00128 } 00129 else 00130 { 00131 result = do_receiving(); 00132 } 00133 auto duration = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - start_time_).count(); 00134 TLOG(TLVL_INFO) << (my_rank < senders_ ? "Sent " : "Received ") << result.first << " bytes in " << duration << " seconds ( " << formatBytes(result.first / duration) << "/s )." << std::endl; 00135 TLOG(TLVL_INFO) << "Rate of " << (my_rank < senders_ ? "sending" : "receiving") << ": " << formatBytes(result.first / result.second) << "/s." << std::endl; 00136 metricMan->do_stop(); 00137 metricMan->shutdown(); 00138 TLOG(11) << "runTest DONE"; 00139 return 0; 00140 } 00141 00142 std::pair<size_t, double> artdaq::TransferTest::do_sending(int index) 00143 { 00144 TLOG(7) << "do_sending entered RawFragmentHeader::num_words()=" << artdaq::detail::RawFragmentHeader::num_words(); 00145 00146 size_t totalSize = 0; 00147 double totalTime = 0; 00148 artdaq::DataSenderManager sender(ps_); 00149 00150 unsigned data_size_wrds = (fragment_size_ / sizeof(artdaq::RawDataType)) - artdaq::detail::RawFragmentHeader::num_words(); 00151 artdaq::Fragment frag(data_size_wrds); 00152 00153 if (validate_mode_) 00154 { 00155 artdaq::RawDataType gen_seed = 0; 00156 00157 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; }); 00158 for (size_t ii = 0; ii < frag.dataSize(); ++ii) 00159 { 00160 if (*(frag.dataBegin() + ii) != ii + 1) 00161 { 00162 TLOG(TLVL_ERROR) << "Data corruption detected! (" << (*(frag.dataBegin() + ii)) << " != " << (ii + 1) << ") Aborting!"; 00163 exit(1); 00164 } 00165 } 00166 } 00167 00168 int metric_send_interval = sends_each_sender_ / 1000 > 1 ? sends_each_sender_ / 1000 : 1; 00169 auto init_time_metric = 0.0; 00170 auto send_time_metric = 0.0; 00171 auto after_time_metric = 0.0; 00172 auto send_size_metric = 0.0; 00173 auto error_count = 0; 00174 00175 for (int ii = 0; ii < sends_each_sender_; ++ii) 00176 { 00177 auto loop_start = std::chrono::steady_clock::now(); 00178 TLOG(7) << "sender rank " << my_rank << " #" << ii << " resized bytes=" << frag.sizeBytes(); 00179 totalSize += frag.sizeBytes(); 00180 00181 //unsigned sndDatSz = data_size_wrds; 00182 frag.setSequenceID(ii * sending_threads_ + index); 00183 frag.setFragmentID(my_rank); 00184 frag.setSystemType(artdaq::Fragment::DataFragmentType); 00185 /* 00186 artdaq::Fragment::iterator it = frag.dataBegin(); 00187 *it = my_rank; 00188 *++it = ii; 00189 *++it = sndDatSz;*/ 00190 00191 auto send_start = std::chrono::steady_clock::now(); 00192 TLOG(TLVL_DEBUG) << "Sender " << my_rank << " sending fragment " << ii; 00193 auto stspair = sender.sendFragment(std::move(frag)); 00194 auto after_send = std::chrono::steady_clock::now(); 00195 TLOG(TLVL_TRACE) << "Sender " << my_rank << " sent fragment " << ii; 00196 //usleep( (data_size_wrds*sizeof(artdaq::RawDataType))/233 ); 00197 00198 if (stspair.second != artdaq::TransferInterface::CopyStatus::kSuccess) 00199 { 00200 error_count++; 00201 if (error_count >= error_count_max_) 00202 { 00203 TLOG(TLVL_ERROR) << "Too many errors sending fragments! Aborting... (sent=" << ii << "/" << sends_each_sender_ << ")"; 00204 exit(sends_each_sender_ - ii); 00205 } 00206 } 00207 00208 frag = artdaq::Fragment(data_size_wrds); // replace/renew 00209 if (validate_mode_) 00210 { 00211 artdaq::RawDataType gen_seed = ii + 1; 00212 00213 std::generate_n(frag.dataBegin(), data_size_wrds, [&]() { return ++gen_seed; }); 00214 for (size_t jj = 0; jj < frag.dataSize(); ++jj) 00215 { 00216 if (*(frag.dataBegin() + jj) != (ii + 1) + jj + 1) 00217 { 00218 TLOG(TLVL_ERROR) << "Input Data corruption detected! (" << *(frag.dataBegin() + jj) << " != " << ii + jj + 2 << " at position " << ii << ") Aborting!"; 00219 exit(1); 00220 } 00221 } 00222 } 00223 TLOG(9) << "sender rank " << my_rank << " frag replaced"; 00224 00225 auto total_send_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_send - send_start).count(); 00226 totalTime += total_send_time; 00227 send_time_metric += total_send_time; 00228 send_size_metric += data_size_wrds * sizeof(artdaq::RawDataType); 00229 after_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(std::chrono::steady_clock::now() - after_send).count(); 00230 init_time_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(send_start - loop_start).count(); 00231 00232 if (metricMan && ii % metric_send_interval == 0) 00233 { 00234 metricMan->sendMetric("send_init_time", init_time_metric, "seconds", 3, MetricMode::Accumulate); 00235 metricMan->sendMetric("total_send_time", send_time_metric, "seconds", 3, MetricMode::Accumulate); 00236 metricMan->sendMetric("after_send_time", after_time_metric, "seconds", 3, MetricMode::Accumulate); 00237 metricMan->sendMetric("send_rate", send_size_metric / send_time_metric, "B/s", 3, MetricMode::Average); 00238 init_time_metric = 0.0; 00239 send_time_metric = 0.0; 00240 after_time_metric = 0.0; 00241 send_size_metric = 0.0; 00242 } 00243 usleep(0); // Yield execution 00244 } 00245 00246 return std::make_pair(totalSize, totalTime); 00247 } // do_sending 00248 00249 std::pair<size_t, double> artdaq::TransferTest::do_receiving() 00250 { 00251 TLOG(7) << "do_receiving entered"; 00252 00253 artdaq::FragmentReceiverManager receiver(ps_); 00254 receiver.start_threads(); 00255 int counter = receives_each_receiver_; 00256 size_t totalSize = 0; 00257 double totalTime = 0; 00258 bool first = true; 00259 bool nonblocking_mode = ps_.get<bool>("nonblocking_sends", false); 00260 std::atomic<int> activeSenders(senders_ * sending_threads_); 00261 auto end_loop = std::chrono::steady_clock::now(); 00262 00263 auto recv_size_metric = 0.0; 00264 auto recv_time_metric = 0.0; 00265 auto input_wait_metric = 0.0; 00266 auto init_wait_metric = 0.0; 00267 int metric_send_interval = receives_each_receiver_ / 1000 > 1 ? receives_each_receiver_ : 1; 00268 00269 // Only abort when there are no senders if were's > 90% done 00270 while ((activeSenders > 0 || (counter > receives_each_receiver_ / 10 && !nonblocking_mode)) && counter > 0) 00271 { 00272 auto start_loop = std::chrono::steady_clock::now(); 00273 TLOG(7) << "do_receiving: Counter is " << counter << ", calling recvFragment (activeSenders=" << activeSenders << ")"; 00274 int senderSlot = artdaq::TransferInterface::RECV_TIMEOUT; 00275 auto before_receive = std::chrono::steady_clock::now(); 00276 00277 auto ignoreFragPtr = receiver.recvFragment(senderSlot); 00278 auto after_receive = std::chrono::steady_clock::now(); 00279 init_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(before_receive - start_loop).count(); 00280 size_t thisSize = 0; 00281 if (senderSlot >= artdaq::TransferInterface::RECV_SUCCESS && ignoreFragPtr) 00282 { 00283 if (ignoreFragPtr->type() == artdaq::Fragment::EndOfDataFragmentType) 00284 { 00285 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received EndOfData Fragment from Sender " << senderSlot; 00286 activeSenders--; 00287 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders; 00288 } 00289 else if (ignoreFragPtr->type() != artdaq::Fragment::DataFragmentType) 00290 { 00291 TLOG(TLVL_WARNING) << "Receiver " << my_rank << " received Fragment with System type " << artdaq::detail::RawFragmentHeader::SystemTypeToString(ignoreFragPtr->type()) << " (Unexpected!)"; 00292 } 00293 else 00294 { 00295 if (first) 00296 { 00297 start_time_ = std::chrono::steady_clock::now(); 00298 first = false; 00299 } 00300 counter--; 00301 TLOG(TLVL_INFO) << "Receiver " << my_rank << " received fragment " << receives_each_receiver_ - counter 00302 << " with seqID " << ignoreFragPtr->sequenceID() << " from Sender " << senderSlot << " (Expecting " << counter << " more)"; 00303 thisSize = ignoreFragPtr->size() * sizeof(artdaq::RawDataType); 00304 totalSize += thisSize; 00305 if (validate_mode_) 00306 { 00307 for (size_t ii = 0; ii < ignoreFragPtr->dataSize(); ++ii) 00308 { 00309 if (*(ignoreFragPtr->dataBegin() + ii) != ignoreFragPtr->sequenceID() + ii + 1) 00310 { 00311 TLOG(TLVL_ERROR) << "Output Data corruption detected! (" << *(ignoreFragPtr->dataBegin() + ii) << " != " << (ignoreFragPtr->sequenceID() + ii + 1) << " at position " << ii << ") Aborting!"; 00312 exit(1); 00313 } 00314 } 00315 } 00316 } 00317 input_wait_metric += std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - end_loop).count(); 00318 } 00319 else if (senderSlot == artdaq::TransferInterface::DATA_END) 00320 { 00321 TLOG(TLVL_ERROR) << "Receiver " << my_rank << " detected fatal protocol error! Reducing active sender count by one!" << std::endl; 00322 activeSenders--; 00323 TLOG(TLVL_DEBUG) << "Active Senders is now " << activeSenders; 00324 } 00325 TLOG(7) << "do_receiving: Recv Loop end, counter is " << counter; 00326 00327 00328 auto total_recv_time = std::chrono::duration_cast<artdaq::TimeUtils::seconds>(after_receive - before_receive).count(); 00329 recv_time_metric += total_recv_time; 00330 totalTime += total_recv_time; 00331 recv_size_metric += thisSize; 00332 00333 if (metricMan && counter % metric_send_interval == 0) 00334 { 00335 metricMan->sendMetric("input_wait", input_wait_metric, "seconds", 3, MetricMode::Accumulate); 00336 metricMan->sendMetric("recv_init_time", init_wait_metric, "seconds", 3, MetricMode::Accumulate); 00337 metricMan->sendMetric("total_recv_time", recv_time_metric, "seconds", 3, MetricMode::Accumulate); 00338 metricMan->sendMetric("recv_rate", recv_size_metric / recv_time_metric, "B/s", 3, MetricMode::Average); 00339 00340 input_wait_metric = 0.0; 00341 init_wait_metric = 0.0; 00342 recv_time_metric = 0.0; 00343 recv_size_metric = 0.0; 00344 } 00345 end_loop = std::chrono::steady_clock::now(); 00346 } 00347 00348 if (counter != 0 && !nonblocking_mode) 00349 { 00350 TLOG(TLVL_ERROR) << "Did not receive all expected Fragments! Missing " << counter << " Fragments!"; 00351 exit(counter); 00352 } 00353 00354 return std::make_pair(totalSize, totalTime); 00355 }