$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_01
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #include <chrono> 00002 00003 #define TRACE_NAME (app_name + "_DataReceiverManager").c_str() 00004 #include "artdaq/DAQdata/Globals.hh" 00005 #include "artdaq/DAQrate/DataReceiverManager.hh" 00006 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh" 00007 #include "artdaq/TransferPlugins/detail/HostMap.hh" 00008 #include "cetlib_except/exception.h" 00009 #include <iomanip> 00010 00011 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm) 00012 : stop_requested_(false) 00013 , stop_requested_time_(0) 00014 , source_threads_() 00015 , source_plugins_() 00016 , source_metric_data_() 00017 , source_metric_send_time_() 00018 , enabled_sources_() 00019 , running_sources_() 00020 , recv_frag_count_() 00021 , recv_frag_size_() 00022 , recv_seq_count_() 00023 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000)) 00024 , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500)) 00025 , shm_manager_(shm) 00026 , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false)) 00027 , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1)) 00028 { 00029 TLOG(TLVL_DEBUG) << "Constructor"; 00030 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>()); 00031 auto enabled_srcs_empty = enabled_srcs.size() == 0; 00032 00033 if (non_reliable_mode_enabled_) 00034 { 00035 TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_ 00036 << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!"; 00037 } 00038 00039 if (enabled_srcs_empty) 00040 { 00041 TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled."; 00042 } 00043 else 00044 { 00045 for (auto& s : enabled_srcs) 00046 { 00047 enabled_sources_[s] = true; 00048 } 00049 } 00050 00051 hostMap_t host_map = MakeHostMap(pset); 00052 size_t tcp_receive_buffer_size = pset.get<size_t>("tcp_receive_buffer_size", 0); 00053 size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0); 00054 00055 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet()); 00056 for (auto& s : srcs.get_pset_names()) 00057 { 00058 auto src_pset = srcs.get<fhicl::ParameterSet>(s); 00059 host_map = MakeHostMap(src_pset, host_map); 00060 } 00061 auto host_map_pset = MakeHostMapPset(host_map); 00062 fhicl::ParameterSet srcs_mod; 00063 for (auto& s : srcs.get_pset_names()) 00064 { 00065 auto src_pset = srcs.get<fhicl::ParameterSet>(s); 00066 src_pset.erase("host_map"); 00067 src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset); 00068 00069 if (tcp_receive_buffer_size != 0 && !src_pset.has_key("tcp_receive_buffer_size")) 00070 { 00071 src_pset.put<size_t>("tcp_receive_buffer_size", tcp_receive_buffer_size); 00072 } 00073 if (max_fragment_size_words != 0 && !src_pset.has_key("max_fragment_size_words")) 00074 { 00075 src_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words); 00076 } 00077 00078 srcs_mod.put<fhicl::ParameterSet>(s, src_pset); 00079 } 00080 00081 for (auto& s : srcs_mod.get_pset_names()) 00082 { 00083 try 00084 { 00085 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s, 00086 TransferInterface::Role::kReceive)); 00087 auto source_rank = transfer->source_rank(); 00088 if (enabled_srcs_empty) enabled_sources_[source_rank] = true; 00089 else if (!enabled_sources_.count(source_rank)) enabled_sources_[source_rank] = false; 00090 running_sources_[source_rank] = false; 00091 source_plugins_[source_rank] = std::move(transfer); 00092 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now(); 00093 source_metric_data_[source_rank] = source_metric_data(); 00094 } 00095 catch (cet::exception ex) 00096 { 00097 TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what(); 00098 } 00099 catch (std::exception ex) 00100 { 00101 TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what(); 00102 } 00103 catch (...) 00104 { 00105 TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << "."; 00106 } 00107 } 00108 if (srcs.get_pset_names().size() == 0) 00109 { 00110 TLOG(TLVL_ERROR) << "No sources configured!"; 00111 } 00112 } 00113 00114 artdaq::DataReceiverManager::~DataReceiverManager() 00115 { 00116 TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN"; 00117 stop_threads(); 00118 shm_manager_.reset(); 00119 TLOG(TLVL_TRACE) << "Destructor END"; 00120 } 00121 00122 00123 void artdaq::DataReceiverManager::start_threads() 00124 { 00125 stop_requested_ = false; 00126 if (shm_manager_) shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal); 00127 for (auto& source : source_plugins_) 00128 { 00129 auto& rank = source.first; 00130 if (enabled_sources_.count(rank) && enabled_sources_[rank].load()) 00131 { 00132 source_metric_data_[rank] = source_metric_data(); 00133 source_metric_send_time_[rank] = std::chrono::steady_clock::now(); 00134 00135 recv_frag_count_.setSlot(rank, 0); 00136 recv_frag_size_.setSlot(rank,0); 00137 recv_seq_count_.setSlot(rank,0); 00138 00139 running_sources_[rank] = true; 00140 boost::thread::attributes attrs; 00141 attrs.set_stack_size(4096 * 2000); // 2000 KB 00142 try { 00143 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank)); 00144 } 00145 catch (const boost::exception& e) 00146 { 00147 TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno; 00148 std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00149 exit(5); 00150 } 00151 } 00152 } 00153 } 00154 00155 void artdaq::DataReceiverManager::stop_threads() 00156 { 00157 TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount(); 00158 00159 stop_requested_time_ = TimeUtils::gettimeofday_us(); 00160 stop_requested_ = true; 00161 00162 auto initial_count = running_sources().size(); 00163 TLOG(TLVL_TRACE) << "stop_threads: Waiting for " << initial_count << " running receiver threads to stop"; 00164 auto wait_start = std::chrono::steady_clock::now(); 00165 auto last_report = std::chrono::steady_clock::now(); 00166 while (running_sources().size() && TimeUtils::GetElapsedTime(wait_start) < 60.0) 00167 { 00168 usleep(10000); 00169 if (TimeUtils::GetElapsedTime(last_report) > 1.0) 00170 { 00171 TLOG(TLVL_DEBUG) << "stop_threads: Waited " << TimeUtils::GetElapsedTime(wait_start) << " s for " << initial_count 00172 << " receiver threads to end (" << running_sources().size() << " remain)"; 00173 last_report = std::chrono::steady_clock::now(); 00174 } 00175 } 00176 if (running_sources().size()) { 00177 TLOG(TLVL_WARNING) << "stop_threads: Timeout expired while waiting for all receiver threads to end. There are " 00178 << running_sources().size() << " threads remaining."; 00179 } 00180 00181 TLOG(TLVL_TRACE) << "stop_threads: Joining " << source_threads_.size() << " receiver threads"; 00182 for (auto it = source_threads_.begin(); it != source_threads_.end(); ++it) 00183 { 00184 TLOG(TLVL_TRACE) << "stop_threads: Joining thread for source_rank " << (*it).first; 00185 if ((*it).second.joinable()) 00186 (*it).second.join(); 00187 else 00188 TLOG(TLVL_ERROR) << "stop_threads: Thread for source rank " << (*it).first << " is not joinable!"; 00189 } 00190 source_threads_.clear(); // To prevent error messages from shutdown-after-stop 00191 00192 TLOG(TLVL_TRACE) << "stop_threads: END"; 00193 } 00194 00195 std::set<int> artdaq::DataReceiverManager::enabled_sources() const 00196 { 00197 std::set<int> output; 00198 for (auto& src : enabled_sources_) 00199 { 00200 if (src.second) output.insert(src.first); 00201 } 00202 return output; 00203 } 00204 00205 std::set<int> artdaq::DataReceiverManager::running_sources() const 00206 { 00207 std::set<int> output; 00208 for (auto& src : running_sources_) 00209 { 00210 if (src.second) output.insert(src.first); 00211 } 00212 return output; 00213 } 00214 00215 void artdaq::DataReceiverManager::runReceiver_(int source_rank) 00216 { 00217 std::chrono::steady_clock::time_point start_time, after_header, before_body,after_body, end_time = std::chrono::steady_clock::now(); 00218 int ret; 00219 detail::RawFragmentHeader header; 00220 size_t endOfDataCount = -1; 00221 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100; 00222 if (sleep_time < 5000) sleep_time = 5000; 00223 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time); 00224 00225 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank)) 00226 { 00227 TLOG(16) << "runReceiver_: Begin loop"; 00228 std::this_thread::yield(); 00229 00230 // Don't stop receiving until we haven't received anything for 1 second 00231 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning()) 00232 { 00233 TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop"; 00234 break; 00235 } 00236 00237 start_time = std::chrono::steady_clock::now(); 00238 00239 TLOG(16) << "runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_; 00240 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_); 00241 TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")"; 00242 if (ret != source_rank) 00243 { 00244 if (ret >= 0) { 00245 TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!"; 00246 } 00247 else if (ret == TransferInterface::DATA_END) 00248 { 00249 TLOG(TLVL_ERROR) << "Transfer Plugin returned DATA_END, ending receive loop!"; 00250 break; 00251 } 00252 if (*running_sources().begin() == source_rank) // Only do this for the first sender in the running_sources_ map 00253 { 00254 TLOG(6) << "Calling SMEM::CheckPendingBuffers from DRM receiver thread for " << source_rank << " to make sure that things aren't stuck"; 00255 shm_manager_->CheckPendingBuffers(); 00256 } 00257 00258 usleep(sleep_time); 00259 continue; // Receive timeout or other oddness 00260 } 00261 00262 after_header = std::chrono::steady_clock::now(); 00263 00264 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) { 00265 TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp; 00266 RawDataType* loc = nullptr; 00267 size_t retries = 0; 00268 while (loc == nullptr)//&& TimeUtils::GetElapsedTimeMicroseconds(after_header)) < receive_timeout_) 00269 { 00270 loc = shm_manager_->WriteFragmentHeader(header); 00271 00272 // Break here and outside of the loop to go to the cleanup steps at the end of runReceiver_ 00273 if (loc == nullptr && stop_requested_) break; 00274 00275 if (loc == nullptr) usleep(sleep_time); 00276 retries++; 00277 if (non_reliable_mode_enabled_ && retries > max_retries) 00278 { 00279 loc = shm_manager_->WriteFragmentHeader(header, true); 00280 } 00281 } 00282 // Break here to go to cleanup at the end of runReceiver_ 00283 if (loc == nullptr && stop_requested_) break; 00284 if (loc == nullptr) 00285 { 00286 // Could not enqueue event! 00287 TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << header.sequence_id; 00288 continue; 00289 } 00290 before_body = std::chrono::steady_clock::now(); 00291 00292 auto hdrLoc = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(loc - artdaq::detail::RawFragmentHeader::num_words()); 00293 TLOG(16) << "runReceiver_: Calling receiveFragmentData from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp; 00294 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words()); 00295 TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")"; 00296 00297 if (ret != ret2) { 00298 TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")"; 00299 TLOG(TLVL_ERROR) << "Error receiving data from rank " << source_rank << ", data has been lost! Event " << header.sequence_id << " will most likely be Incomplete!"; 00300 00301 // Mark the Fragment as invalid 00302 /* \todo Make a RawFragmentHeader field that marks it as invalid while maintaining previous type! */ 00303 hdrLoc->type = Fragment::ErrorFragmentType; 00304 00305 shm_manager_->DoneWritingFragment(header); 00306 //throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")"; 00307 continue; 00308 } 00309 00310 shm_manager_->DoneWritingFragment(header); 00311 TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << header.sequence_id << " from rank " << source_rank; 00312 00313 recv_frag_count_.incSlot(source_rank); 00314 recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType)); 00315 recv_seq_count_.setSlot(source_rank, header.sequence_id); 00316 if (endOfDataCount != static_cast<size_t>(-1)) 00317 { 00318 TLOG(TLVL_DEBUG) << "Received fragment " << header.sequence_id << " from rank " << source_rank 00319 << " (" << recv_frag_count_.slotCount(source_rank) << "/" << endOfDataCount << ")"; 00320 } 00321 00322 after_body = std::chrono::steady_clock::now(); 00323 00324 source_metric_data_[source_rank].hdr_delta_t += TimeUtils::GetElapsedTime(start_time, after_header); 00325 source_metric_data_[source_rank].store_delta_t += TimeUtils::GetElapsedTime(after_header, before_body); 00326 source_metric_data_[source_rank].data_delta_t += TimeUtils::GetElapsedTime(before_body, after_body); 00327 source_metric_data_[source_rank].delta_t += TimeUtils::GetElapsedTime(start_time, after_body); 00328 source_metric_data_[source_rank].dead_t += TimeUtils::GetElapsedTime(end_time, start_time); 00329 00330 source_metric_data_[source_rank].data_size += header.word_count * sizeof(RawDataType); 00331 source_metric_data_[source_rank].header_size += header.num_words() * sizeof(RawDataType); 00332 source_metric_data_[source_rank].data_point_count++; 00333 00334 if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1) 00335 {//&& recv_frag_count_.slotCount(source_rank) % 100 == 0) { 00336 TLOG(6) << "runReceiver_: Sending receive stats for rank " << source_rank; 00337 metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].delta_t, "s", 5, MetricMode::Accumulate); 00338 metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].data_size), "B", 5, MetricMode::Accumulate); 00339 metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_size / source_metric_data_[source_rank].delta_t, "B/s", 5, MetricMode::Average); 00340 00341 metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].hdr_delta_t, "s", 5, MetricMode::Accumulate); 00342 metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].header_size), "B", 5, MetricMode::Accumulate); 00343 metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].header_size / source_metric_data_[source_rank].hdr_delta_t, "B/s", 5, MetricMode::Average); 00344 00345 auto payloadSize = source_metric_data_[source_rank].data_size - source_metric_data_[source_rank].header_size; 00346 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_delta_t, "s", 5, MetricMode::Accumulate); 00347 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(payloadSize), "B", 5, MetricMode::Accumulate); 00348 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / source_metric_data_[source_rank].data_delta_t, "B/s", 5, MetricMode::Average); 00349 00350 metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint); 00351 00352 metricMan->sendMetric("Total Shared Memory Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].store_delta_t, "s", 3, MetricMode::Accumulate); 00353 metricMan->sendMetric("Avg Shared Memory Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].store_delta_t / source_metric_data_[source_rank].data_point_count, "s", 3, MetricMode::Average); 00354 metricMan->sendMetric("Avg Fragment Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].dead_t / source_metric_data_[source_rank].data_point_count, "s", 3, MetricMode::Average); 00355 00356 TLOG(6) << "runReceiver_: Done sending receive stats for rank " << source_rank; 00357 00358 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now(); 00359 source_metric_data_[source_rank] = source_metric_data(); 00360 } 00361 00362 end_time = std::chrono::steady_clock::now(); 00363 } 00364 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType) 00365 { 00366 TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << "."; 00367 00368 FragmentPtr frag(new Fragment(header.word_count - header.num_words())); 00369 memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType)); 00370 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words()); 00371 if (ret3 != source_rank) 00372 { 00373 TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")"; 00374 throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")"; 00375 } 00376 00377 switch (header.type) 00378 { 00379 case Fragment::EndOfDataFragmentType: 00380 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun); 00381 if(endOfDataCount == static_cast<size_t>(-1) ) endOfDataCount = *(frag->dataBegin()); 00382 else endOfDataCount += *(frag->dataBegin()); 00383 TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << endOfDataCount << " fragments are expected from rank " << source_rank 00384 << " (recvd " << recv_frag_count_.slotCount(source_rank) << ")."; 00385 break; 00386 case Fragment::InitFragmentType: 00387 TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << "."; 00388 shm_manager_->setRequestMode(detail::RequestMessageMode::Normal); 00389 shm_manager_->SetInitFragment(std::move(frag)); 00390 break; 00391 case Fragment::EndOfRunFragmentType: 00392 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun); 00393 //shm_manager_->endRun(); 00394 break; 00395 case Fragment::EndOfSubrunFragmentType: 00396 //shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun); 00397 TLOG(TLVL_DEBUG) << "Received EndOfSubrun Fragment from rank " << source_rank 00398 << " with sequence_id " << header.sequence_id << "."; 00399 if (header.sequence_id != Fragment::InvalidSequenceID) shm_manager_->rolloverSubrun(header.sequence_id, header.timestamp); 00400 else shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank), header.timestamp); 00401 break; 00402 case Fragment::ShutdownFragmentType: 00403 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun); 00404 break; 00405 } 00406 } 00407 } 00408 00409 source_plugins_[source_rank]->flush_buffers(); 00410 00411 TLOG(TLVL_DEBUG) << "runReceiver_ " << source_rank << " receive loop exited"; 00412 running_sources_[source_rank] = false; 00413 }