$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_00
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #define TRACE_NAME (app_name + "_FragmentReceiverManager").c_str() 00002 #include "artdaq/DAQdata/Globals.hh" 00003 00004 #include <chrono> 00005 00006 #include "artdaq/DAQrate/FragmentReceiverManager.hh" 00007 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh" 00008 #include "cetlib_except/exception.h" 00009 00010 artdaq::FragmentReceiverManager::FragmentReceiverManager(const fhicl::ParameterSet& pset) 00011 : stop_requested_(false) 00012 , source_threads_() 00013 , source_plugins_() 00014 , source_metric_data_() 00015 , source_metric_send_time_() 00016 , enabled_sources_() 00017 , fragment_store_() 00018 , recv_frag_count_() 00019 , recv_frag_size_() 00020 , recv_seq_count_() 00021 , suppress_noisy_senders_(pset.get<bool>("auto_suppression_enabled", true)) 00022 , suppression_threshold_(pset.get<size_t>("max_receive_difference", 50)) 00023 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000)) 00024 , last_source_(-1) 00025 { 00026 TLOG(TLVL_DEBUG) << "Constructor"; 00027 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>()); 00028 auto enabled_srcs_empty = enabled_srcs.size() == 0; 00029 if (enabled_srcs_empty) 00030 { 00031 TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled."; 00032 } 00033 else 00034 { 00035 for (auto& s : enabled_srcs) 00036 { 00037 enabled_sources_[s] = true; 00038 } 00039 } 00040 00041 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet()); 00042 for (auto& s : srcs.get_pset_names()) 00043 { 00044 try 00045 { 00046 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs, s, 00047 TransferInterface::Role::kReceive)); 00048 auto source_rank = transfer->source_rank(); 00049 if (enabled_srcs_empty) enabled_sources_[source_rank] = true; 00050 else if (!enabled_sources_.count(source_rank)) enabled_sources_[source_rank] = false; 00051 running_sources_[source_rank] = false; 00052 source_plugins_[source_rank] = std::move(transfer); 00053 fragment_store_[source_rank]; 00054 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now(); 00055 source_metric_data_[source_rank] = std::pair<size_t, double>(); 00056 } 00057 catch (cet::exception ex) 00058 { 00059 TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what(); 00060 } 00061 catch (std::exception ex) 00062 { 00063 TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what(); 00064 } 00065 catch (...) 00066 { 00067 TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << "."; 00068 } 00069 } 00070 if (srcs.get_pset_names().size() == 0) 00071 { 00072 TLOG(TLVL_ERROR) << "No sources configured!"; 00073 } 00074 } 00075 00076 artdaq::FragmentReceiverManager::~FragmentReceiverManager() 00077 { 00078 TLOG(TLVL_DEBUG) << "Destructor"; 00079 TLOG(5) << "~FragmentReceiverManager: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount(); 00080 stop_requested_ = true; 00081 00082 TLOG(5) << "~FragmentReceiverManager: Notifying all threads"; 00083 output_cv_.notify_all(); 00084 00085 TLOG(5) << "~FragmentReceiverManager: Joining all threads"; 00086 for (auto& s : source_threads_) 00087 { 00088 auto& thread = s.second; 00089 if (thread.joinable()) thread.join(); 00090 } 00091 TLOG(5) << "~FragmentReceiverManager: DONE"; 00092 } 00093 00094 bool artdaq::FragmentReceiverManager::fragments_ready_() const 00095 { 00096 for (auto& it : fragment_store_) 00097 { 00098 if (!enabled_sources_.count(it.first)) continue; 00099 if (!it.second.empty()) { return true; } 00100 } 00101 return false; 00102 } 00103 00104 int artdaq::FragmentReceiverManager::get_next_source_() const 00105 { 00106 //std::unique_lock<std::mutex> lck(fragment_store_mutex_); 00107 std::set<int> ready_sources; 00108 for (auto& it : fragment_store_) 00109 { 00110 if (!enabled_sources_.count(it.first)) continue; 00111 if (!it.second.empty()) { 00112 ready_sources.insert(it.first); 00113 } 00114 } 00115 00116 if (ready_sources.size()) { 00117 auto iter = ready_sources.find(last_source_); 00118 if (iter == ready_sources.end() || ++iter == ready_sources.end()) { 00119 TLOG(10) << "get_next_source returning " << *ready_sources.begin(); 00120 last_source_ = *ready_sources.begin(); 00121 return *ready_sources.begin(); 00122 } 00123 00124 TLOG(10) << "get_next_source returning " << *iter; 00125 last_source_ = *iter; 00126 return *iter; 00127 } 00128 00129 TLOG(10) << "get_next_source returning -1"; 00130 return -1; 00131 } 00132 00133 void artdaq::FragmentReceiverManager::start_threads() 00134 { 00135 for (auto& source : source_plugins_) 00136 { 00137 auto& rank = source.first; 00138 if (enabled_sources_.count(rank)) 00139 { 00140 running_sources_[rank] = true; 00141 try { 00142 source_threads_[rank] = boost::thread(&FragmentReceiverManager::runReceiver_, this, rank); 00143 } 00144 catch (const boost::exception& e) 00145 { 00146 TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno; 00147 std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00148 exit(5); 00149 } 00150 } 00151 } 00152 } 00153 00154 artdaq::FragmentPtr artdaq::FragmentReceiverManager::recvFragment(int& rank, size_t timeout_usec) 00155 { 00156 TLOG(5) << "recvFragment entered tmo=" << timeout_usec << " us"; 00157 00158 if (timeout_usec == 0) timeout_usec = 1000000; 00159 00160 auto ready = fragments_ready_(); 00161 size_t waited = 0; 00162 auto wait_amount = timeout_usec / 1000 > 1000 ? timeout_usec / 1000 : 1000; 00163 TLOG(5) << "recvFragment fragment_ready_=" << ready << " before wait"; 00164 while (!ready && waited < timeout_usec) 00165 { 00166 { 00167 std::unique_lock<std::mutex> lck(input_cv_mutex_); 00168 input_cv_.wait_for(lck, std::chrono::microseconds(wait_amount)); 00169 } 00170 waited += wait_amount; 00171 ready = fragments_ready_(); 00172 if (running_sources().size() == 0) break; 00173 } 00174 TLOG(5) << "recvFragment fragment_ready_=" << ready << " after waited=" << waited; 00175 if (!ready) 00176 { 00177 TLOG(5) << "recvFragment: No fragments ready, returning empty"; 00178 rank = TransferInterface::RECV_TIMEOUT; 00179 return std::unique_ptr<Fragment>{}; 00180 } 00181 00182 int current_source = get_next_source_(); 00183 FragmentPtr current_fragment = fragment_store_[current_source].front(); 00184 output_cv_.notify_all(); 00185 rank = current_source; 00186 00187 if (current_fragment != nullptr) 00188 TLOG(5) << "recvFragment: Done rank=" << rank << ", fragment size=" << std::to_string(current_fragment->size()) << " words, seqId=" << current_fragment->sequenceID(); 00189 return current_fragment; 00190 } 00191 00192 std::set<int> artdaq::FragmentReceiverManager::running_sources() const 00193 { 00194 std::set<int> output; 00195 for (auto& src : running_sources_) 00196 { 00197 if (src.second) output.insert(src.first); 00198 } 00199 return output; 00200 } 00201 00202 std::set<int> artdaq::FragmentReceiverManager::enabled_sources() const 00203 { 00204 std::set<int> output; 00205 for (auto& src : enabled_sources_) 00206 { 00207 if (src.second) output.insert(src.first); 00208 } 00209 return output; 00210 } 00211 00212 void artdaq::FragmentReceiverManager::runReceiver_(int source_rank) 00213 { 00214 while (!stop_requested_ && enabled_sources_.count(source_rank)) 00215 { 00216 TLOG(16) << "runReceiver_ " << source_rank << ": Begin loop"; 00217 auto is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount(); 00218 while (!stop_requested_ && is_suppressed) 00219 { 00220 TLOG(6) << "runReceiver_: Suppressing receiver rank " << source_rank; 00221 if (!is_suppressed) input_cv_.notify_all(); 00222 else 00223 { 00224 std::unique_lock<std::mutex> lck(output_cv_mutex_); 00225 output_cv_.wait_for(lck, std::chrono::seconds(1)); 00226 } 00227 is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount(); 00228 } 00229 if (stop_requested_) 00230 { 00231 running_sources_[source_rank] = false; 00232 return; 00233 } 00234 00235 if (fragment_store_[source_rank].GetEndOfData() <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning()) 00236 { 00237 TLOG(TLVL_DEBUG) << "runReceiver_: EndOfData conditions satisfied, ending receive loop"; 00238 running_sources_[source_rank] = false; 00239 return; 00240 } 00241 00242 auto start_time = std::chrono::steady_clock::now(); 00243 TLOG(16) << "runReceiver_: Calling receiveFragment"; 00244 auto fragment = std::unique_ptr<Fragment>(new Fragment()); 00245 #if 0 00246 auto ret = source_plugins_[source_rank]->receiveFragment(*fragment, receive_timeout_); 00247 TLOG(16) << "runReceiver_: Done with receiveFragment, ret=" << ret << " (should be " << source_rank << ")"; 00248 if (ret != source_rank) continue; // Receive timeout or other oddness 00249 #else 00250 artdaq::detail::RawFragmentHeader hdr; 00251 auto ret1 = source_plugins_[source_rank]->receiveFragmentHeader(hdr, receive_timeout_); 00252 TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret1=" << ret1 << " (should be " << source_rank << ")"; 00253 00254 if (ret1 != source_rank) continue; // Receive timeout or other oddness 00255 00256 fragment->resize(hdr.word_count - hdr.num_words()); 00257 memcpy(fragment->headerAddress(), &hdr, hdr.num_words() * sizeof(artdaq::RawDataType)); 00258 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(fragment->headerAddress() + hdr.num_words(), hdr.word_count - hdr.num_words()); 00259 if (ret2 != ret1) 00260 { 00261 TLOG(TLVL_ERROR) << "ReceiveFragmentHeader returned " << ret1 << ", but ReceiveFragmentData returned " << ret2; 00262 continue; 00263 } 00264 #endif 00265 00266 00267 if (fragment->type() == artdaq::Fragment::EndOfDataFragmentType) 00268 { 00269 TLOG(TLVL_TRACE) << "runReceiver_: EndOfData Fragment received!"; 00270 fragment_store_[source_rank].SetEndOfData(*reinterpret_cast<size_t*>(fragment->dataBegin())); 00271 } 00272 else if (fragment->type() == artdaq::Fragment::DataFragmentType || fragment->type() == artdaq::Fragment::ContainerFragmentType || fragment->isUserFragmentType(fragment->type())) 00273 { 00274 TLOG(TLVL_TRACE) << "runReceiver_: Data Fragment received!"; 00275 recv_frag_count_.incSlot(source_rank); 00276 recv_frag_size_.incSlot(source_rank, fragment->size() * sizeof(RawDataType)); 00277 recv_seq_count_.setSlot(source_rank, fragment->sequenceID()); 00278 } 00279 else 00280 { 00281 continue; 00282 } 00283 00284 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count(); 00285 source_metric_data_[source_rank].first += fragment->size() * sizeof(RawDataType); 00286 source_metric_data_[source_rank].second += delta_t; 00287 00288 if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1) 00289 { 00290 TLOG(6) << "runReceiver_: Sending receive stats"; 00291 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].second, "s", 1, MetricMode::Accumulate); 00292 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].first), "B", 1, MetricMode::Accumulate); 00293 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].first / source_metric_data_[source_rank].second, "B/s", 1, MetricMode::Average); 00294 00295 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now(); 00296 source_metric_data_[source_rank].first = 0; 00297 source_metric_data_[source_rank].second = 0.0; 00298 } 00299 00300 00301 fragment_store_[source_rank].emplace_back(std::move(fragment)); 00302 TLOG(TLVL_TRACE) << "runReceiver_: There are now " << fragment_store_[source_rank].size() << " Fragments stored from this source"; 00303 input_cv_.notify_all(); 00304 00305 } 00306 00307 running_sources_[source_rank] = false; 00308 }