00001 #define TRACE_NAME "FragmentReceiverManager"
00002
00003 #include <chrono>
00004
00005 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
00006 #include "artdaq/DAQdata/Globals.hh"
00007 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
00008 #include "cetlib_except/exception.h"
00009
00010 artdaq::FragmentReceiverManager::FragmentReceiverManager(const fhicl::ParameterSet& pset)
00011 : stop_requested_(false)
00012 , source_threads_()
00013 , source_plugins_()
00014 , enabled_sources_()
00015 , fragment_store_()
00016 , recv_frag_count_()
00017 , recv_frag_size_()
00018 , recv_seq_count_()
00019 , suppress_noisy_senders_(pset.get<bool>("auto_suppression_enabled", true))
00020 , suppression_threshold_(pset.get<size_t>("max_receive_difference", 50))
00021 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
00022 , last_source_(-1)
00023 {
00024 TLOG(TLVL_DEBUG) << "Constructor" ;
00025 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
00026 auto enabled_srcs_empty = enabled_srcs.size() == 0;
00027 if (enabled_srcs_empty)
00028 {
00029 TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled." ;
00030 }
00031 else
00032 {
00033 for (auto& s : enabled_srcs)
00034 {
00035 enabled_sources_.insert(s);
00036 }
00037 }
00038
00039 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
00040 for (auto& s : srcs.get_pset_names())
00041 {
00042 try
00043 {
00044 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs, s,
00045 TransferInterface::Role::kReceive));
00046 auto source_rank = transfer->source_rank();
00047 if (enabled_srcs_empty) enabled_sources_.insert(source_rank);
00048 source_plugins_[source_rank] = std::move(transfer);
00049 fragment_store_[source_rank];
00050 }
00051 catch (cet::exception ex)
00052 {
00053 TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what() ;
00054 }
00055 catch (std::exception ex)
00056 {
00057 TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what() ;
00058 }
00059 catch (...)
00060 {
00061 TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << "." ;
00062 }
00063 }
00064 if (srcs.get_pset_names().size() == 0)
00065 {
00066 TLOG(TLVL_ERROR) << "No sources configured!" ;
00067 }
00068 }
00069
00070 artdaq::FragmentReceiverManager::~FragmentReceiverManager()
00071 {
00072 TLOG(TLVL_DEBUG) << "Destructor" ;
00073 TLOG(5) << "~FragmentReceiverManager: BEGIN: Setting stop_requested to true, frags=" << std::to_string(count()) << ", bytes=" << std::to_string(byteCount()) ;
00074 stop_requested_ = true;
00075
00076 TLOG(5) << "~FragmentReceiverManager: Notifying all threads" ;
00077 output_cv_.notify_all();
00078
00079 TLOG(5) << "~FragmentReceiverManager: Joining all threads" ;
00080 for (auto& s : source_threads_)
00081 {
00082 auto& thread = s.second;
00083 if (thread.joinable()) thread.join();
00084 }
00085 TLOG(5) << "~FragmentReceiverManager: DONE" ;
00086 }
00087
00088 bool artdaq::FragmentReceiverManager::fragments_ready_() const
00089 {
00090 for (auto& it : fragment_store_)
00091 {
00092 if (!enabled_sources_.count(it.first)) continue;
00093 if (!it.second.empty()) { return true; }
00094 }
00095 return false;
00096 }
00097
00098 int artdaq::FragmentReceiverManager::get_next_source_() const
00099 {
00100
00101 std::set<int> ready_sources;
00102 for (auto& it : fragment_store_)
00103 {
00104 if (!enabled_sources_.count(it.first)) continue;
00105 if (!it.second.empty()) {
00106 ready_sources.insert(it.first);
00107 }
00108 }
00109
00110 if (ready_sources.size()) {
00111 auto iter = ready_sources.find(last_source_);
00112 if (iter == ready_sources.end() || ++iter == ready_sources.end()) {
00113 TLOG(TLVL_DEBUG) << "get_next_source returning " << *ready_sources.begin();
00114 last_source_ = *ready_sources.begin();
00115 return *ready_sources.begin();
00116 }
00117
00118 TLOG(TLVL_DEBUG) << "get_next_source returning " << *iter;
00119 last_source_ = *iter;
00120 return *iter;
00121 }
00122
00123 TLOG(TLVL_DEBUG) << "get_next_source returning -1";
00124 return -1;
00125 }
00126
00127 void artdaq::FragmentReceiverManager::start_threads()
00128 {
00129 for (auto& source : source_plugins_)
00130 {
00131 auto& rank = source.first;
00132 if (enabled_sources_.count(rank))
00133 {
00134 source_threads_[rank] = boost::thread(&FragmentReceiverManager::runReceiver_, this, rank);
00135 }
00136 }
00137 }
00138
00139 artdaq::FragmentPtr artdaq::FragmentReceiverManager::recvFragment(int& rank, size_t timeout_usec)
00140 {
00141 TLOG(5) <<"recvFragment entered tmo=" << std::to_string(timeout_usec) << " us" ;
00142
00143 if (timeout_usec == 0) timeout_usec = 1000000;
00144
00145 auto ready = fragments_ready_();
00146 size_t waited = 0;
00147 auto wait_amount = timeout_usec / 1000 > 1000 ? timeout_usec / 1000 : 1000;
00148 TLOG(5) << "recvFragment fragment_ready_=" << ready << " before wait" ;
00149 while (!ready && waited < timeout_usec)
00150 {
00151 {
00152 std::unique_lock<std::mutex> lck(input_cv_mutex_);
00153 input_cv_.wait_for(lck, std::chrono::microseconds(wait_amount));
00154 }
00155 waited += wait_amount;
00156 ready = fragments_ready_();
00157 if (running_sources_.size() == 0) break;
00158 }
00159 TLOG(5) << "recvFragment fragment_ready_=" << ready << " after waited=" << std::to_string( waited) ;
00160 if (!ready)
00161 {
00162 TLOG(5) << "recvFragment: No fragments ready, returning empty" ;
00163 rank = TransferInterface::RECV_TIMEOUT;
00164 return std::unique_ptr<Fragment>{};
00165 }
00166
00167 int current_source = get_next_source_();
00168 FragmentPtr current_fragment = fragment_store_[current_source].front();
00169 output_cv_.notify_all();
00170 rank = current_source;
00171
00172 if (current_fragment != nullptr)
00173 TLOG(5) << "recvFragment: Done rank="<< rank <<", fragment size="<<std::to_string(current_fragment->size()) << " words, seqId=" << std::to_string( current_fragment->sequenceID()) ;
00174 return current_fragment;
00175 }
00176
00177 void artdaq::FragmentReceiverManager::runReceiver_(int source_rank)
00178 {
00179 running_sources_.insert(source_rank);
00180 auto eod_quiet_start = std::chrono::steady_clock::now();
00181 while (!stop_requested_ && enabled_sources_.count(source_rank))
00182 {
00183 TLOG(16) << "runReceiver_ "<< source_rank << ": Begin loop" ;
00184 auto is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount();
00185 while (!stop_requested_ && is_suppressed)
00186 {
00187 TLOG(6) << "runReceiver_: Suppressing receiver rank " << source_rank ;
00188 if (!is_suppressed) input_cv_.notify_all();
00189 else
00190 {
00191 std::unique_lock<std::mutex> lck(output_cv_mutex_);
00192 output_cv_.wait_for(lck, std::chrono::seconds(1));
00193 }
00194 is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount();
00195 }
00196 if (stop_requested_)
00197 {
00198 running_sources_.erase(source_rank);
00199 return;
00200 }
00201
00202 if (fragment_store_[source_rank].GetEndOfData() <= recv_frag_count_.slotCount(source_rank) && TimeUtils::GetElapsedTimeMilliseconds(eod_quiet_start) > 1000)
00203 {
00204 TLOG(TLVL_DEBUG) << "runReceiver_: EndOfData conditions satisfied, ending receive loop";
00205 running_sources_.erase(source_rank);
00206 return;
00207 }
00208
00209 auto start_time = std::chrono::steady_clock::now();
00210 TLOG(16) << "runReceiver_: Calling receiveFragment" ;
00211 auto fragment = std::unique_ptr<Fragment>(new Fragment());
00212 #if 0
00213 auto ret = source_plugins_[source_rank]->receiveFragment(*fragment, receive_timeout_);
00214 TLOG(16) << "runReceiver_: Done with receiveFragment, ret=" << ret << " (should be " << source_rank << ")" ;
00215 if (ret != source_rank) continue;
00216 #else
00217 artdaq::detail::RawFragmentHeader hdr;
00218 auto ret1 = source_plugins_[source_rank]->receiveFragmentHeader(hdr, receive_timeout_);
00219 TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret1=" << ret1 << " (should be " << source_rank << ")" ;
00220
00221 if (ret1 != source_rank) continue;
00222 eod_quiet_start = std::chrono::steady_clock::now();
00223
00224 fragment->resize(hdr.word_count - hdr.num_words());
00225 memcpy(fragment->headerAddress(), &hdr, hdr.num_words() * sizeof(artdaq::RawDataType));
00226 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(fragment->headerAddress() + hdr.num_words(), hdr.word_count - hdr.num_words());
00227 if (ret2 != ret1)
00228 {
00229 TLOG(TLVL_ERROR) << "ReceiveFragmentHeader returned " << ret1 << ", but ReceiveFragmentData returned " << ret2 ;
00230 continue;
00231 }
00232 #endif
00233
00234
00235 if (fragment->type() == artdaq::Fragment::EndOfDataFragmentType)
00236 {
00237 TLOG(TLVL_TRACE) << "runReceiver_: EndOfData Fragment received!";
00238 fragment_store_[source_rank].SetEndOfData(*reinterpret_cast<size_t*>(fragment->dataBegin()));
00239 }
00240 else if(fragment->type() == artdaq::Fragment::DataFragmentType || fragment->type() == artdaq::Fragment::ContainerFragmentType || fragment->isUserFragmentType(fragment->type()))
00241 {
00242 TLOG(TLVL_TRACE) << "runReceiver_: Data Fragment received!";
00243 recv_frag_count_.incSlot(source_rank);
00244 recv_frag_size_.incSlot(source_rank, fragment->size() * sizeof(RawDataType));
00245 recv_seq_count_.setSlot(source_rank, fragment->sequenceID());
00246 }
00247 else
00248 {
00249 continue;
00250 }
00251
00252
00253
00254 if (metricMan)
00255 {
00256 TLOG(6) << "runReceiver_: Sending receive stats" ;
00257 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
00258 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 1, MetricMode::Accumulate);
00259 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(fragment->size() * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
00260 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), fragment->size() * sizeof(RawDataType) / delta_t, "B/s", 1, MetricMode::Average);
00261 }
00262
00263
00264 fragment_store_[source_rank].emplace_back(std::move(fragment));
00265 TLOG(TLVL_TRACE) << "runReceiver_: There are now " << fragment_store_[source_rank].size() << " Fragments stored from this source";
00266 input_cv_.notify_all();
00267
00268 }
00269
00270 running_sources_.erase(source_rank);
00271 }