00001 #define TRACE_NAME (app_name + "_FragmentReceiverManager").c_str()
00002 #include "artdaq/DAQdata/Globals.hh"
00003
00004 #include <chrono>
00005
00006 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
00007 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
00008 #include "cetlib_except/exception.h"
00009
00010 artdaq::FragmentReceiverManager::FragmentReceiverManager(const fhicl::ParameterSet& pset)
00011 : stop_requested_(false)
00012 , source_threads_()
00013 , source_plugins_()
00014 , source_metric_data_()
00015 , source_metric_send_time_()
00016 , enabled_sources_()
00017 , fragment_store_()
00018 , recv_frag_count_()
00019 , recv_frag_size_()
00020 , recv_seq_count_()
00021 , suppress_noisy_senders_(pset.get<bool>("auto_suppression_enabled", true))
00022 , suppression_threshold_(pset.get<size_t>("max_receive_difference", 50))
00023 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
00024 , last_source_(-1)
00025 {
00026 TLOG(TLVL_DEBUG) << "Constructor";
00027 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
00028 auto enabled_srcs_empty = enabled_srcs.size() == 0;
00029 if (enabled_srcs_empty)
00030 {
00031 TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
00032 }
00033 else
00034 {
00035 for (auto& s : enabled_srcs)
00036 {
00037 enabled_sources_[s] = true;
00038 }
00039 }
00040
00041 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
00042 for (auto& s : srcs.get_pset_names())
00043 {
00044 try
00045 {
00046 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs, s,
00047 TransferInterface::Role::kReceive));
00048 auto source_rank = transfer->source_rank();
00049 if (enabled_srcs_empty) enabled_sources_[source_rank] = true;
00050 else if (!enabled_sources_.count(source_rank)) enabled_sources_[source_rank] = false;
00051 running_sources_[source_rank] = false;
00052 source_plugins_[source_rank] = std::move(transfer);
00053 fragment_store_[source_rank];
00054 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
00055 source_metric_data_[source_rank] = std::pair<size_t, double>();
00056 }
00057 catch (cet::exception ex)
00058 {
00059 TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
00060 }
00061 catch (std::exception ex)
00062 {
00063 TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
00064 }
00065 catch (...)
00066 {
00067 TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
00068 }
00069 }
00070 if (srcs.get_pset_names().size() == 0)
00071 {
00072 TLOG(TLVL_ERROR) << "No sources configured!";
00073 }
00074 }
00075
00076 artdaq::FragmentReceiverManager::~FragmentReceiverManager()
00077 {
00078 TLOG(TLVL_DEBUG) << "Destructor";
00079 TLOG(5) << "~FragmentReceiverManager: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount();
00080 stop_requested_ = true;
00081
00082 TLOG(5) << "~FragmentReceiverManager: Notifying all threads";
00083 output_cv_.notify_all();
00084
00085 TLOG(5) << "~FragmentReceiverManager: Joining all threads";
00086 for (auto& s : source_threads_)
00087 {
00088 auto& thread = s.second;
00089 if (thread.joinable()) thread.join();
00090 }
00091 TLOG(5) << "~FragmentReceiverManager: DONE";
00092 }
00093
00094 bool artdaq::FragmentReceiverManager::fragments_ready_() const
00095 {
00096 for (auto& it : fragment_store_)
00097 {
00098 if (!enabled_sources_.count(it.first)) continue;
00099 if (!it.second.empty()) { return true; }
00100 }
00101 return false;
00102 }
00103
00104 int artdaq::FragmentReceiverManager::get_next_source_() const
00105 {
00106
00107 std::set<int> ready_sources;
00108 for (auto& it : fragment_store_)
00109 {
00110 if (!enabled_sources_.count(it.first)) continue;
00111 if (!it.second.empty()) {
00112 ready_sources.insert(it.first);
00113 }
00114 }
00115
00116 if (ready_sources.size()) {
00117 auto iter = ready_sources.find(last_source_);
00118 if (iter == ready_sources.end() || ++iter == ready_sources.end()) {
00119 TLOG(10) << "get_next_source returning " << *ready_sources.begin();
00120 last_source_ = *ready_sources.begin();
00121 return *ready_sources.begin();
00122 }
00123
00124 TLOG(10) << "get_next_source returning " << *iter;
00125 last_source_ = *iter;
00126 return *iter;
00127 }
00128
00129 TLOG(10) << "get_next_source returning -1";
00130 return -1;
00131 }
00132
00133 void artdaq::FragmentReceiverManager::start_threads()
00134 {
00135 for (auto& source : source_plugins_)
00136 {
00137 auto& rank = source.first;
00138 if (enabled_sources_.count(rank))
00139 {
00140 running_sources_[rank] = true;
00141 try {
00142 source_threads_[rank] = boost::thread(&FragmentReceiverManager::runReceiver_, this, rank);
00143 }
00144 catch (const boost::exception& e)
00145 {
00146 TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00147 std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00148 exit(5);
00149 }
00150 }
00151 }
00152 }
00153
00154 artdaq::FragmentPtr artdaq::FragmentReceiverManager::recvFragment(int& rank, size_t timeout_usec)
00155 {
00156 TLOG(5) << "recvFragment entered tmo=" << timeout_usec << " us";
00157
00158 if (timeout_usec == 0) timeout_usec = 1000000;
00159
00160 auto ready = fragments_ready_();
00161 size_t waited = 0;
00162 auto wait_amount = timeout_usec / 1000 > 1000 ? timeout_usec / 1000 : 1000;
00163 TLOG(5) << "recvFragment fragment_ready_=" << ready << " before wait";
00164 while (!ready && waited < timeout_usec)
00165 {
00166 {
00167 std::unique_lock<std::mutex> lck(input_cv_mutex_);
00168 input_cv_.wait_for(lck, std::chrono::microseconds(wait_amount));
00169 }
00170 waited += wait_amount;
00171 ready = fragments_ready_();
00172 if (running_sources().size() == 0) break;
00173 }
00174 TLOG(5) << "recvFragment fragment_ready_=" << ready << " after waited=" << waited;
00175 if (!ready)
00176 {
00177 TLOG(5) << "recvFragment: No fragments ready, returning empty";
00178 rank = TransferInterface::RECV_TIMEOUT;
00179 return std::unique_ptr<Fragment>{};
00180 }
00181
00182 int current_source = get_next_source_();
00183 FragmentPtr current_fragment = fragment_store_[current_source].front();
00184 output_cv_.notify_all();
00185 rank = current_source;
00186
00187 if (current_fragment != nullptr)
00188 TLOG(5) << "recvFragment: Done rank=" << rank << ", fragment size=" << std::to_string(current_fragment->size()) << " words, seqId=" << current_fragment->sequenceID();
00189 return current_fragment;
00190 }
00191
00192 std::set<int> artdaq::FragmentReceiverManager::running_sources() const
00193 {
00194 std::set<int> output;
00195 for (auto& src : running_sources_)
00196 {
00197 if (src.second) output.insert(src.first);
00198 }
00199 return output;
00200 }
00201
00202 std::set<int> artdaq::FragmentReceiverManager::enabled_sources() const
00203 {
00204 std::set<int> output;
00205 for (auto& src : enabled_sources_)
00206 {
00207 if (src.second) output.insert(src.first);
00208 }
00209 return output;
00210 }
00211
00212 void artdaq::FragmentReceiverManager::runReceiver_(int source_rank)
00213 {
00214 while (!stop_requested_ && enabled_sources_.count(source_rank))
00215 {
00216 TLOG(16) << "runReceiver_ " << source_rank << ": Begin loop";
00217 auto is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount();
00218 while (!stop_requested_ && is_suppressed)
00219 {
00220 TLOG(6) << "runReceiver_: Suppressing receiver rank " << source_rank;
00221 if (!is_suppressed) input_cv_.notify_all();
00222 else
00223 {
00224 std::unique_lock<std::mutex> lck(output_cv_mutex_);
00225 output_cv_.wait_for(lck, std::chrono::seconds(1));
00226 }
00227 is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount();
00228 }
00229 if (stop_requested_)
00230 {
00231 running_sources_[source_rank] = false;
00232 return;
00233 }
00234
00235 if (fragment_store_[source_rank].GetEndOfData() <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
00236 {
00237 TLOG(TLVL_DEBUG) << "runReceiver_: EndOfData conditions satisfied, ending receive loop";
00238 running_sources_[source_rank] = false;
00239 return;
00240 }
00241
00242 auto start_time = std::chrono::steady_clock::now();
00243 TLOG(16) << "runReceiver_: Calling receiveFragment";
00244 auto fragment = std::unique_ptr<Fragment>(new Fragment());
00245 #if 0
00246 auto ret = source_plugins_[source_rank]->receiveFragment(*fragment, receive_timeout_);
00247 TLOG(16) << "runReceiver_: Done with receiveFragment, ret=" << ret << " (should be " << source_rank << ")";
00248 if (ret != source_rank) continue;
00249 #else
00250 artdaq::detail::RawFragmentHeader hdr;
00251 auto ret1 = source_plugins_[source_rank]->receiveFragmentHeader(hdr, receive_timeout_);
00252 TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret1=" << ret1 << " (should be " << source_rank << ")";
00253
00254 if (ret1 != source_rank) continue;
00255
00256 fragment->resize(hdr.word_count - hdr.num_words());
00257 memcpy(fragment->headerAddress(), &hdr, hdr.num_words() * sizeof(artdaq::RawDataType));
00258 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(fragment->headerAddress() + hdr.num_words(), hdr.word_count - hdr.num_words());
00259 if (ret2 != ret1)
00260 {
00261 TLOG(TLVL_ERROR) << "ReceiveFragmentHeader returned " << ret1 << ", but ReceiveFragmentData returned " << ret2;
00262 continue;
00263 }
00264 #endif
00265
00266
00267 if (fragment->type() == artdaq::Fragment::EndOfDataFragmentType)
00268 {
00269 TLOG(TLVL_TRACE) << "runReceiver_: EndOfData Fragment received!";
00270 fragment_store_[source_rank].SetEndOfData(*reinterpret_cast<size_t*>(fragment->dataBegin()));
00271 }
00272 else if (fragment->type() == artdaq::Fragment::DataFragmentType || fragment->type() == artdaq::Fragment::ContainerFragmentType || fragment->isUserFragmentType(fragment->type()))
00273 {
00274 TLOG(TLVL_TRACE) << "runReceiver_: Data Fragment received!";
00275 recv_frag_count_.incSlot(source_rank);
00276 recv_frag_size_.incSlot(source_rank, fragment->size() * sizeof(RawDataType));
00277 recv_seq_count_.setSlot(source_rank, fragment->sequenceID());
00278 }
00279 else
00280 {
00281 continue;
00282 }
00283
00284 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
00285 source_metric_data_[source_rank].first += fragment->size() * sizeof(RawDataType);
00286 source_metric_data_[source_rank].second += delta_t;
00287
00288 if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1)
00289 {
00290 TLOG(6) << "runReceiver_: Sending receive stats";
00291 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].second, "s", 1, MetricMode::Accumulate);
00292 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].first), "B", 1, MetricMode::Accumulate);
00293 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].first / source_metric_data_[source_rank].second, "B/s", 1, MetricMode::Average);
00294
00295 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
00296 source_metric_data_[source_rank].first = 0;
00297 source_metric_data_[source_rank].second = 0.0;
00298 }
00299
00300
00301 fragment_store_[source_rank].emplace_back(std::move(fragment));
00302 TLOG(TLVL_TRACE) << "runReceiver_: There are now " << fragment_store_[source_rank].size() << " Fragments stored from this source";
00303 input_cv_.notify_all();
00304
00305 }
00306
00307 running_sources_[source_rank] = false;
00308 }