00001 #include <chrono>
00002
00003 #define TRACE_NAME (app_name + "_DataReceiverManager").c_str()
00004 #include "artdaq/DAQdata/Globals.hh"
00005 #include "artdaq/DAQrate/DataReceiverManager.hh"
00006 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
00007 #include "artdaq/TransferPlugins/detail/HostMap.hh"
00008 #include "cetlib_except/exception.h"
00009 #include <iomanip>
00010
00011 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
00012 : stop_requested_(false)
00013 , stop_requested_time_(0)
00014 , source_threads_()
00015 , source_plugins_()
00016 , source_metric_data_()
00017 , source_metric_send_time_()
00018 , enabled_sources_()
00019 , running_sources_()
00020 , recv_frag_count_()
00021 , recv_frag_size_()
00022 , recv_seq_count_()
00023 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
00024 , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500))
00025 , shm_manager_(shm)
00026 , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
00027 , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
00028 {
00029 TLOG(TLVL_DEBUG) << "Constructor";
00030 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
00031 auto enabled_srcs_empty = enabled_srcs.size() == 0;
00032
00033 if (non_reliable_mode_enabled_)
00034 {
00035 TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_
00036 << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
00037 }
00038
00039 if (enabled_srcs_empty)
00040 {
00041 TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
00042 }
00043 else
00044 {
00045 for (auto& s : enabled_srcs)
00046 {
00047 enabled_sources_[s] = true;
00048 }
00049 }
00050
00051 hostMap_t host_map = MakeHostMap(pset);
00052 size_t tcp_receive_buffer_size = pset.get<size_t>("tcp_receive_buffer_size", 0);
00053 size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
00054
00055 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
00056 for (auto& s : srcs.get_pset_names())
00057 {
00058 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
00059 host_map = MakeHostMap(src_pset, host_map);
00060 }
00061 auto host_map_pset = MakeHostMapPset(host_map);
00062 fhicl::ParameterSet srcs_mod;
00063 for (auto& s : srcs.get_pset_names())
00064 {
00065 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
00066 src_pset.erase("host_map");
00067 src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
00068
00069 if (tcp_receive_buffer_size != 0 && !src_pset.has_key("tcp_receive_buffer_size"))
00070 {
00071 src_pset.put<size_t>("tcp_receive_buffer_size", tcp_receive_buffer_size);
00072 }
00073 if (max_fragment_size_words != 0 && !src_pset.has_key("max_fragment_size_words"))
00074 {
00075 src_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
00076 }
00077
00078 srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
00079 }
00080
00081 for (auto& s : srcs_mod.get_pset_names())
00082 {
00083 try
00084 {
00085 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s,
00086 TransferInterface::Role::kReceive));
00087 auto source_rank = transfer->source_rank();
00088 if (enabled_srcs_empty) enabled_sources_[source_rank] = true;
00089 else if (!enabled_sources_.count(source_rank)) enabled_sources_[source_rank] = false;
00090 running_sources_[source_rank] = false;
00091 source_plugins_[source_rank] = std::move(transfer);
00092 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
00093 source_metric_data_[source_rank] = source_metric_data();
00094 }
00095 catch (cet::exception ex)
00096 {
00097 TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
00098 }
00099 catch (std::exception ex)
00100 {
00101 TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
00102 }
00103 catch (...)
00104 {
00105 TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
00106 }
00107 }
00108 if (srcs.get_pset_names().size() == 0)
00109 {
00110 TLOG(TLVL_ERROR) << "No sources configured!";
00111 }
00112 }
00113
00114 artdaq::DataReceiverManager::~DataReceiverManager()
00115 {
00116 TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN";
00117 stop_threads();
00118 shm_manager_.reset();
00119 TLOG(TLVL_TRACE) << "Destructor END";
00120 }
00121
00122
00123 void artdaq::DataReceiverManager::start_threads()
00124 {
00125 stop_requested_ = false;
00126 if (shm_manager_) shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal);
00127 for (auto& source : source_plugins_)
00128 {
00129 auto& rank = source.first;
00130 if (enabled_sources_.count(rank) && enabled_sources_[rank].load())
00131 {
00132 source_metric_data_[rank] = source_metric_data();
00133 source_metric_send_time_[rank] = std::chrono::steady_clock::now();
00134
00135 recv_frag_count_.setSlot(rank, 0);
00136 recv_frag_size_.setSlot(rank,0);
00137 recv_seq_count_.setSlot(rank,0);
00138
00139 running_sources_[rank] = true;
00140 boost::thread::attributes attrs;
00141 attrs.set_stack_size(4096 * 2000);
00142 try {
00143 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
00144 }
00145 catch (const boost::exception& e)
00146 {
00147 TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00148 std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00149 exit(5);
00150 }
00151 }
00152 }
00153 }
00154
00155 void artdaq::DataReceiverManager::stop_threads()
00156 {
00157 TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount();
00158
00159 stop_requested_time_ = TimeUtils::gettimeofday_us();
00160 stop_requested_ = true;
00161
00162 TLOG(TLVL_TRACE) << "stop_threads: Joining all threads";
00163 for (auto& s : source_threads_)
00164 {
00165 auto& thread = s.second;
00166 if (thread.joinable()) thread.join();
00167 }
00168 }
00169
00170 std::set<int> artdaq::DataReceiverManager::enabled_sources() const
00171 {
00172 std::set<int> output;
00173 for (auto& src : enabled_sources_)
00174 {
00175 if (src.second) output.insert(src.first);
00176 }
00177 return output;
00178 }
00179
00180 std::set<int> artdaq::DataReceiverManager::running_sources() const
00181 {
00182 std::set<int> output;
00183 for (auto& src : running_sources_)
00184 {
00185 if (src.second) output.insert(src.first);
00186 }
00187 return output;
00188 }
00189
00190 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
00191 {
00192 std::chrono::steady_clock::time_point start_time, after_header, before_body,after_body, end_time = std::chrono::steady_clock::now();
00193 int ret;
00194 detail::RawFragmentHeader header;
00195 size_t endOfDataCount = -1;
00196 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
00197 if (sleep_time < 5000) sleep_time = 5000;
00198 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
00199
00200 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
00201 {
00202 TLOG(16) << "runReceiver_: Begin loop";
00203 std::this_thread::yield();
00204
00205
00206 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
00207 {
00208 TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop";
00209 break;
00210 }
00211
00212 start_time = std::chrono::steady_clock::now();
00213
00214 TLOG(16) << "runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_;
00215 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
00216 TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")";
00217 if (ret != source_rank)
00218 {
00219 if (ret >= 0) {
00220 TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!";
00221 }
00222 else if (ret == TransferInterface::DATA_END)
00223 {
00224 TLOG(TLVL_ERROR) << "Transfer Plugin returned DATA_END, ending receive loop!";
00225 break;
00226 }
00227 if ((*running_sources_.begin()).first == source_rank)
00228 {
00229 TLOG(TLVL_DEBUG) << "Calling SMEM::CheckPendingBuffers from DRM receiver thread for " << source_rank << " to make sure that things aren't stuck";
00230 shm_manager_->CheckPendingBuffers();
00231 }
00232
00233 usleep(sleep_time);
00234 continue;
00235 }
00236
00237 after_header = std::chrono::steady_clock::now();
00238
00239 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
00240 TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp;
00241 RawDataType* loc = nullptr;
00242 size_t retries = 0;
00243 while (loc == nullptr)
00244 {
00245 loc = shm_manager_->WriteFragmentHeader(header);
00246 if (loc == nullptr && stop_requested_) return;
00247 if (loc == nullptr) usleep(sleep_time);
00248 retries++;
00249 if (non_reliable_mode_enabled_ && retries > max_retries)
00250 {
00251 loc = shm_manager_->WriteFragmentHeader(header, true);
00252 }
00253 }
00254 if (loc == nullptr)
00255 {
00256
00257 TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << header.sequence_id;
00258 continue;
00259 }
00260 before_body = std::chrono::steady_clock::now();
00261
00262 auto hdrLoc = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(loc - artdaq::detail::RawFragmentHeader::num_words());
00263 TLOG(16) << "runReceiver_: Calling receiveFragmentData from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp;
00264 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
00265 TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")";
00266
00267 if (ret != ret2) {
00268 TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
00269 TLOG(TLVL_ERROR) << "Error receiving data from rank " << source_rank << ", data has been lost! Event " << header.sequence_id << " will most likely be Incomplete!";
00270
00271
00272
00273 hdrLoc->type = Fragment::ErrorFragmentType;
00274
00275 shm_manager_->DoneWritingFragment(header);
00276
00277 continue;
00278 }
00279
00280 shm_manager_->DoneWritingFragment(header);
00281 TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << header.sequence_id << " from rank " << source_rank;
00282
00283 recv_frag_count_.incSlot(source_rank);
00284 recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
00285 recv_seq_count_.setSlot(source_rank, header.sequence_id);
00286 if (endOfDataCount != static_cast<size_t>(-1))
00287 {
00288 TLOG(TLVL_DEBUG) << "Received fragment " << header.sequence_id << " from rank " << source_rank
00289 << " (" << recv_frag_count_.slotCount(source_rank) << "/" << endOfDataCount << ")";
00290 }
00291
00292 after_body = std::chrono::steady_clock::now();
00293
00294 source_metric_data_[source_rank].hdr_delta_t += TimeUtils::GetElapsedTime(start_time, after_header);
00295 source_metric_data_[source_rank].store_delta_t += TimeUtils::GetElapsedTime(after_header, before_body);
00296 source_metric_data_[source_rank].data_delta_t += TimeUtils::GetElapsedTime(before_body, after_body);
00297 source_metric_data_[source_rank].delta_t += TimeUtils::GetElapsedTime(start_time, after_body);
00298 source_metric_data_[source_rank].dead_t += TimeUtils::GetElapsedTime(end_time, start_time);
00299
00300 source_metric_data_[source_rank].data_size += header.word_count * sizeof(RawDataType);
00301 source_metric_data_[source_rank].header_size += header.num_words() * sizeof(RawDataType);
00302 source_metric_data_[source_rank].data_point_count++;
00303
00304 if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1)
00305 {
00306 TLOG(6) << "runReceiver_: Sending receive stats for rank " << source_rank;
00307 metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].delta_t, "s", 5, MetricMode::Accumulate);
00308 metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].data_size), "B", 5, MetricMode::Accumulate);
00309 metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_size / source_metric_data_[source_rank].delta_t, "B/s", 5, MetricMode::Average);
00310
00311 metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].hdr_delta_t, "s", 5, MetricMode::Accumulate);
00312 metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].header_size), "B", 5, MetricMode::Accumulate);
00313 metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].header_size / source_metric_data_[source_rank].hdr_delta_t, "B/s", 5, MetricMode::Average);
00314
00315 auto payloadSize = source_metric_data_[source_rank].data_size - source_metric_data_[source_rank].header_size;
00316 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_delta_t, "s", 5, MetricMode::Accumulate);
00317 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(payloadSize), "B", 5, MetricMode::Accumulate);
00318 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / source_metric_data_[source_rank].data_delta_t, "B/s", 5, MetricMode::Average);
00319
00320 metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint);
00321
00322 metricMan->sendMetric("Total Shared Memory Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].store_delta_t, "s", 3, MetricMode::Accumulate);
00323 metricMan->sendMetric("Avg Shared Memory Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].store_delta_t / source_metric_data_[source_rank].data_point_count, "s", 3, MetricMode::Average);
00324 metricMan->sendMetric("Avg Fragment Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].dead_t / source_metric_data_[source_rank].data_point_count, "s", 3, MetricMode::Average);
00325
00326 TLOG(6) << "runReceiver_: Done sending receive stats for rank " << source_rank;
00327
00328 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
00329 source_metric_data_[source_rank] = source_metric_data();
00330 }
00331
00332 end_time = std::chrono::steady_clock::now();
00333 }
00334 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
00335 {
00336 TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << ".";
00337
00338 FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
00339 memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
00340 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
00341 if (ret3 != source_rank)
00342 {
00343 TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
00344 throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
00345 }
00346
00347 switch (header.type)
00348 {
00349 case Fragment::EndOfDataFragmentType:
00350 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00351 if(endOfDataCount == static_cast<size_t>(-1) ) endOfDataCount = *(frag->dataBegin());
00352 else endOfDataCount += *(frag->dataBegin());
00353 TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << endOfDataCount << " fragments are expected from rank " << source_rank
00354 << " (recvd " << recv_frag_count_.slotCount(source_rank) << ").";
00355 break;
00356 case Fragment::InitFragmentType:
00357 TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << ".";
00358 shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
00359 shm_manager_->SetInitFragment(std::move(frag));
00360 break;
00361 case Fragment::EndOfRunFragmentType:
00362 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00363
00364 break;
00365 case Fragment::EndOfSubrunFragmentType:
00366
00367 TLOG(TLVL_DEBUG) << "Received EndOfSubrun Fragment from rank " << source_rank
00368 << " with sequence_id " << header.sequence_id << ".";
00369 if (header.sequence_id != Fragment::InvalidSequenceID) shm_manager_->rolloverSubrun(header.sequence_id);
00370 else shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank));
00371 break;
00372 case Fragment::ShutdownFragmentType:
00373 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00374 break;
00375 }
00376 }
00377 }
00378
00379 TLOG(TLVL_DEBUG) << "runReceiver_ " << source_rank << " receive loop exited";
00380 running_sources_[source_rank] = false;
00381 }