00001 #include <chrono>
00002
00003 #define TRACE_NAME "DataReceiverManager"
00004 #include "artdaq/DAQdata/Globals.hh"
00005 #include "artdaq/DAQrate/DataReceiverManager.hh"
00006 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
00007 #include "artdaq/TransferPlugins/detail/HostMap.hh"
00008 #include "cetlib_except/exception.h"
00009 #include <iomanip>
00010
00011 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
00012 : stop_requested_(false)
00013 , stop_requested_time_(0)
00014 , source_threads_()
00015 , source_plugins_()
00016 , enabled_sources_()
00017 , running_sources_()
00018 , recv_frag_count_()
00019 , recv_frag_size_()
00020 , recv_seq_count_()
00021 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
00022 , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500))
00023 , shm_manager_(shm)
00024 , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
00025 , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
00026 {
00027 TLOG(TLVL_DEBUG) << "Constructor";
00028 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
00029 auto enabled_srcs_empty = enabled_srcs.size() == 0;
00030
00031 if (non_reliable_mode_enabled_)
00032 {
00033 TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << std::to_string(non_reliable_mode_retry_count_)
00034 << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
00035 }
00036
00037 if (enabled_srcs_empty)
00038 {
00039 TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
00040 }
00041 else
00042 {
00043 for (auto& s : enabled_srcs)
00044 {
00045 enabled_sources_.insert(s);
00046 }
00047 }
00048
00049 hostMap_t host_map = MakeHostMap(pset);
00050 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
00051 for (auto& s : srcs.get_pset_names())
00052 {
00053 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
00054 host_map = MakeHostMap(src_pset, 0, host_map);
00055 }
00056 auto host_map_pset = MakeHostMapPset(host_map);
00057 fhicl::ParameterSet srcs_mod;
00058 for (auto& s : srcs.get_pset_names())
00059 {
00060 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
00061 src_pset.erase("host_map");
00062 src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
00063 srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
00064 }
00065
00066 for (auto& s : srcs_mod.get_pset_names())
00067 {
00068 try
00069 {
00070 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s,
00071 TransferInterface::Role::kReceive));
00072 auto source_rank = transfer->source_rank();
00073 if (enabled_srcs_empty) enabled_sources_.insert(source_rank);
00074 source_plugins_[source_rank] = std::move(transfer);
00075 }
00076 catch (cet::exception ex)
00077 {
00078 TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
00079 }
00080 catch (std::exception ex)
00081 {
00082 TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
00083 }
00084 catch (...)
00085 {
00086 TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
00087 }
00088 }
00089 if (srcs.get_pset_names().size() == 0)
00090 {
00091 TLOG(TLVL_ERROR) << "No sources configured!";
00092 }
00093 }
00094
00095 artdaq::DataReceiverManager::~DataReceiverManager()
00096 {
00097 TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN";
00098 stop_threads();
00099 shm_manager_.reset();
00100 TLOG(TLVL_TRACE) << "Destructor END";
00101 }
00102
00103
00104 void artdaq::DataReceiverManager::start_threads()
00105 {
00106 stop_requested_ = false;
00107 if (shm_manager_) shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal);
00108 for (auto& source : source_plugins_)
00109 {
00110 auto& rank = source.first;
00111 if (enabled_sources_.count(rank))
00112 {
00113 running_sources_.insert(rank);
00114 boost::thread::attributes attrs;
00115 attrs.set_stack_size(4096 * 500);
00116 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
00117 }
00118 }
00119 }
00120
00121 void artdaq::DataReceiverManager::stop_threads()
00122 {
00123 TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << std::to_string(count()) << ", bytes=" << std::to_string(byteCount());
00124
00125 stop_requested_time_ = TimeUtils::gettimeofday_us();
00126 stop_requested_ = true;
00127
00128 TLOG(TLVL_TRACE) << "stop_threads: Joining all threads";
00129 for (auto& s : source_threads_)
00130 {
00131 auto& thread = s.second;
00132 if (thread.joinable()) thread.join();
00133 }
00134 }
00135
00136 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
00137 {
00138 std::chrono::steady_clock::time_point start_time, after_header, before_body, eod_quiet_start;
00139 int ret;
00140 double delta_t, hdr_delta_t, store_delta_t, data_delta_t;
00141 detail::RawFragmentHeader header;
00142 size_t endOfDataCount = -1;
00143 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
00144 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
00145
00146 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
00147 {
00148 TLOG(16) << "runReceiver_: Begin loop";
00149 if (stop_requested_) { receive_timeout_ = stop_timeout_ms_; }
00150
00151
00152 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && TimeUtils::GetElapsedTimeMilliseconds(eod_quiet_start) > 1000)
00153 {
00154 TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop";
00155 running_sources_.erase(source_rank);
00156 return;
00157 }
00158
00159 start_time = std::chrono::steady_clock::now();
00160
00161 TLOG(16) << "runReceiver_: Calling receiveFragmentHeader";
00162 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
00163 TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")";
00164 if (ret != source_rank)
00165 {
00166 if (ret >= 0) {
00167 TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!";
00168 }
00169 continue;
00170 }
00171
00172 after_header = std::chrono::steady_clock::now();
00173 eod_quiet_start = std::chrono::steady_clock::now();
00174
00175 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
00176 TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ".";
00177 RawDataType* loc = nullptr;
00178 size_t retries = 0;
00179 while (loc == nullptr)
00180 {
00181 loc = shm_manager_->WriteFragmentHeader(header);
00182 if (loc == nullptr) usleep(sleep_time);
00183 if (stop_requested_) return;
00184 retries++;
00185 if (non_reliable_mode_enabled_ && retries > max_retries)
00186 {
00187 loc = shm_manager_->WriteFragmentHeader(header, true);
00188 }
00189 }
00190 if (loc == nullptr)
00191 {
00192
00193 TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << std::to_string(header.sequence_id);
00194 continue;
00195 }
00196 before_body = std::chrono::steady_clock::now();
00197
00198 TLOG(16) << "runReceiver_: Calling receiveFragmentData";
00199 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
00200 TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")";
00201
00202 if (ret != ret2) {
00203 TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
00204 throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
00205 }
00206
00207 shm_manager_->DoneWritingFragment(header);
00208 TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << std::to_string(header.sequence_id) << " from rank " << source_rank;
00209
00210 recv_frag_count_.incSlot(source_rank);
00211 recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
00212 recv_seq_count_.setSlot(source_rank, header.sequence_id);
00213 if (endOfDataCount != static_cast<size_t>(-1))
00214 {
00215 TLOG(TLVL_DEBUG) << "Received fragment " << std::to_string(header.sequence_id) << " from rank " << source_rank
00216 << " (" << std::to_string(recv_frag_count_.slotCount(source_rank)) << "/" << std::to_string(endOfDataCount) << ")";
00217 }
00218
00219 if (metricMan)
00220 {
00221 TLOG(6) << "runReceiver_: Sending receive stats";
00222 delta_t = TimeUtils::GetElapsedTime(start_time);
00223 hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
00224 store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
00225 data_delta_t = TimeUtils::GetElapsedTime(before_body);
00226 metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 5, MetricMode::Accumulate);
00227 metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.word_count * sizeof(RawDataType)), "B", 5, MetricMode::Accumulate);
00228 metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), header.word_count * sizeof(RawDataType) / delta_t, "B/s", 5, MetricMode::Average);
00229
00230 metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t, "s", 5, MetricMode::Accumulate);
00231 metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.num_words() * sizeof(RawDataType)), "B", 5, MetricMode::Accumulate);
00232 metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), header.num_words() * sizeof(RawDataType) / hdr_delta_t, "B/s", 5, MetricMode::Average);
00233
00234 metricMan->sendMetric("Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 5, MetricMode::Accumulate);
00235
00236 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t, "s", 5, MetricMode::Accumulate);
00237 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>((header.word_count - header.num_words()) * sizeof(RawDataType)), "B", 5, MetricMode::Accumulate);
00238 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), (header.word_count - header.num_words()) * sizeof(RawDataType) / data_delta_t, "B/s", 5, MetricMode::Average);
00239 metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint);
00240 TLOG(6) << "runReceiver_: Done sending receive stats";
00241 }
00242 }
00243 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
00244 {
00245 TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << ".";
00246
00247 FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
00248 memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
00249 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
00250 if (ret3 != source_rank)
00251 {
00252 TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
00253 throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
00254 }
00255
00256 switch (header.type)
00257 {
00258 case Fragment::EndOfDataFragmentType:
00259 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00260 endOfDataCount = *(frag->dataBegin());
00261 TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << std::to_string(endOfDataCount) << " fragments are expected from rank " << source_rank
00262 << " (recvd " << std::to_string(recv_frag_count_.slotCount(source_rank)) << ").";
00263 break;
00264 case Fragment::InitFragmentType:
00265 TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << ".";
00266 shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
00267 shm_manager_->SetInitFragment(std::move(frag));
00268 break;
00269 case Fragment::EndOfRunFragmentType:
00270 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00271 break;
00272 case Fragment::EndOfSubrunFragmentType:
00273 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00274 break;
00275 case Fragment::ShutdownFragmentType:
00276 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00277 break;
00278 }
00279 }
00280
00281 }
00282
00283
00284 running_sources_.erase(source_rank);
00285 }