3 #define TRACE_NAME "DataReceiverManager"
4 #include "artdaq/DAQdata/Globals.hh"
5 #include "artdaq/DAQrate/DataReceiverManager.hh"
6 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "cetlib_except/exception.h"
// Member-initializer list and constructor body of DataReceiverManager as they
// appear in this listing. The constructor signature and several listing lines
// (including braces and the `try` that the handlers below belong to) are not
// visible here.
12 : stop_requested_(false)
13 , stop_requested_time_(0)
16 , source_metric_data_()
17 , source_metric_send_time_()
// Configuration values are read from `pset`; the second argument of each get<>()
// call is the default supplied by this code.
23 , receive_timeout_(pset.get<size_t>(
"receive_timeout_usec", 100000))
24 , stop_timeout_ms_(pset.get<size_t>(
"stop_timeout_ms", 1500))
26 , non_reliable_mode_enabled_(pset.get<bool>(
"non_reliable_mode", false))
// NOTE(review): the default -1 is requested as size_t, i.e. it converts to the
// largest size_t value.
27 , non_reliable_mode_retry_count_(pset.get<size_t>(
"non_reliable_mode_retry_count", -1))
29 TLOG(TLVL_DEBUG) <<
"Constructor";
// Optional list of enabled source ranks; an empty list means "all enabled"
// (see the INFO message below and the per-source handling further down).
30 auto enabled_srcs = pset.get<std::vector<int>>(
"enabled_sources", std::vector<int>());
31 auto enabled_srcs_empty = enabled_srcs.size() == 0;
// Warn when non-reliable mode is switched on.
33 if (non_reliable_mode_enabled_)
35 TLOG(TLVL_WARNING) <<
"DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_
36 <<
" failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
39 if (enabled_srcs_empty)
41 TLOG(TLVL_INFO) <<
"enabled_sources not specified, assuming all sources enabled.";
// Mark every listed rank as enabled.
45 for (
auto& s : enabled_srcs)
47 enabled_sources_[s] =
true;
// First pass over the "sources" table: fetches each source's ParameterSet.
// What is done with src_pset in this loop is not visible in this listing.
52 auto srcs = pset.get<fhicl::ParameterSet>(
"sources", fhicl::ParameterSet());
53 for (
auto& s : srcs.get_pset_names())
55 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
// Second pass: build srcs_mod, a copy of "sources" in which every entry's
// "host_map" is replaced by host_map_pset (defined on a line not visible here).
59 fhicl::ParameterSet srcs_mod;
60 for (
auto& s : srcs.get_pset_names())
62 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
63 src_pset.erase(
"host_map");
64 src_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
65 srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
// Per-source setup. `transfer` is created on a line not visible here.
68 for (
auto& s : srcs_mod.get_pset_names())
74 auto source_rank = transfer->source_rank();
// With no explicit enabled list every source is enabled; otherwise a rank that
// was not listed is recorded as disabled.
75 if (enabled_srcs_empty) enabled_sources_[source_rank] =
true;
76 else if (!enabled_sources_.count(source_rank)) enabled_sources_[source_rank] =
false;
// Sources start out not running; the plugin is stored by rank and the
// per-rank metric bookkeeping is initialised.
77 running_sources_[source_rank] =
false;
78 source_plugins_[source_rank] = std::move(transfer);
79 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
80 source_metric_data_[source_rank] = source_metric_data();
// Failures while setting up a source are logged as warnings.
// NOTE(review): both handlers catch by value; for std::exception this slices
// derived exceptions, so what() may lose the original message. Catching by
// const reference would avoid that.
82 catch (cet::exception ex)
84 TLOG(TLVL_WARNING) <<
"cet::exception caught while setting up source " << s <<
": " << ex.what();
86 catch (std::exception ex)
88 TLOG(TLVL_WARNING) <<
"std::exception caught while setting up source " << s <<
": " << ex.what();
// Presumably the body of a catch-all handler whose `catch` line is not visible
// here — TODO confirm.
92 TLOG(TLVL_WARNING) <<
"Non-cet exception caught while setting up source " << s <<
".";
// An empty "sources" table is reported as an error.
95 if (srcs.get_pset_names().size() == 0)
97 TLOG(TLVL_ERROR) <<
"No sources configured!";
// Destructor body as visible in this listing: traces entry, releases this
// object's reference to the shared-memory manager, traces exit.
// Listing line 104 (between the first trace and the reset) is not visible here.
103 TLOG(TLVL_TRACE) <<
"~DataReceiverManager: BEGIN";
105 shm_manager_.reset();
106 TLOG(TLVL_TRACE) <<
"Destructor END";
// Body of start_threads() as visible in this listing (signature and braces are
// not visible): clears the stop flag, then starts one receiver thread per
// enabled source.
112 stop_requested_ =
false;
114 for (
auto& source : source_plugins_)
116 auto& rank = source.first;
// Only ranks present in enabled_sources_ with a true flag get a thread.
117 if (enabled_sources_.count(rank) && enabled_sources_[rank].load())
119 running_sources_[rank] =
true;
// Request a stack size of 4096 * 500 = 2,048,000 for the receiver thread, then
// launch runReceiver_(rank) on it and keep the thread object keyed by rank.
120 boost::thread::attributes attrs;
121 attrs.set_stack_size(4096 * 500);
122 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_,
this, rank));
// Body of stop_threads() as visible in this listing (signature and braces are
// not visible): records when the stop was requested, raises the stop flag and
// waits for every receiver thread to finish.
129 TLOG(TLVL_TRACE) <<
"stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() <<
", bytes=" << byteCount();
// The timestamp is stored before the flag is set; runReceiver_ compares against
// it to apply stop_timeout_ms_.
131 stop_requested_time_ = TimeUtils::gettimeofday_us();
132 stop_requested_ =
true;
134 TLOG(TLVL_TRACE) <<
"stop_threads: Joining all threads";
135 for (
auto& s : source_threads_)
137 auto& thread = s.second;
// Skip threads that are not joinable.
138 if (thread.joinable()) thread.join();
// Body fragment: collects the keys of enabled_sources_ whose flag is true into
// a std::set<int>. The signature and the return statement are not visible in
// this listing.
144 std::set<int> output;
145 for (
auto& src : enabled_sources_)
147 if (src.second) output.insert(src.first);
// Body fragment: collects the keys of running_sources_ whose flag is true into
// a std::set<int>. The signature and the return statement are not visible in
// this listing.
154 std::set<int> output;
155 for (
auto& src : running_sources_)
157 if (src.second) output.insert(src.first);
// Receive loop for a single source rank; start_threads() runs it on a dedicated
// thread. For each fragment it receives the header, then either writes the
// payload through shm_manager_ (data-type fragments) or builds a Fragment and
// handles it by type (system fragments).
// This listing omits braces and several statements (e.g. the declarations of
// `ret` and `retries`, the `switch`, and some conditions), so the notes below
// describe only what the visible lines show.
// @param source_rank Rank used to index the per-source maps and counters.
162 void artdaq::DataReceiverManager::runReceiver_(
int source_rank)
164 std::chrono::steady_clock::time_point start_time, after_header, before_body;
166 detail::RawFragmentHeader header;
// -1 converted to size_t acts as the "no EndOfData count received yet" sentinel
// (compare the static_cast<size_t>(-1) check further down).
167 size_t endOfDataCount = -1;
// Sleep between shared-memory retries: receive_timeout_ / 100, capped at 100000.
168 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
// NOTE(review): if receive_timeout_ is below 100, sleep_time is 0 and the
// division below has a zero divisor. receive_timeout_ is initialised from a
// size_t value, so the quotient is presumably an integer and ceil() has no
// effect — confirm the member's declared type.
169 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
// Keep looping until a stop has been requested AND more than stop_timeout_ms_
// (converted to microseconds) has passed since the request, and only while
// source_rank is present in enabled_sources_.
171 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
173 TLOG(16) <<
"runReceiver_: Begin loop";
174 std::this_thread::yield();
// End-of-data exit: the received count has reached endOfDataCount and the
// transfer plugin reports it is no longer running.
177 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
179 TLOG(TLVL_DEBUG) <<
"runReceiver_: End of Data conditions met, ending runReceiver loop";
183 start_time = std::chrono::steady_clock::now();
// Receive the next fragment header; the return value is expected to equal
// source_rank.
185 TLOG(16) <<
"runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_;
186 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
187 TLOG(16) <<
"runReceiver_: Done with receiveFragmentHeader, ret=" << ret <<
" (should be " << source_rank <<
")";
// Unexpected return value. The conditions selecting between the two messages
// below are not visible in this listing.
188 if (ret != source_rank)
191 TLOG(TLVL_WARNING) <<
"Received Fragment from rank " << ret <<
", but was expecting one from rank " << source_rank <<
"!";
195 TLOG(TLVL_ERROR) <<
"Transfer Plugin returned DATA_END, ending receive loop!";
201 after_header = std::chrono::steady_clock::now();
// Data-carrying fragment types: the payload is written to the location
// returned by shm_manager_->WriteFragmentHeader().
203 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
204 TLOG(TLVL_TRACE) <<
"Received Fragment Header from rank " << source_rank <<
".";
205 RawDataType* loc =
nullptr;
// Retry until a write location is returned, sleeping sleep_time microseconds
// after each failed attempt; return from the function if a stop is requested.
207 while (loc ==
nullptr)
209 loc = shm_manager_->WriteFragmentHeader(header);
210 if (loc ==
nullptr) usleep(sleep_time);
211 if (stop_requested_)
return;
// In non-reliable mode, once `retries` exceeds max_retries the header is
// written with a second argument of true.
// NOTE(review): presumably this asks the manager to drop the fragment —
// confirm against SharedMemoryEventManager. The update of `retries` is not
// visible in this listing.
213 if (non_reliable_mode_enabled_ && retries > max_retries)
215 loc = shm_manager_->WriteFragmentHeader(header,
true);
221 TLOG(TLVL_ERROR) <<
"runReceiver_: Could not get data location for event " << header.sequence_id;
224 before_body = std::chrono::steady_clock::now();
// Receive the payload (total word count minus header words) into loc.
226 TLOG(16) <<
"runReceiver_: Calling receiveFragmentData";
227 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
228 TLOG(16) <<
"runReceiver_: Done with receiveFragmentData, ret2=" << ret2 <<
" (should be " << source_rank <<
")";
// Error path for an unexpected return code: logs and throws cet::exception.
// The guarding condition is not visible in this listing.
231 TLOG(TLVL_ERROR) <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret <<
", Got: " << ret2 <<
")";
232 throw cet::exception(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret <<
", Got: " << ret2 <<
")";
// Tell the manager the fragment is complete, then update per-rank counters:
// fragment count, byte count, and most recent sequence ID.
235 shm_manager_->DoneWritingFragment(header);
236 TLOG(TLVL_TRACE) <<
"Done receiving fragment with sequence ID " << header.sequence_id <<
" from rank " << source_rank;
238 recv_frag_count_.incSlot(source_rank);
239 recv_frag_size_.incSlot(source_rank, header.word_count *
sizeof(RawDataType));
240 recv_seq_count_.setSlot(source_rank, header.sequence_id);
// Once an EndOfData count is known, log progress towards it.
241 if (endOfDataCount != static_cast<size_t>(-1))
243 TLOG(TLVL_DEBUG) <<
"Received fragment " << header.sequence_id <<
" from rank " << source_rank
244 <<
" (" << recv_frag_count_.slotCount(source_rank) <<
"/" << endOfDataCount <<
")";
// Accumulate timing for this fragment: total, header receive, shared-memory
// wait (after_header -> before_body) and payload receive.
248 source_metric_data_[source_rank].delta_t += TimeUtils::GetElapsedTime(start_time);
249 source_metric_data_[source_rank].hdr_delta_t += TimeUtils::GetElapsedTime(start_time, after_header);
250 source_metric_data_[source_rank].store_delta_t += TimeUtils::GetElapsedTime(after_header, before_body);
251 source_metric_data_[source_rank].data_delta_t += TimeUtils::GetElapsedTime(before_body);
// Accumulate sizes in bytes; data_size uses the full word count (header
// included), header_size only the header words.
253 source_metric_data_[source_rank].data_size += header.word_count *
sizeof(RawDataType);
254 source_metric_data_[source_rank].header_size += header.num_words() *
sizeof(RawDataType);
// Send accumulated metrics when a metric manager exists and GetElapsedTime
// since the last send exceeds 1 (the time metrics below are labelled "s").
// NOTE(review): the rate metrics divide by accumulated time deltas; a zero
// delta would give a zero divisor — confirm the field types and whether that
// can occur.
256 if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1)
258 TLOG(6) <<
"runReceiver_: Sending receive stats";
259 metricMan->sendMetric(
"Total Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].delta_t,
"s", 5, MetricMode::Accumulate);
260 metricMan->sendMetric(
"Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].data_size),
"B", 5, MetricMode::Accumulate);
261 metricMan->sendMetric(
"Total Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_size / source_metric_data_[source_rank].delta_t,
"B/s", 5, MetricMode::Average);
263 metricMan->sendMetric(
"Header Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].hdr_delta_t,
"s", 5, MetricMode::Accumulate);
264 metricMan->sendMetric(
"Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].header_size),
"B", 5, MetricMode::Accumulate);
265 metricMan->sendMetric(
"Header Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].header_size / source_metric_data_[source_rank].hdr_delta_t,
"B/s", 5, MetricMode::Average);
// Payload size is the accumulated data size minus the accumulated header size.
267 auto payloadSize = source_metric_data_[source_rank].data_size - source_metric_data_[source_rank].header_size;
268 metricMan->sendMetric(
"Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_delta_t,
"s", 5, MetricMode::Accumulate);
269 metricMan->sendMetric(
"Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(payloadSize),
"B", 5, MetricMode::Accumulate);
270 metricMan->sendMetric(
"Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / source_metric_data_[source_rank].data_delta_t,
"B/s", 5, MetricMode::Average);
272 metricMan->sendMetric(
"Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank),
"fragments", 3, MetricMode::LastPoint);
274 metricMan->sendMetric(
"Shared Memory Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].store_delta_t,
"s", 5, MetricMode::Accumulate);
276 TLOG(6) <<
"runReceiver_: Done sending receive stats";
// Restart the accumulation window for this rank.
278 source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
279 source_metric_data_[source_rank] = source_metric_data();
// System fragment types are received into a freshly allocated Fragment instead
// of being written through WriteFragmentHeader().
282 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
284 TLOG(TLVL_DEBUG) <<
"Received System Fragment from rank " << source_rank <<
" of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) <<
".";
// Construct a Fragment from the payload word count, copy the already-received
// header into it, then receive the payload directly after the header words.
286 FragmentPtr frag(
new Fragment(header.word_count - header.num_words()));
287 memcpy(frag->headerAddress(), &header, header.num_words() *
sizeof(RawDataType));
288 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
// A return value other than source_rank is logged and raised as cet::exception.
289 if (ret3 != source_rank)
291 TLOG(TLVL_ERROR) <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank <<
", Got: " << ret3 <<
")";
292 throw cet::exception(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank <<
", Got: " << ret3 <<
")";
// Per-type handling. The enclosing `switch` and the break statements are not
// visible in this listing.
// EndOfData: the first data word is taken as the number of fragments expected
// from this rank.
297 case Fragment::EndOfDataFragmentType:
299 endOfDataCount = *(frag->dataBegin());
300 TLOG(TLVL_DEBUG) <<
"EndOfData Fragment indicates that " << endOfDataCount <<
" fragments are expected from rank " << source_rank
301 <<
" (recvd " << recv_frag_count_.slotCount(source_rank) <<
").";
// Init: ownership of the fragment is moved to the shared-memory manager.
303 case Fragment::InitFragmentType:
304 TLOG(TLVL_DEBUG) <<
"Received Init Fragment from rank " << source_rank <<
".";
306 shm_manager_->SetInitFragment(std::move(frag));
// EndOfRun: its handling is not visible in this listing.
308 case Fragment::EndOfRunFragmentType:
// EndOfSubrun: roll over the subrun at the header's sequence ID, or, if that
// is invalid, at the value stored in recv_seq_count_ for this rank.
312 case Fragment::EndOfSubrunFragmentType:
314 if (header.sequence_id != Fragment::InvalidSequenceID) shm_manager_->rolloverSubrun(header.sequence_id);
315 else shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank));
// Shutdown: its handling is not visible in this listing.
317 case Fragment::ShutdownFragmentType:
// Record that this rank is no longer running.
325 running_sources_[source_rank] =
false;
std::set< int > running_sources() const
Get the list of sources which are still receiving data.
End of Run mode (Used to end request processing on receiver)
This TransferInterface is a Receiver.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
std::set< int > enabled_sources() const
Get the list of enabled sources.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, DestinationInfo > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Value that is to be returned when a Transfer plugin determines that no more data will be arriving.
std::map< int, DestinationInfo > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, int masterPortOffset=0, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.
void stop_threads()
Stop receiver threads.