3 #define TRACE_NAME "DataReceiverManager"
4 #include "artdaq/DAQdata/Globals.hh"
5 #include "artdaq/DAQrate/DataReceiverManager.hh"
6 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "cetlib_except/exception.h"
// DataReceiverManager constructor: member initializers followed by the body.
// NOTE(review): this is an extracted listing - the constructor signature, all braces and several
// numbered lines (e.g. 13-18, 49-53, 66-67) are not visible here.
// The visible code reads its configuration from `pset`, decides which source ranks are enabled,
// and stores one transfer plugin per entry of the "sources" parameter set in source_plugins_.
11 : stop_requested_(false)
12 , stop_requested_time_(0)
// Receive timeout is configured under the key "receive_timeout_usec"; default 100000.
19 , receive_timeout_(pset.get<size_t>(
"receive_timeout_usec", 100000))
// Stop timeout is configured under the key "stop_timeout_ms"; default 3000.
20 , stop_timeout_ms_(pset.get<size_t>(
"stop_timeout_ms",3000))
22 , non_reliable_mode_enabled_(pset.get<bool>(
"non_reliable_mode", false))
// NOTE(review): the default -1 is requested as size_t, so it presumably becomes the maximum
// size_t value (i.e. "never give up") - confirm this is the intended meaning.
23 , non_reliable_mode_retry_count_(pset.get<size_t>(
"non_reliable_mode_retry_count", -1))
25 TLOG_DEBUG(
"DataReceiverManager") <<
"Constructor" << TLOG_ENDL;
// An empty (or absent) "enabled_sources" list is treated as "every configured source is enabled".
26 auto enabled_srcs = pset.get<std::vector<int>>(
"enabled_sources", std::vector<int>());
27 auto enabled_srcs_empty = enabled_srcs.size() == 0;
// Warn loudly when non-reliable mode is on, since data may then be dropped.
29 if (non_reliable_mode_enabled_)
31 TLOG_WARNING(
"DataReceiverManager") <<
"DataReceiverManager is configured to drop data after " << std::to_string(non_reliable_mode_retry_count_)
32 <<
" failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!" << TLOG_ENDL;
35 if (enabled_srcs_empty)
37 TLOG_INFO(
"DataReceiverManager") <<
"enabled_sources not specified, assuming all sources enabled." << TLOG_ENDL;
// Copy the explicitly enabled ranks into enabled_sources_.
// NOTE(review): presumably this loop is the `else` branch of the check above; the `else` line is not visible.
41 for (
auto& s : enabled_srcs)
43 enabled_sources_.insert(s);
// Walk every named entry of the "sources" parameter set and register its transfer plugin.
47 auto srcs = pset.get<fhicl::ParameterSet>(
"sources", fhicl::ParameterSet());
48 for (
auto& s : srcs.get_pset_names())
// NOTE(review): the line that creates `transfer` (and the enclosing `try`) is not visible in this
// listing - presumably MakeTransferPlugin is called with the per-source configuration; confirm.
54 auto source_rank = transfer->source_rank();
// With no explicit enabled list, every source that was set up successfully becomes enabled.
55 if (enabled_srcs_empty) enabled_sources_.insert(source_rank);
56 source_plugins_[source_rank] = std::move(transfer);
// Setup failures are logged and skipped, so one bad source does not abort construction.
// NOTE(review): exceptions are caught by value here and below; catching by const reference would
// avoid the copy and avoid slicing derived exception types (which can lose the what() message).
58 catch (cet::exception ex)
60 TLOG_WARNING(
"DataReceiverManager") <<
"cet::exception caught while setting up source " << s <<
": " << ex.what() << TLOG_ENDL;
62 catch (std::exception ex)
64 TLOG_WARNING(
"DataReceiverManager") <<
"std::exception caught while setting up source " << s <<
": " << ex.what() << TLOG_ENDL;
// NOTE(review): presumably the body of a catch-all handler; the `catch (...)` line is not visible.
68 TLOG_WARNING(
"DataReceiverManager") <<
"Non-cet exception caught while setting up source " << s <<
"." << TLOG_ENDL;
// Having no sources at all is reported as an error (only a log message is visible here).
71 if (srcs.get_pset_names().size() == 0)
73 TLOG_ERROR(
"DataReceiverManager") <<
"No sources configured!" << TLOG_ENDL;
// DataReceiverManager destructor body: requests a stop and joins every receiver thread.
// NOTE(review): extracted listing - the destructor signature and braces are not visible here.
79 TLOG_TRACE(
"DataReceiverManager") <<
"~DataReceiverManager: BEGIN: Setting stop_requested to true, frags=" << std::to_string(count()) <<
", bytes=" << std::to_string(byteCount()) << TLOG_ENDL;
// The stop time is recorded before the flag is raised; the receiver loop only consults
// stop_requested_time_ once it sees stop_requested_ set.
80 stop_requested_time_ = TimeUtils::gettimeofday_us();
81 stop_requested_ =
true;
83 TLOG_TRACE(
"DataReceiverManager") <<
"~DataReceiverManager: Joining all threads" << TLOG_ENDL;
// Wait for each receiver thread that is still joinable to finish.
84 for (
auto& s : source_threads_)
86 auto& thread = s.second;
87 if (thread.joinable()) thread.join();
90 TLOG_TRACE(
"DataReceiverManager") <<
"Destructor END" << TLOG_ENDL;
// Starts one receiver thread for every configured source whose rank is enabled.
// NOTE(review): extracted listing - the function signature (presumably start_threads()) and braces
// are not visible here.
96 for (
auto& source : source_plugins_)
98 auto& rank = source.first;
// Sources that are configured but not in enabled_sources_ get no thread.
99 if (enabled_sources_.count(rank))
// The rank is recorded as running before its thread is created.
101 running_sources_.insert(rank);
102 boost::thread::attributes attrs;
// Requests a stack size of 4096 * 500 = 2,048,000 bytes for the receiver thread.
103 attrs.set_stack_size(4096 * 500);
// The thread runs runReceiver_(rank) on this object; the handle is kept in source_threads_.
104 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_,
this, rank));
// Receive loop for a single source rank.
// Repeatedly receives a fragment header from the source's transfer plugin, then either
//   - for data-like fragments: obtains a write location from shm_manager_, receives the payload into
//     it, marks the fragment done and reports statistics/metrics, or
//   - for system fragments: receives the payload into a locally allocated Fragment and handles it by type.
// The loop ends once a stop was requested and the stop timeout has elapsed, or the rank is not enabled.
// NOTE(review): extracted listing - braces and several numbered lines (for example the declarations of
// `ret` and `retries`, the `switch` statement and most `case` bodies) are not visible here.
//
// @param source_rank Rank of the source this loop serves; used as the key into source_plugins_.
109 void artdaq::DataReceiverManager::runReceiver_(
int source_rank)
111 std::chrono::steady_clock::time_point start_time, after_header, before_body;
113 double delta_t, hdr_delta_t, store_delta_t, data_delta_t;
114 detail::RawFragmentHeader header;
// -1 converted to size_t (the maximum value) is the "no EndOfData fragment seen yet" sentinel.
115 size_t endOfDataCount = -1;
// Sleep between shared-memory retries: 1/100 of the receive timeout, capped at 100000.
// NOTE(review): assuming receive_timeout_ is an integer (it is initialized from get<size_t>), a value
// below 100 makes sleep_time 0 and the division on the next statement divides by zero - confirm/guard.
116 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
// NOTE(review): if both operands are integers the division truncates before ceil() is applied,
// so ceil() has no effect - confirm whether a floating-point division was intended.
117 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
// Keep going until (stop requested AND more than stop_timeout_ms_ milliseconds have passed since the
// request), and only while this rank is in enabled_sources_.
119 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
121 TRACE(16,
"DataReceiverManager::runReceiver_: Begin loop");
// NOTE(review): receive_timeout_ is configured via "receive_timeout_usec" but is overwritten here with
// a value named in milliseconds, and it is a member written from a receiver thread - confirm the units
// and that concurrent writes from several receiver threads are acceptable.
122 if (stop_requested_) { receive_timeout_ = stop_timeout_ms_; }
124 start_time = std::chrono::steady_clock::now();
126 TRACE(16,
"DataReceiverManager::runReceiver_: Calling receiveFragmentHeader");
// A successful header receive is expected to return the source rank (see the trace message below).
127 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
128 TRACE(16,
"DataReceiverManager::runReceiver_: Done with receiveFragmentHeader, ret=%d (should be %d)", ret, source_rank);
// NOTE(review): the body of this check is not visible; presumably the iteration is abandoned
// when no header was received from this rank.
129 if (ret != source_rank)
134 after_header = std::chrono::steady_clock::now();
// Data-like fragments (user types, Data, Empty, Container) are written into shared memory.
136 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
137 TLOG_TRACE(
"DataReceiverManager") <<
"Received Fragment Header from rank " << source_rank <<
"." << TLOG_ENDL;
138 RawDataType* loc =
nullptr;
// Retry until shm_manager_ hands back a location for the payload.
140 while (loc ==
nullptr )
142 loc = shm_manager_->WriteFragmentHeader(header);
143 if (loc ==
nullptr) usleep(sleep_time);
// NOTE(review): this returns whenever a stop is requested, even if `loc` was just obtained; in that
// case the payload is never received and DoneWritingFragment is never called for this header - confirm.
144 if (stop_requested_)
return;
// In non-reliable mode, after too many retries the header is written again with a second argument of
// `true`. NOTE(review): the meaning of that flag and the increment of `retries` are not visible here -
// presumably it tells the manager to discard the fragment; confirm.
146 if (non_reliable_mode_enabled_ && retries > max_retries)
148 loc = shm_manager_->WriteFragmentHeader(header,
true);
// NOTE(review): the condition guarding this error message is not visible in this listing.
154 TLOG_ERROR(
"DataReceiverManager") <<
"runReceiver_: Could not get data location for event " << std::to_string(header.sequence_id) << TLOG_ENDL;
157 before_body = std::chrono::steady_clock::now();
159 TRACE(16,
"DataReceiverManager::runReceiver_: Calling receiveFragmentData");
// The payload length is the total word count minus the header words already received.
160 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
161 TRACE(16,
"DataReceiverManager::runReceiver_: Done with receiveFragmentData, ret2=%d (should be %d)", ret2, source_rank);
// A payload return code that differs from the header's is logged and then thrown as cet::exception.
// NOTE(review): the `if` line guarding this is not visible in this listing.
164 TLOG_ERROR(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret <<
", Got: " << ret2 <<
")" << TLOG_ENDL;
165 throw cet::exception(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret <<
", Got: " << ret2 <<
")";
// Tell the shared memory manager that this fragment is completely written.
168 shm_manager_->DoneWritingFragment(header);
169 TLOG_TRACE(
"DataReceiverManager") <<
"Done receiving fragment with sequence ID " << std::to_string(header.sequence_id) <<
" from rank " << source_rank << TLOG_ENDL;
// Per-rank bookkeeping: fragment count, bytes received and the latest sequence ID.
171 recv_frag_count_.incSlot(source_rank);
172 recv_frag_size_.incSlot(source_rank, header.word_count *
sizeof(RawDataType));
173 recv_seq_count_.setSlot(source_rank, header.sequence_id);
// Progress is only logged once an EndOfData fragment has replaced the sentinel value.
174 if (endOfDataCount != static_cast<size_t>(-1))
176 TLOG_DEBUG(
"DataReceiverManager") <<
"Received fragment " << std::to_string(header.sequence_id) <<
" from rank " << source_rank
177 <<
" (" << std::to_string(recv_frag_count_.slotCount(source_rank)) <<
"/" << std::to_string(endOfDataCount) <<
")" << TLOG_ENDL;
// Timing breakdown for metrics: whole receive, header receive, wait for shared memory, payload receive.
// NOTE(review): the one-argument GetElapsedTime calls presumably measure up to "now" - confirm.
182 TRACE(6,
"DataReceiverManager::runReceiver_: Sending receive stats");
183 delta_t = TimeUtils::GetElapsedTime(start_time);
184 hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
185 store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
186 data_delta_t = TimeUtils::GetElapsedTime(before_body);
// Whole-fragment metrics (time, size in bytes, rate).
187 metricMan->sendMetric(
"Total Receive Time From Rank " + std::to_string(source_rank), delta_t,
"s", 1, MetricMode::Accumulate);
188 metricMan->sendMetric(
"Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.word_count *
sizeof(RawDataType)),
"B", 1, MetricMode::Accumulate);
189 metricMan->sendMetric(
"Total Receive Rate From Rank " + std::to_string(source_rank), header.word_count *
sizeof(RawDataType) / delta_t,
"B/s", 1, MetricMode::Average);
// Header-only metrics.
191 metricMan->sendMetric(
"Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t,
"s", 1, MetricMode::Accumulate);
192 metricMan->sendMetric(
"Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.num_words() *
sizeof(RawDataType)),
"B", 1, MetricMode::Accumulate);
193 metricMan->sendMetric(
"Header Receive Rate From Rank " + std::to_string(source_rank), header.num_words() *
sizeof(RawDataType) / hdr_delta_t,
"B/s", 1, MetricMode::Average);
// Time spent between receiving the header and starting the payload receive.
195 metricMan->sendMetric(
"Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t,
"s", 1, MetricMode::Accumulate);
// Payload-only metrics.
197 metricMan->sendMetric(
"Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t,
"s", 1, MetricMode::Accumulate);
198 metricMan->sendMetric(
"Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>((header.word_count - header.num_words()) *
sizeof(RawDataType)),
"B", 1, MetricMode::Accumulate);
199 metricMan->sendMetric(
"Data Receive Rate From Rank " + std::to_string(source_rank), (header.word_count - header.num_words()) *
sizeof(RawDataType) / data_delta_t,
"B/s", 1, MetricMode::Average);
200 metricMan->sendMetric(
"Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank),
"fragments", 3, MetricMode::Accumulate);
201 TRACE(6,
"DataReceiverManager::runReceiver_: Done sending receive stats");
// System fragments (EndOfData, Init, EndOfRun, EndOfSubrun, Shutdown) are received into a local Fragment.
204 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
206 TLOG_DEBUG(
"DataReceiverManager") <<
"Received System Fragment from rank " << source_rank <<
" of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) <<
"." << TLOG_ENDL;
// Allocate a Fragment sized for the payload, copy the already-received header into it,
// then receive the payload directly behind the header words.
208 FragmentPtr frag(
new Fragment(header.word_count - header.num_words()));
209 memcpy(frag->headerAddress(), &header, header.num_words() *
sizeof(RawDataType));
210 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
// Any return code other than the source rank is logged and then thrown as cet::exception.
211 if (ret3 != source_rank)
213 TLOG_ERROR(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank <<
", Got: " << ret3 <<
")" << TLOG_ENDL;
214 throw cet::exception(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank <<
", Got: " << ret3 <<
")";
// Dispatch on the system fragment type. NOTE(review): the `switch` line, the `break`s and the bodies
// of the EndOfRun/EndOfSubrun/Shutdown cases are not visible in this listing.
219 case Fragment::EndOfDataFragmentType:
// The first payload word is read as the number of fragments expected from this rank.
221 endOfDataCount = *(frag->dataBegin());
222 TLOG_DEBUG(
"DataReceiverManager") <<
"EndOfData Fragment indicates that " << std::to_string(endOfDataCount) <<
" fragments are expected from rank " << source_rank
223 <<
" (recvd " << std::to_string(recv_frag_count_.slotCount(source_rank)) <<
")." << TLOG_ENDL;
225 case Fragment::InitFragmentType:
226 TLOG_DEBUG(
"DataReceiverManager") <<
"Received Init Fragment from rank " << source_rank <<
"." << TLOG_ENDL;
// Ownership of the Init fragment is moved into the shared memory manager.
228 shm_manager_->SetInitFragment(std::move(frag));
230 case Fragment::EndOfRunFragmentType:
233 case Fragment::EndOfSubrunFragmentType:
236 case Fragment::ShutdownFragmentType:
// Once at least endOfDataCount fragments have been counted for this rank, it is removed from
// running_sources_. While endOfDataCount still holds the sentinel (maximum size_t), this comparison
// can only be true if the count itself has reached that maximum.
// NOTE(review): running_sources_ is modified from the receiver thread; no locking is visible here - confirm.
242 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank))
244 running_sources_.erase(source_rank);
End of Run mode (Used to end request processing on receiver)
This TransferInterface is a Receiver.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.