1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataReceiverManager").c_str()
4 #include "artdaq/DAQdata/HostMap.hh"
5 #include "artdaq/DAQrate/DataReceiverManager.hh"
6 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
8 #include "cetlib_except/exception.h"
9 #include "fhiclcpp/ParameterSet.h"
11 #include <boost/bind.hpp>
12 #include <boost/exception/all.hpp>
13 #include <boost/thread.hpp>
// DataReceiverManager constructor: member-initializer list and body.
// NOTE: the signature line and several body lines (braces, try, else, and the
// statements that create `host_map_pset` and `transfer`) are absent from this listing.
//
// Reads its settings from `pset`, takes the SharedMemoryEventManager handle `shm`,
// builds a modified per-source configuration (`srcs_mod`) and records, per source
// rank, the transfer plugin plus its enabled/running flags.
21 : stop_requested_(false)
22 , stop_requested_time_(0)
// "receive_timeout_usec" defaults to 100000 when not present in pset.
26 , receive_timeout_(pset.get<size_t>(
"receive_timeout_usec", 100000))
// "stop_timeout_ms" defaults to 1500 when not present in pset.
27 , stop_timeout_ms_(pset.get<size_t>(
"stop_timeout_ms", 1500))
// NOTE(review): the nested std::move is redundant; a single std::move(shm) has the same effect.
28 , shm_manager_(std::move(std::move(shm)))
29 , non_reliable_mode_enabled_(pset.get<bool>(
"non_reliable_mode", false))
// NOTE(review): the default -1 is requested as size_t, so it converts to the
// largest size_t value rather than a negative number.
30 , non_reliable_mode_retry_count_(pset.get<size_t>(
"non_reliable_mode_retry_count", -1))
32 TLOG(TLVL_DEBUG + 32) <<
"Constructor";
// Optional explicit list of enabled source ranks; an empty list is treated as
// "all sources enabled" (see the INFO message below and the per-source logic
// in the plugin loop).
33 auto enabled_srcs = pset.get<std::vector<int>>(
"enabled_sources", std::vector<int>());
34 auto enabled_srcs_empty = enabled_srcs.empty();
// Warn when non-reliable mode is enabled.
36 if (non_reliable_mode_enabled_)
38 TLOG(TLVL_WARNING) <<
"DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_
39 <<
" failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
42 if (enabled_srcs_empty)
44 TLOG(TLVL_INFO) <<
"enabled_sources not specified, assuming all sources enabled.";
// Mark every explicitly listed rank as enabled.
// (Presumably the else-branch of the check above; the `else` line is not in this listing.)
48 for (
auto& s : enabled_srcs)
50 enabled_sources_[s] =
true;
// Manager-level values that are copied into each source's configuration below
// when they are non-zero and the source does not set its own value.
55 auto tcp_receive_buffer_size = pset.get<
size_t>(
"tcp_receive_buffer_size", 0);
56 auto max_fragment_size_words = pset.get<
size_t>(
"max_fragment_size_words", 0);
58 auto srcs = pset.get<fhicl::ParameterSet>(
"sources", fhicl::ParameterSet());
// First pass over the configured sources. Only the retrieval of each source's
// ParameterSet is visible here; the rest of the loop body is missing from the listing.
59 for (
auto& s : srcs.get_pset_names())
61 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
// Second pass: build `srcs_mod`, a copy of each source configuration whose
// "host_map" is replaced by `host_map_pset` and which receives the manager-level
// buffer/fragment sizes when applicable.
// (`host_map_pset` is defined on lines not present in this listing.)
65 fhicl::ParameterSet srcs_mod;
66 for (
auto& s : srcs.get_pset_names())
68 auto src_pset = srcs.get<fhicl::ParameterSet>(s);
69 src_pset.erase(
"host_map");
70 src_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
// A per-source setting takes precedence over the manager-level one.
72 if (tcp_receive_buffer_size != 0 && !src_pset.has_key(
"tcp_receive_buffer_size"))
74 src_pset.put<
size_t>(
"tcp_receive_buffer_size", tcp_receive_buffer_size);
76 if (max_fragment_size_words != 0 && !src_pset.has_key(
"max_fragment_size_words"))
78 src_pset.put<
size_t>(
"max_fragment_size_words", max_fragment_size_words);
81 srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
// Third pass: register one transfer plugin per modified source configuration.
// (The statement that creates `transfer` is not present in this listing.)
84 for (
auto& s : srcs_mod.get_pset_names())
90 auto source_rank = transfer->source_rank();
// With no explicit enabled list every source is enabled; otherwise a rank that
// was not listed is recorded as disabled.
91 if (enabled_srcs_empty)
93 enabled_sources_[source_rank] =
true;
95 else if (enabled_sources_.count(source_rank) == 0u)
97 enabled_sources_[source_rank] =
false;
// No receiver thread exists yet, so the source starts as not running.
99 running_sources_[source_rank] =
false;
100 source_plugins_[source_rank] = std::move(transfer);
// Failures while setting up a source are reported as warnings naming the source.
102 catch (
const cet::exception& ex)
104 TLOG(TLVL_WARNING) <<
"cet::exception caught while setting up source " << s <<
": " << ex.what();
106 catch (
const std::exception& ex)
108 TLOG(TLVL_WARNING) <<
"std::exception caught while setting up source " << s <<
": " << ex.what();
112 TLOG(TLVL_WARNING) <<
"Non-cet exception caught while setting up source " << s <<
".";
// An empty "sources" table is reported as an error.
115 if (srcs.get_pset_names().empty())
117 TLOG(TLVL_ERROR) <<
"No sources configured!";
// Destructor body (signature and original line 124 are absent from this listing).
// Logs entry, calls reset() on shm_manager_, then logs exit.
123 TLOG(TLVL_DEBUG + 33) <<
"~DataReceiverManager: BEGIN";
125 shm_manager_.reset();
126 TLOG(TLVL_DEBUG + 33) <<
"Destructor END";
// start_threads body (signature, braces, the `try` line and the declaration of
// `tname` are absent from this listing).
// Clears the stop flag, then for every registered source plugin whose rank is
// enabled: zeroes its receive counters, marks it running and launches a
// receiver thread executing runReceiver_(rank).
131 stop_requested_ =
false;
136 for (
auto& source : source_plugins_)
138 auto& rank = source.first;
// Only ranks present in enabled_sources_ with a true flag get a thread.
139 if ((enabled_sources_.count(rank) != 0u) && enabled_sources_[rank].load())
// Reset the per-rank fragment count, byte count and last-sequence-ID slots.
141 recv_frag_count_.setSlot(rank, 0);
142 recv_frag_size_.setSlot(rank, 0);
143 recv_seq_count_.setSlot(rank, 0);
145 running_sources_[rank] =
true;
// Request a stack of 4096 * 2000 = 8,192,000 for the receiver thread.
146 boost::thread::attributes attrs;
147 attrs.set_stack_size(4096 * 2000);
150 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_,
this, rank));
// Name the thread "<source rank>-<my_rank> RECV".
// NOTE(review): snprintf already reserves room for the terminator within the size
// it is given, so passing sizeof(tname) - 1 gives up one usable character; the
// explicit termination on the next statement is then redundant but harmless.
152 snprintf(tname,
sizeof(tname) - 1,
"%d-%d RECV", rank, my_rank);
153 tname[
sizeof(tname) - 1] =
'\0';
154 auto handle = source_threads_[rank].native_handle();
155 pthread_setname_np(handle, tname);
// A boost::exception raised while starting the thread is reported both through
// TLOG and on std::cerr, together with the current errno.
157 catch (
const boost::exception& e)
159 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Receiver " << rank <<
" thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
160 std::cerr <<
"Caught boost::exception starting Receiver " << rank <<
" thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// stop_threads body (signature, braces, `else`/`try` lines and the statement at
// original line 180 inside the wait loop are absent from this listing).
// Requests a stop, waits for the receiver threads to report that they are no
// longer running, then joins and discards all thread objects.
169 TLOG(TLVL_DEBUG + 33) <<
"stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() <<
", bytes=" << byteCount();
// The timestamp is stored before the flag is raised; the runReceiver_ loop
// condition reads both stop_requested_ and stop_requested_time_.
171 stop_requested_time_ = TimeUtils::gettimeofday_us();
172 stop_requested_ =
true;
174 auto initial_count = running_sources().size();
175 TLOG(TLVL_DEBUG + 33) <<
"stop_threads: Waiting for " << initial_count <<
" running receiver threads to stop";
176 auto wait_start = std::chrono::steady_clock::now();
177 auto last_report = std::chrono::steady_clock::now();
// Wait until no source is reported as running, giving up once the elapsed
// time reaches 60.0 (seconds, per the " s for " progress message below).
178 while (!running_sources().empty() && TimeUtils::GetElapsedTime(wait_start) < 60.0)
// Emit a progress message when more than 1.0 has elapsed since the last one.
181 if (TimeUtils::GetElapsedTime(last_report) > 1.0)
183 TLOG(TLVL_DEBUG + 32) <<
"stop_threads: Waited " << TimeUtils::GetElapsedTime(wait_start) <<
" s for " << initial_count
184 <<
" receiver threads to end (" << running_sources().size() <<
" remain)";
185 last_report = std::chrono::steady_clock::now();
// Sources still running after the wait are reported; the join below is
// attempted regardless.
188 if (!running_sources().empty())
190 TLOG(TLVL_WARNING) <<
"stop_threads: Timeout expired while waiting for all receiver threads to end. There are "
191 << running_sources().size() <<
" threads remaining.";
194 TLOG(TLVL_DEBUG + 33) <<
"stop_threads: Joining " << source_threads_.size() <<
" receiver threads";
195 for (
auto& source_thread : source_threads_)
197 TLOG(TLVL_DEBUG + 33) <<
"stop_threads: Joining thread for source_rank " << source_thread.first;
200 if (source_thread.second.joinable())
202 source_thread.second.join();
// Presumably the else-branch of the joinable() check (the `else` line is not in this listing).
206 TLOG(TLVL_ERROR) <<
"stop_threads: Thread for source rank " << source_thread.first <<
" is not joinable!";
// Drop all thread objects.
214 source_threads_.clear();
216 TLOG(TLVL_DEBUG + 33) <<
"stop_threads: END";
// enabled_sources body (signature, braces, return statement and original lines
// 223-225 are absent from this listing).
// Collects ranks (the keys of enabled_sources_) into a std::set<int>.
// NOTE(review): the missing lines presumably filter on the entry's enabled flag
// before the insert — confirm against the full source.
221 std::set<int> output;
222 for (
auto& src : enabled_sources_)
226 output.insert(src.first);
// running_sources body (signature, braces, return statement and original lines
// 236-238 are absent from this listing).
// Collects ranks (the keys of running_sources_) into a std::set<int>.
// NOTE(review): the missing lines presumably filter on the entry's running flag
// before the insert — confirm against the full source.
234 std::set<int> output;
235 for (
auto& src : running_sources_)
239 output.insert(src.first);
// Receive loop for a single source.
// NOTE: braces, `break`/`continue`/`else`/`switch` lines and some declarations
// (e.g. `ret`, `retries`) are absent from this listing.
//
// @param source_rank Rank of the source whose transfer plugin this loop reads from.
//
// Each iteration receives a fragment header from the plugin, then either
//   * writes a data fragment into the shared-memory manager and publishes
//     receive metrics, or
//   * receives a system fragment (EndOfData, Init, EndOfRun, EndOfSubrun,
//     Shutdown) and acts on its type.
// On exit it flushes the plugin's buffers and marks the source as not running.
245 void artdaq::DataReceiverManager::runReceiver_(
int source_rank)
// NOTE(review): in this multi-variable declaration only end_time receives the
// initializer; the other time points are default-constructed until assigned.
247 std::chrono::steady_clock::time_point start_time, after_header, before_body, after_body, end_time = std::chrono::steady_clock::now();
249 detail::RawFragmentHeader header;
// size_t(-1) is the sentinel for "no EndOfData fragment received yet"
// (compared against static_cast<size_t>(-1) further down).
250 size_t endOfDataCount = -1;
// One hundredth of the receive timeout, capped at 100000; values below 5000 are
// handled by the check that follows (its body is not in this listing).
251 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
252 if (sleep_time < 5000)
// NOTE(review): if receive_timeout_ and sleep_time are both integral, the division
// truncates before ceil() is applied — confirm the member types.
256 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
// Keep looping until a stop has been requested AND more than stop_timeout_ms_
// (converted to microseconds) has passed since the request, and only while
// source_rank is present in enabled_sources_.
258 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && (enabled_sources_.count(source_rank) != 0u))
260 TLOG(TLVL_DEBUG + 35) <<
"runReceiver_: Begin loop stop_requested_=" << stop_requested_ <<
", stop_timeout_ms_=" << stop_timeout_ms_ <<
", enabled_sources_.count(source_rank)=" << enabled_sources_.count(source_rank) <<
", now - stop_requested_time_=" << (TimeUtils::gettimeofday_us() - stop_requested_time_);
261 std::this_thread::yield();
// End-of-data condition: the announced number of fragments has been received
// and the plugin reports it is no longer running.
264 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
266 TLOG(TLVL_DEBUG + 32) <<
"runReceiver_: End of Data conditions met, ending runReceiver loop";
270 start_time = std::chrono::steady_clock::now();
// Receive the next fragment header, waiting at most receive_timeout_.
272 TLOG(TLVL_DEBUG + 35) <<
"runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_;
273 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
274 TLOG(TLVL_DEBUG + 35) <<
"runReceiver_: Done with receiveFragmentHeader, ret=" << ret <<
" (should be " << source_rank <<
")";
// Any return value other than source_rank means no usable header was received;
// the conditions selecting between the messages below are not in this listing.
275 if (ret != source_rank)
279 TLOG(TLVL_WARNING) <<
"Received Fragment from rank " << ret <<
", but was expecting one from rank " << source_rank <<
"!";
283 TLOG(TLVL_ERROR) <<
"Transfer Plugin returned DATA_END, ending receive loop!";
// Only the thread whose rank is first in running_sources() asks the
// shared-memory manager to check its pending buffers.
// NOTE(review): begin() is dereferenced without an emptiness check — confirm
// running_sources() cannot be empty at this point.
286 if (*running_sources().begin() == source_rank)
288 TLOG(TLVL_DEBUG + 34) <<
"Calling SMEM::CheckPendingBuffers from DRM receiver thread for " << source_rank <<
" to make sure that things aren't stuck";
289 shm_manager_->CheckPendingBuffers();
296 after_header = std::chrono::steady_clock::now();
// ---- Data fragments (user, Data, Empty and Container types) ----
298 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType)
300 TLOG(TLVL_DEBUG + 33) <<
"Received Fragment Header from rank " << source_rank <<
", sequence ID " << header.sequence_id <<
", timestamp " << header.timestamp;
301 RawDataType* loc =
nullptr;
// Latency reported by the header, converted from (tv_sec, tv_nsec) to seconds.
303 auto latency_s = header.getLatency(
true);
304 auto latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
// Retry until the shared-memory manager returns a write location for this fragment.
305 while (loc ==
nullptr)
307 loc = shm_manager_->WriteFragmentHeader(header);
310 if (loc ==
nullptr && stop_requested_)
// In non-reliable mode, after more than max_retries attempts, call
// WriteFragmentHeader again with `true` as its second argument
// (presumably a "drop if no buffer" flag — confirm against SharedMemoryEventManager).
320 if (non_reliable_mode_enabled_ && retries > max_retries)
322 loc = shm_manager_->WriteFragmentHeader(header,
true);
326 if (loc ==
nullptr && stop_requested_)
333 TLOG(TLVL_ERROR) <<
"runReceiver_: Could not get data location for event " << header.sequence_id;
336 before_body = std::chrono::steady_clock::now();
// Receive the payload: the fragment's total word count minus the header words.
338 TLOG(TLVL_DEBUG + 35) <<
"runReceiver_: Calling receiveFragmentData from rank " << source_rank <<
", sequence ID " << header.sequence_id <<
", timestamp " << header.timestamp;
339 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
340 TLOG(TLVL_DEBUG + 35) <<
"runReceiver_: Done with receiveFragmentData, ret2=" << ret2 <<
" (should be " << source_rank <<
")";
// Payload receive failure (the guarding condition is not in this listing): the
// header is flagged invalid and incomplete before the fragment is closed out.
344 TLOG(TLVL_ERROR) <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret <<
", Got: " << ret2 <<
")";
345 TLOG(TLVL_ERROR) <<
"Error receiving data from rank " << source_rank <<
", data has been lost! Event " << header.sequence_id <<
" will most likely be Incomplete!";
348 header.valid =
false;
349 header.complete =
false;
351 shm_manager_->DoneWritingFragment(header);
// Normal completion of the fragment write.
356 shm_manager_->DoneWritingFragment(header);
357 TLOG(TLVL_DEBUG + 33) <<
"Done receiving fragment with sequence ID " << header.sequence_id <<
" from rank " << source_rank;
// Update the per-rank fragment count, byte count and last-seen sequence ID.
359 recv_frag_count_.incSlot(source_rank);
360 recv_frag_size_.incSlot(source_rank, header.word_count *
sizeof(RawDataType));
361 recv_seq_count_.setSlot(source_rank, header.sequence_id);
// Once an EndOfData count is known, log progress towards it.
362 if (endOfDataCount != static_cast<size_t>(-1))
364 TLOG(TLVL_DEBUG + 32) <<
"Received fragment " << header.sequence_id <<
" from rank " << source_rank
365 <<
" (" << recv_frag_count_.slotCount(source_rank) <<
"/" << endOfDataCount <<
")";
368 after_body = std::chrono::steady_clock::now();
// Timing breakdown for this fragment:
//   hdr_delta_t   - start of iteration to header received
//   store_delta_t - header received to write location obtained
//   data_delta_t  - payload receive
//   delta_t       - whole iteration
//   dead_t        - end of previous fragment to start of this iteration
370 auto hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
371 auto store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
372 auto data_delta_t = TimeUtils::GetElapsedTime(before_body, after_body);
373 auto delta_t = TimeUtils::GetElapsedTime(start_time, after_body);
374 auto dead_t = TimeUtils::GetElapsedTime(end_time, start_time);
375 auto recv_wait_t = hdr_delta_t - latency;
// data_size is the size in bytes of the whole fragment (header words included);
// header_size is the size in bytes of the header alone.
377 uint64_t data_size = header.word_count *
sizeof(RawDataType);
378 auto header_size = header.num_words() *
sizeof(RawDataType);
// Publish per-rank receive metrics (the guard around this section, if any, is
// not in this listing).
382 TLOG(TLVL_DEBUG + 34) <<
"runReceiver_: Sending receive stats for rank " << source_rank;
383 metricMan->sendMetric(
"Total Receive Time From Rank " + std::to_string(source_rank), delta_t,
"s", 5, MetricMode::Accumulate);
384 metricMan->sendMetric(
"Total Receive Size From Rank " + std::to_string(source_rank), data_size,
"B", 5, MetricMode::Accumulate);
385 metricMan->sendMetric(
"Total Receive Rate From Rank " + std::to_string(source_rank), data_size / delta_t,
"B/s", 5, MetricMode::Average);
387 metricMan->sendMetric(
"Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t,
"s", 5, MetricMode::Accumulate);
388 metricMan->sendMetric(
"Header Receive Size From Rank " + std::to_string(source_rank), header_size,
"B", 5, MetricMode::Accumulate);
389 metricMan->sendMetric(
"Header Receive Rate From Rank " + std::to_string(source_rank), header_size / hdr_delta_t,
"B/s", 5, MetricMode::Average);
// Payload size in bytes: whole fragment minus header.
391 auto payloadSize = data_size - header_size;
392 metricMan->sendMetric(
"Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t,
"s", 5, MetricMode::Accumulate);
393 metricMan->sendMetric(
"Data Receive Size From Rank " + std::to_string(source_rank), payloadSize,
"B", 5, MetricMode::Accumulate);
394 metricMan->sendMetric(
"Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / data_delta_t,
"B/s", 5, MetricMode::Average);
396 metricMan->sendMetric(
"Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank),
"fragments", 3, MetricMode::LastPoint);
398 metricMan->sendMetric(
"Total Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t,
"s", 3, MetricMode::Accumulate);
399 metricMan->sendMetric(
"Avg Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t,
"s", 3, MetricMode::Average);
400 metricMan->sendMetric(
"Avg Fragment Wait Time From Rank " + std::to_string(source_rank), dead_t,
"s", 3, MetricMode::Average);
402 metricMan->sendMetric(
"Rank", std::to_string(my_rank),
"", 3, MetricMode::LastPoint);
403 metricMan->sendMetric(
"App Name", app_name,
"", 3, MetricMode::LastPoint);
404 metricMan->sendMetric(
"Fragment Latency at Receive From Rank " + std::to_string(source_rank), latency,
"s", 4, MetricMode::Average | MetricMode::Maximum);
// NOTE(review): unlike the other metric names, this one has no space between
// "Rank" and the rank number.
405 metricMan->sendMetric(
"Header Receive Wait Time From Rank" + std::to_string(source_rank), recv_wait_t,
"s", 4, MetricMode::Average | MetricMode::Maximum | MetricMode::Minimum);
407 TLOG(TLVL_DEBUG + 34) <<
"runReceiver_: Done sending receive stats for rank " << source_rank;
// Reference point for the next iteration's dead_t.
410 end_time = std::chrono::steady_clock::now();
// ---- System fragments ----
412 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
414 TLOG(TLVL_DEBUG + 32) <<
"Received System Fragment from rank " << source_rank <<
" of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) <<
".";
// Build a Fragment sized for the payload, copy the already-received header into
// it, then receive the payload directly behind the header.
416 FragmentPtr frag(
new Fragment(header.word_count - header.num_words()));
417 memcpy(frag->headerAddress(), &header, header.num_words() *
sizeof(RawDataType));
418 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
// A payload receive failure for a system fragment is logged and then thrown
// as a cet::exception.
419 if (ret3 != source_rank)
421 TLOG(TLVL_ERROR) <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank <<
", Got: " << ret3 <<
")";
422 throw cet::exception(
"DataReceiverManager") <<
"Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank <<
", Got: " << ret3 <<
")";
// Dispatch on the system fragment type (the `switch` line is not in this listing).
// EndOfData: the first data word is the number of fragments to expect; the first
// such fragment sets endOfDataCount, later ones add to it.
427 case Fragment::EndOfDataFragmentType:
429 if (endOfDataCount == static_cast<size_t>(-1))
431 endOfDataCount = *(frag->dataBegin());
435 endOfDataCount += *(frag->dataBegin());
437 TLOG(TLVL_DEBUG + 32) <<
"EndOfData Fragment indicates that " << endOfDataCount <<
" fragments are expected from rank " << source_rank
438 <<
" (recvd " << recv_frag_count_.slotCount(source_rank) <<
").";
// Init: hand the fragment to the shared-memory manager.
440 case Fragment::InitFragmentType:
441 TLOG(TLVL_DEBUG + 32) <<
"Received Init Fragment from rank " << source_rank <<
".";
443 shm_manager_->AddInitFragment(frag);
// EndOfRun: no statements for this case are visible in this listing.
445 case Fragment::EndOfRunFragmentType:
// EndOfSubrun: roll over the subrun at the header's sequence ID when it is
// valid; the other call uses the last sequence ID recorded for this rank
// (presumably the else-branch — the `else` line is not in this listing).
449 case Fragment::EndOfSubrunFragmentType:
451 TLOG(TLVL_DEBUG + 32) <<
"Received EndOfSubrun Fragment from rank " << source_rank
452 <<
" with sequence_id " << header.sequence_id <<
".";
453 if (header.sequence_id != Fragment::InvalidSequenceID)
455 shm_manager_->rolloverSubrun(header.sequence_id, header.timestamp);
459 shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank), header.timestamp);
// Shutdown: no statements for this case are visible in this listing.
462 case Fragment::ShutdownFragmentType:
// Loop finished: flush the plugin's buffers and mark this source as no longer
// running (stop_threads polls running_sources() for this).
471 source_plugins_[source_rank]->flush_buffers();
473 TLOG(TLVL_DEBUG + 32) <<
"runReceiver_ " << source_rank <<
" receive loop exited";
474 running_sources_[source_rank] =
false;
std::set< int > running_sources() const
Get the list of sources which are still receiving data.
End of Run mode (Used to end request processing on receiver)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
This TransferInterface is a Receiver.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
std::set< int > enabled_sources() const
Get the list of enabled sources.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map.
Value that is to be returned when a Transfer plugin determines that no more data will be arriving.
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet.
void stop_threads()
Stop receiver threads.