artdaq  v3_01_00
DataReceiverManager.cc
1 #include <chrono>
2 
3 #define TRACE_NAME "DataReceiverManager"
4 #include "artdaq/DAQdata/Globals.hh"
5 #include "artdaq/DAQrate/DataReceiverManager.hh"
6 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "cetlib_except/exception.h"
9 #include <iomanip>
10 
11 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
12  : stop_requested_(false)
13  , stop_requested_time_(0)
14  , source_threads_()
15  , source_plugins_()
16  , enabled_sources_()
17  , running_sources_()
18  , recv_frag_count_()
19  , recv_frag_size_()
20  , recv_seq_count_()
21  , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
22  , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500))
23  , shm_manager_(shm)
24  , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
25  , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
26 {
27  TLOG(TLVL_DEBUG) << "Constructor";
28  auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
29  auto enabled_srcs_empty = enabled_srcs.size() == 0;
30 
31  if (non_reliable_mode_enabled_)
32  {
33  TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << std::to_string(non_reliable_mode_retry_count_)
34  << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
35  }
36 
37  if (enabled_srcs_empty)
38  {
39  TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
40  }
41  else
42  {
43  for (auto& s : enabled_srcs)
44  {
45  enabled_sources_.insert(s);
46  }
47  }
48 
49  hostMap_t host_map = MakeHostMap(pset);
50  auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
51  for (auto& s : srcs.get_pset_names())
52  {
53  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
54  host_map = MakeHostMap(src_pset, 0, host_map);
55  }
56  auto host_map_pset = MakeHostMapPset(host_map);
57  fhicl::ParameterSet srcs_mod;
58  for (auto& s : srcs.get_pset_names())
59  {
60  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
61  src_pset.erase("host_map");
62  src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
63  srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
64  }
65 
66  for (auto& s : srcs_mod.get_pset_names())
67  {
68  try
69  {
70  auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s,
72  auto source_rank = transfer->source_rank();
73  if (enabled_srcs_empty) enabled_sources_.insert(source_rank);
74  source_plugins_[source_rank] = std::move(transfer);
75  }
76  catch (cet::exception ex)
77  {
78  TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
79  }
80  catch (std::exception ex)
81  {
82  TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
83  }
84  catch (...)
85  {
86  TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
87  }
88  }
89  if (srcs.get_pset_names().size() == 0)
90  {
91  TLOG(TLVL_ERROR) << "No sources configured!";
92  }
93 }
94 
96 {
97  TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN";
98  stop_threads();
99  shm_manager_.reset();
100  TLOG(TLVL_TRACE) << "Destructor END";
101 }
102 
103 
105 {
106  stop_requested_ = false;
107  if (shm_manager_) shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal);
108  for (auto& source : source_plugins_)
109  {
110  auto& rank = source.first;
111  if (enabled_sources_.count(rank))
112  {
113  running_sources_.insert(rank);
114  boost::thread::attributes attrs;
115  attrs.set_stack_size(4096 * 500); // 2000 KB
116  source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
117  }
118  }
119 }
120 
122 {
123  TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << std::to_string(count()) << ", bytes=" << std::to_string(byteCount());
124 
125  stop_requested_time_ = TimeUtils::gettimeofday_us();
126  stop_requested_ = true;
127 
128  TLOG(TLVL_TRACE) << "stop_threads: Joining all threads";
129  for (auto& s : source_threads_)
130  {
131  auto& thread = s.second;
132  if (thread.joinable()) thread.join();
133  }
134 }
135 
136 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
137 {
138  std::chrono::steady_clock::time_point start_time, after_header, before_body, eod_quiet_start;
139  int ret;
140  double delta_t, hdr_delta_t, store_delta_t, data_delta_t;
141  detail::RawFragmentHeader header;
142  size_t endOfDataCount = -1;
143  auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
144  auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
145 
146  while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
147  {
148  TLOG(16) << "runReceiver_: Begin loop";
149  if (stop_requested_) { receive_timeout_ = stop_timeout_ms_; }
150 
151  // Don't stop receiving until we haven't received anything for 1 second
152  if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && TimeUtils::GetElapsedTimeMilliseconds(eod_quiet_start) > 1000)
153  {
154  TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop";
155  running_sources_.erase(source_rank);
156  return;
157  }
158 
159  start_time = std::chrono::steady_clock::now();
160 
161  TLOG(16) << "runReceiver_: Calling receiveFragmentHeader";
162  ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
163  TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")";
164  if (ret != source_rank)
165  {
166  if (ret >= 0) {
167  TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!";
168  }
169  continue; // Receive timeout or other oddness
170  }
171 
172  after_header = std::chrono::steady_clock::now();
173  eod_quiet_start = std::chrono::steady_clock::now();
174 
175  if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
176  TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ".";
177  RawDataType* loc = nullptr;
178  size_t retries = 0;
179  while (loc == nullptr)//&& TimeUtils::GetElapsedTimeMicroseconds(after_header)) < receive_timeout_)
180  {
181  loc = shm_manager_->WriteFragmentHeader(header);
182  if (loc == nullptr) usleep(sleep_time);
183  if (stop_requested_) return;
184  retries++;
185  if (non_reliable_mode_enabled_ && retries > max_retries)
186  {
187  loc = shm_manager_->WriteFragmentHeader(header, true);
188  }
189  }
190  if (loc == nullptr)
191  {
192  // Could not enqueue event!
193  TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << std::to_string(header.sequence_id);
194  continue;
195  }
196  before_body = std::chrono::steady_clock::now();
197 
198  TLOG(16) << "runReceiver_: Calling receiveFragmentData";
199  auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
200  TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")";
201 
202  if (ret != ret2) {
203  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
204  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
205  }
206 
207  shm_manager_->DoneWritingFragment(header);
208  TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << std::to_string(header.sequence_id) << " from rank " << source_rank;
209 
210  recv_frag_count_.incSlot(source_rank);
211  recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
212  recv_seq_count_.setSlot(source_rank, header.sequence_id);
213  if (endOfDataCount != static_cast<size_t>(-1))
214  {
215  TLOG(TLVL_DEBUG) << "Received fragment " << std::to_string(header.sequence_id) << " from rank " << source_rank
216  << " (" << std::to_string(recv_frag_count_.slotCount(source_rank)) << "/" << std::to_string(endOfDataCount) << ")";
217  }
218 
219  if (metricMan)
220  {//&& recv_frag_count_.slotCount(source_rank) % 100 == 0) {
221  TLOG(6) << "runReceiver_: Sending receive stats";
222  delta_t = TimeUtils::GetElapsedTime(start_time);
223  hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
224  store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
225  data_delta_t = TimeUtils::GetElapsedTime(before_body);
226  metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 5, MetricMode::Accumulate);
227  metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.word_count * sizeof(RawDataType)), "B", 5, MetricMode::Accumulate);
228  metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), header.word_count * sizeof(RawDataType) / delta_t, "B/s", 5, MetricMode::Average);
229 
230  metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t, "s", 5, MetricMode::Accumulate);
231  metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.num_words() * sizeof(RawDataType)), "B", 5, MetricMode::Accumulate);
232  metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), header.num_words() * sizeof(RawDataType) / hdr_delta_t, "B/s", 5, MetricMode::Average);
233 
234  metricMan->sendMetric("Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 5, MetricMode::Accumulate);
235 
236  metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t, "s", 5, MetricMode::Accumulate);
237  metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>((header.word_count - header.num_words()) * sizeof(RawDataType)), "B", 5, MetricMode::Accumulate);
238  metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), (header.word_count - header.num_words()) * sizeof(RawDataType) / data_delta_t, "B/s", 5, MetricMode::Average);
239  metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint);
240  TLOG(6) << "runReceiver_: Done sending receive stats";
241  }
242  }
243  else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
244  {
245  TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << ".";
246 
247  FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
248  memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
249  auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
250  if (ret3 != source_rank)
251  {
252  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
253  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
254  }
255 
256  switch (header.type)
257  {
258  case Fragment::EndOfDataFragmentType:
259  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
260  endOfDataCount = *(frag->dataBegin());
261  TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << std::to_string(endOfDataCount) << " fragments are expected from rank " << source_rank
262  << " (recvd " << std::to_string(recv_frag_count_.slotCount(source_rank)) << ").";
263  break;
264  case Fragment::InitFragmentType:
265  TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << ".";
266  shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
267  shm_manager_->SetInitFragment(std::move(frag));
268  break;
269  case Fragment::EndOfRunFragmentType:
270  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
271  break;
272  case Fragment::EndOfSubrunFragmentType:
273  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
274  break;
275  case Fragment::ShutdownFragmentType:
276  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
277  break;
278  }
279  }
280 
281  }
282 
283 
284  running_sources_.erase(source_rank);
285 }
End of Run mode (Used to end request processing on receiver)
This TransferInterface is a Receiver.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.
void stop_threads()
Stop receiver threads.