artdaq  v3_10_02
DataReceiverManager.cc
1 #include <chrono>
2 
3 #include <iomanip>
4 #include <utility>
5 #include "artdaq/DAQdata/Globals.hh"
6 #define TRACE_NAME (app_name + "_DataReceiverManager").c_str()
7 #include "artdaq/DAQrate/DataReceiverManager.hh"
8 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
9 #include "artdaq/TransferPlugins/detail/HostMap.hh"
10 #include "cetlib_except/exception.h"
11 
12 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
13  : stop_requested_(false)
14  , stop_requested_time_(0)
15  , recv_frag_count_()
16  , recv_frag_size_()
17  , recv_seq_count_()
18  , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
19  , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500))
20  , shm_manager_(std::move(std::move(shm)))
21  , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
22  , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
23 {
24  TLOG(TLVL_DEBUG) << "Constructor";
25  auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
26  auto enabled_srcs_empty = enabled_srcs.empty();
27 
28  if (non_reliable_mode_enabled_)
29  {
30  TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_
31  << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
32  }
33 
34  if (enabled_srcs_empty)
35  {
36  TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
37  }
38  else
39  {
40  for (auto& s : enabled_srcs)
41  {
42  enabled_sources_[s] = true;
43  }
44  }
45 
46  hostMap_t host_map = MakeHostMap(pset);
47  auto tcp_receive_buffer_size = pset.get<size_t>("tcp_receive_buffer_size", 0);
48  auto max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
49 
50  auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
51  for (auto& s : srcs.get_pset_names())
52  {
53  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
54  host_map = MakeHostMap(src_pset, host_map);
55  }
56  auto host_map_pset = MakeHostMapPset(host_map);
57  fhicl::ParameterSet srcs_mod;
58  for (auto& s : srcs.get_pset_names())
59  {
60  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
61  src_pset.erase("host_map");
62  src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
63 
64  if (tcp_receive_buffer_size != 0 && !src_pset.has_key("tcp_receive_buffer_size"))
65  {
66  src_pset.put<size_t>("tcp_receive_buffer_size", tcp_receive_buffer_size);
67  }
68  if (max_fragment_size_words != 0 && !src_pset.has_key("max_fragment_size_words"))
69  {
70  src_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
71  }
72 
73  srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
74  }
75 
76  for (auto& s : srcs_mod.get_pset_names())
77  {
78  try
79  {
80  auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s,
82  auto source_rank = transfer->source_rank();
83  if (enabled_srcs_empty)
84  {
85  enabled_sources_[source_rank] = true;
86  }
87  else if (enabled_sources_.count(source_rank) == 0u)
88  {
89  enabled_sources_[source_rank] = false;
90  }
91  running_sources_[source_rank] = false;
92  source_plugins_[source_rank] = std::move(transfer);
93  }
94  catch (const cet::exception& ex)
95  {
96  TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
97  }
98  catch (const std::exception& ex)
99  {
100  TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
101  }
102  catch (...)
103  {
104  TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
105  }
106  }
107  if (srcs.get_pset_names().empty())
108  {
109  TLOG(TLVL_ERROR) << "No sources configured!";
110  }
111 }
112 
114 {
115  TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN";
116  stop_threads();
117  shm_manager_.reset();
118  TLOG(TLVL_TRACE) << "Destructor END";
119 }
120 
122 {
123  stop_requested_ = false;
124  if (shm_manager_)
125  {
126  shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal);
127  }
128  for (auto& source : source_plugins_)
129  {
130  auto& rank = source.first;
131  if ((enabled_sources_.count(rank) != 0u) && enabled_sources_[rank].load())
132  {
133  recv_frag_count_.setSlot(rank, 0);
134  recv_frag_size_.setSlot(rank, 0);
135  recv_seq_count_.setSlot(rank, 0);
136 
137  running_sources_[rank] = true;
138  boost::thread::attributes attrs;
139  attrs.set_stack_size(4096 * 2000); // 2000 KB
140  try
141  {
142  source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
143  char tname[16]; // Size 16 - see man page pthread_setname_np(3) and/or prctl(2)
144  snprintf(tname, sizeof(tname) - 1, "%d-%d RECV", rank, my_rank); // NOLINT
145  tname[sizeof(tname) - 1] = '\0'; // assure term. snprintf is not too evil :)
146  auto handle = source_threads_[rank].native_handle();
147  pthread_setname_np(handle, tname);
148  }
149  catch (const boost::exception& e)
150  {
151  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
152  std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
153  exit(5);
154  }
155  }
156  }
157 }
158 
160 {
161  TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount();
162 
163  stop_requested_time_ = TimeUtils::gettimeofday_us();
164  stop_requested_ = true;
165 
166  auto initial_count = running_sources().size();
167  TLOG(TLVL_TRACE) << "stop_threads: Waiting for " << initial_count << " running receiver threads to stop";
168  auto wait_start = std::chrono::steady_clock::now();
169  auto last_report = std::chrono::steady_clock::now();
170  while (!running_sources().empty() && TimeUtils::GetElapsedTime(wait_start) < 60.0)
171  {
172  usleep(10000);
173  if (TimeUtils::GetElapsedTime(last_report) > 1.0)
174  {
175  TLOG(TLVL_DEBUG) << "stop_threads: Waited " << TimeUtils::GetElapsedTime(wait_start) << " s for " << initial_count
176  << " receiver threads to end (" << running_sources().size() << " remain)";
177  last_report = std::chrono::steady_clock::now();
178  }
179  }
180  if (!running_sources().empty())
181  {
182  TLOG(TLVL_WARNING) << "stop_threads: Timeout expired while waiting for all receiver threads to end. There are "
183  << running_sources().size() << " threads remaining.";
184  }
185 
186  TLOG(TLVL_TRACE) << "stop_threads: Joining " << source_threads_.size() << " receiver threads";
187  for (auto& source_thread : source_threads_)
188  {
189  TLOG(TLVL_TRACE) << "stop_threads: Joining thread for source_rank " << source_thread.first;
190  try
191  {
192  if (source_thread.second.joinable())
193  {
194  source_thread.second.join();
195  }
196  else
197  {
198  TLOG(TLVL_ERROR) << "stop_threads: Thread for source rank " << source_thread.first << " is not joinable!";
199  }
200  }
201  catch (...)
202  {
203  // IGNORED
204  }
205  }
206  source_threads_.clear(); // To prevent error messages from shutdown-after-stop
207 
208  TLOG(TLVL_TRACE) << "stop_threads: END";
209 }
210 
212 {
213  std::set<int> output;
214  for (auto& src : enabled_sources_)
215  {
216  if (src.second)
217  {
218  output.insert(src.first);
219  }
220  }
221  return output;
222 }
223 
225 {
226  std::set<int> output;
227  for (auto& src : running_sources_)
228  {
229  if (src.second)
230  {
231  output.insert(src.first);
232  }
233  }
234  return output;
235 }
236 
237 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
238 {
239  std::chrono::steady_clock::time_point start_time, after_header, before_body, after_body, end_time = std::chrono::steady_clock::now();
240  int ret;
241  detail::RawFragmentHeader header;
242  size_t endOfDataCount = -1;
243  auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
244  if (sleep_time < 5000)
245  {
246  sleep_time = 5000;
247  }
248  auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
249 
250  while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && (enabled_sources_.count(source_rank) != 0u))
251  {
252  TLOG(16) << "runReceiver_: Begin loop stop_requested_=" << stop_requested_ << ", stop_timeout_ms_=" << stop_timeout_ms_ << ", enabled_sources_.count(source_rank)=" << enabled_sources_.count(source_rank) << ", now - stop_requested_time_=" << (TimeUtils::gettimeofday_us() - stop_requested_time_);
253  std::this_thread::yield();
254 
255  // Don't stop receiving until we haven't received anything for 1 second
256  if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
257  {
258  TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop";
259  break;
260  }
261 
262  start_time = std::chrono::steady_clock::now();
263 
264  TLOG(16) << "runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_;
265  ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
266  TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")";
267  if (ret != source_rank)
268  {
269  if (ret >= 0)
270  {
271  TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!";
272  }
273  else if (ret == TransferInterface::DATA_END)
274  {
275  TLOG(TLVL_ERROR) << "Transfer Plugin returned DATA_END, ending receive loop!";
276  break;
277  }
278  if (*running_sources().begin() == source_rank) // Only do this for the first sender in the running_sources_ map
279  {
280  TLOG(TLVL_DEBUG + 3) << "Calling SMEM::CheckPendingBuffers from DRM receiver thread for " << source_rank << " to make sure that things aren't stuck";
281  shm_manager_->CheckPendingBuffers();
282  }
283 
284  usleep(sleep_time);
285  continue; // Receive timeout or other oddness
286  }
287 
288  after_header = std::chrono::steady_clock::now();
289 
290  if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType)
291  {
292  TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp;
293  RawDataType* loc = nullptr;
294  size_t retries = 0;
295  auto latency_s = header.getLatency(true);
296  auto latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
297  while (loc == nullptr) //&& TimeUtils::GetElapsedTimeMicroseconds(after_header)) < receive_timeout_)
298  {
299  loc = shm_manager_->WriteFragmentHeader(header);
300 
301  // Break here and outside of the loop to go to the cleanup steps at the end of runReceiver_
302  if (loc == nullptr && stop_requested_)
303  {
304  break;
305  }
306 
307  if (loc == nullptr)
308  {
309  usleep(sleep_time);
310  }
311  retries++;
312  if (non_reliable_mode_enabled_ && retries > max_retries)
313  {
314  loc = shm_manager_->WriteFragmentHeader(header, true);
315  }
316  }
317  // Break here to go to cleanup at the end of runReceiver_
318  if (loc == nullptr && stop_requested_)
319  {
320  break;
321  }
322  if (loc == nullptr)
323  {
324  // Could not enqueue event!
325  TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << header.sequence_id;
326  continue;
327  }
328  before_body = std::chrono::steady_clock::now();
329 
330  auto hdrLoc = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(loc - artdaq::detail::RawFragmentHeader::num_words()); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast,cppcoreguidelines-pro-bounds-pointer-arithmetic)
331  TLOG(16) << "runReceiver_: Calling receiveFragmentData from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp;
332  auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
333  TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")";
334 
335  if (ret != ret2)
336  {
337  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
338  TLOG(TLVL_ERROR) << "Error receiving data from rank " << source_rank << ", data has been lost! Event " << header.sequence_id << " will most likely be Incomplete!";
339 
340  // Mark the Fragment as invalid
341  /* \todo Make a RawFragmentHeader field that marks it as invalid while maintaining previous type! */
342  hdrLoc->type = Fragment::ErrorFragmentType;
343 
344  shm_manager_->DoneWritingFragment(header);
345  //throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
346  continue;
347  }
348 
349  shm_manager_->DoneWritingFragment(header);
350  TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << header.sequence_id << " from rank " << source_rank;
351 
352  recv_frag_count_.incSlot(source_rank);
353  recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
354  recv_seq_count_.setSlot(source_rank, header.sequence_id);
355  if (endOfDataCount != static_cast<size_t>(-1))
356  {
357  TLOG(TLVL_DEBUG) << "Received fragment " << header.sequence_id << " from rank " << source_rank
358  << " (" << recv_frag_count_.slotCount(source_rank) << "/" << endOfDataCount << ")";
359  }
360 
361  after_body = std::chrono::steady_clock::now();
362 
363  auto hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
364  auto store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
365  auto data_delta_t = TimeUtils::GetElapsedTime(before_body, after_body);
366  auto delta_t = TimeUtils::GetElapsedTime(start_time, after_body);
367  auto dead_t = TimeUtils::GetElapsedTime(end_time, start_time);
368  auto recv_wait_t = hdr_delta_t - latency;
369 
370  uint64_t data_size = header.word_count * sizeof(RawDataType);
371  auto header_size = header.num_words() * sizeof(RawDataType);
372 
373  if (metricMan)
374  { //&& recv_frag_count_.slotCount(source_rank) % 100 == 0) {
375  TLOG(TLVL_DEBUG + 3) << "runReceiver_: Sending receive stats for rank " << source_rank;
376  metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 5, MetricMode::Accumulate);
377  metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), data_size, "B", 5, MetricMode::Accumulate);
378  metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), data_size / delta_t, "B/s", 5, MetricMode::Average);
379 
380  metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t, "s", 5, MetricMode::Accumulate);
381  metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), header_size, "B", 5, MetricMode::Accumulate);
382  metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), header_size / hdr_delta_t, "B/s", 5, MetricMode::Average);
383 
384  auto payloadSize = data_size - header_size;
385  metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t, "s", 5, MetricMode::Accumulate);
386  metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), payloadSize, "B", 5, MetricMode::Accumulate);
387  metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / data_delta_t, "B/s", 5, MetricMode::Average);
388 
389  metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint);
390 
391  metricMan->sendMetric("Total Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 3, MetricMode::Accumulate);
392  metricMan->sendMetric("Avg Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 3, MetricMode::Average);
393  metricMan->sendMetric("Avg Fragment Wait Time From Rank " + std::to_string(source_rank), dead_t, "s", 3, MetricMode::Average);
394 
395  metricMan->sendMetric("Fragment Latency at Receive From Rank " + std::to_string(source_rank), latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
396  metricMan->sendMetric("Header Receive Wait Time From Rank" + std::to_string(source_rank), recv_wait_t, "s", 4, MetricMode::Average | MetricMode::Maximum | MetricMode::Minimum);
397 
398  TLOG(TLVL_DEBUG + 3) << "runReceiver_: Done sending receive stats for rank " << source_rank;
399  }
400 
401  end_time = std::chrono::steady_clock::now();
402  }
403  else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
404  {
405  TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << ".";
406 
407  FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
408  memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
409  auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words()); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
410  if (ret3 != source_rank)
411  {
412  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
413  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")"; // NOLINT(cert-err60-cpp)
414  }
415 
416  switch (header.type)
417  {
418  case Fragment::EndOfDataFragmentType:
419  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
420  if (endOfDataCount == static_cast<size_t>(-1))
421  {
422  endOfDataCount = *(frag->dataBegin());
423  }
424  else
425  {
426  endOfDataCount += *(frag->dataBegin());
427  }
428  TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << endOfDataCount << " fragments are expected from rank " << source_rank
429  << " (recvd " << recv_frag_count_.slotCount(source_rank) << ").";
430  break;
431  case Fragment::InitFragmentType:
432  TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << ".";
433  shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
434  shm_manager_->AddInitFragment(frag);
435  break;
436  case Fragment::EndOfRunFragmentType:
437  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
438  //shm_manager_->endRun();
439  break;
440  case Fragment::EndOfSubrunFragmentType:
441  //shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
442  TLOG(TLVL_DEBUG) << "Received EndOfSubrun Fragment from rank " << source_rank
443  << " with sequence_id " << header.sequence_id << ".";
444  if (header.sequence_id != Fragment::InvalidSequenceID)
445  {
446  shm_manager_->rolloverSubrun(header.sequence_id, header.timestamp);
447  }
448  else
449  {
450  shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank), header.timestamp);
451  }
452  break;
453  case Fragment::ShutdownFragmentType:
454  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
455  break;
456  default:
457  break;
458  }
459  }
460  }
461 
462  source_plugins_[source_rank]->flush_buffers();
463 
464  TLOG(TLVL_DEBUG) << "runReceiver_ " << source_rank << " receive loop exited";
465  running_sources_[source_rank] = false;
466 }
std::set< int > running_sources() const
Get the list of sources which are still receiving data.
End of Run mode (Used to end request processing on receiver)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
This TransferInterface is a Receiver.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
std::set< int > enabled_sources() const
Get the list of enabled sources.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
void stop_threads()
Stop receiver threads.