artdaq  v3_07_01
DataReceiverManager.cc
1 #include <chrono>
2 
3 #include <iomanip>
4 #include "artdaq/DAQdata/Globals.hh"
5 #define TRACE_NAME (app_name + "_DataReceiverManager").c_str()
6 #include "artdaq/DAQrate/DataReceiverManager.hh"
7 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
8 #include "artdaq/TransferPlugins/detail/HostMap.hh"
9 #include "cetlib_except/exception.h"
10 
11 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
12  : stop_requested_(false)
13  , stop_requested_time_(0)
14  , source_threads_()
15  , source_plugins_()
16  , enabled_sources_()
17  , running_sources_()
18  , recv_frag_count_()
19  , recv_frag_size_()
20  , recv_seq_count_()
21  , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
22  , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500))
23  , shm_manager_(shm)
24  , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
25  , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
26 {
27  TLOG(TLVL_DEBUG) << "Constructor";
28  auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
29  auto enabled_srcs_empty = enabled_srcs.size() == 0;
30 
31  if (non_reliable_mode_enabled_)
32  {
33  TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_
34  << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
35  }
36 
37  if (enabled_srcs_empty)
38  {
39  TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
40  }
41  else
42  {
43  for (auto& s : enabled_srcs)
44  {
45  enabled_sources_[s] = true;
46  }
47  }
48 
49  hostMap_t host_map = MakeHostMap(pset);
50  size_t tcp_receive_buffer_size = pset.get<size_t>("tcp_receive_buffer_size", 0);
51  size_t max_fragment_size_words = pset.get<size_t>("max_fragment_size_words", 0);
52 
53  auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
54  for (auto& s : srcs.get_pset_names())
55  {
56  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
57  host_map = MakeHostMap(src_pset, host_map);
58  }
59  auto host_map_pset = MakeHostMapPset(host_map);
60  fhicl::ParameterSet srcs_mod;
61  for (auto& s : srcs.get_pset_names())
62  {
63  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
64  src_pset.erase("host_map");
65  src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
66 
67  if (tcp_receive_buffer_size != 0 && !src_pset.has_key("tcp_receive_buffer_size"))
68  {
69  src_pset.put<size_t>("tcp_receive_buffer_size", tcp_receive_buffer_size);
70  }
71  if (max_fragment_size_words != 0 && !src_pset.has_key("max_fragment_size_words"))
72  {
73  src_pset.put<size_t>("max_fragment_size_words", max_fragment_size_words);
74  }
75 
76  srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
77  }
78 
79  for (auto& s : srcs_mod.get_pset_names())
80  {
81  try
82  {
83  auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s,
85  auto source_rank = transfer->source_rank();
86  if (enabled_srcs_empty)
87  enabled_sources_[source_rank] = true;
88  else if (!enabled_sources_.count(source_rank))
89  enabled_sources_[source_rank] = false;
90  running_sources_[source_rank] = false;
91  source_plugins_[source_rank] = std::move(transfer);
92  }
93  catch (const cet::exception& ex)
94  {
95  TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
96  }
97  catch (const std::exception& ex)
98  {
99  TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
100  }
101  catch (...)
102  {
103  TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
104  }
105  }
106  if (srcs.get_pset_names().size() == 0)
107  {
108  TLOG(TLVL_ERROR) << "No sources configured!";
109  }
110 }
111 
113 {
114  TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN";
115  stop_threads();
116  shm_manager_.reset();
117  TLOG(TLVL_TRACE) << "Destructor END";
118 }
119 
121 {
122  stop_requested_ = false;
123  if (shm_manager_) shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal);
124  for (auto& source : source_plugins_)
125  {
126  auto& rank = source.first;
127  if (enabled_sources_.count(rank) && enabled_sources_[rank].load())
128  {
129  recv_frag_count_.setSlot(rank, 0);
130  recv_frag_size_.setSlot(rank, 0);
131  recv_seq_count_.setSlot(rank, 0);
132 
133  running_sources_[rank] = true;
134  boost::thread::attributes attrs;
135  attrs.set_stack_size(4096 * 2000); // 2000 KB
136  try
137  {
138  source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
139  }
140  catch (const boost::exception& e)
141  {
142  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
143  std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
144  exit(5);
145  }
146  }
147  }
148 }
149 
151 {
152  TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount();
153 
154  stop_requested_time_ = TimeUtils::gettimeofday_us();
155  stop_requested_ = true;
156 
157  auto initial_count = running_sources().size();
158  TLOG(TLVL_TRACE) << "stop_threads: Waiting for " << initial_count << " running receiver threads to stop";
159  auto wait_start = std::chrono::steady_clock::now();
160  auto last_report = std::chrono::steady_clock::now();
161  while (running_sources().size() && TimeUtils::GetElapsedTime(wait_start) < 60.0)
162  {
163  usleep(10000);
164  if (TimeUtils::GetElapsedTime(last_report) > 1.0)
165  {
166  TLOG(TLVL_DEBUG) << "stop_threads: Waited " << TimeUtils::GetElapsedTime(wait_start) << " s for " << initial_count
167  << " receiver threads to end (" << running_sources().size() << " remain)";
168  last_report = std::chrono::steady_clock::now();
169  }
170  }
171  if (running_sources().size())
172  {
173  TLOG(TLVL_WARNING) << "stop_threads: Timeout expired while waiting for all receiver threads to end. There are "
174  << running_sources().size() << " threads remaining.";
175  }
176 
177  TLOG(TLVL_TRACE) << "stop_threads: Joining " << source_threads_.size() << " receiver threads";
178  for (auto it = source_threads_.begin(); it != source_threads_.end(); ++it)
179  {
180  TLOG(TLVL_TRACE) << "stop_threads: Joining thread for source_rank " << (*it).first;
181  if ((*it).second.joinable())
182  (*it).second.join();
183  else
184  TLOG(TLVL_ERROR) << "stop_threads: Thread for source rank " << (*it).first << " is not joinable!";
185  }
186  source_threads_.clear(); // To prevent error messages from shutdown-after-stop
187 
188  TLOG(TLVL_TRACE) << "stop_threads: END";
189 }
190 
192 {
193  std::set<int> output;
194  for (auto& src : enabled_sources_)
195  {
196  if (src.second) output.insert(src.first);
197  }
198  return output;
199 }
200 
202 {
203  std::set<int> output;
204  for (auto& src : running_sources_)
205  {
206  if (src.second) output.insert(src.first);
207  }
208  return output;
209 }
210 
211 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
212 {
213  std::chrono::steady_clock::time_point start_time, after_header, before_body, after_body, end_time = std::chrono::steady_clock::now();
214  int ret;
215  detail::RawFragmentHeader header;
216  size_t endOfDataCount = -1;
217  auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
218  if (sleep_time < 5000) sleep_time = 5000;
219  auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
220 
221  while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
222  {
223  TLOG(16) << "runReceiver_: Begin loop";
224  std::this_thread::yield();
225 
226  // Don't stop receiving until we haven't received anything for 1 second
227  if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
228  {
229  TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop";
230  break;
231  }
232 
233  start_time = std::chrono::steady_clock::now();
234 
235  TLOG(16) << "runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_;
236  ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
237  TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")";
238  if (ret != source_rank)
239  {
240  if (ret >= 0)
241  {
242  TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!";
243  }
244  else if (ret == TransferInterface::DATA_END)
245  {
246  TLOG(TLVL_ERROR) << "Transfer Plugin returned DATA_END, ending receive loop!";
247  break;
248  }
249  if (*running_sources().begin() == source_rank) // Only do this for the first sender in the running_sources_ map
250  {
251  TLOG(6) << "Calling SMEM::CheckPendingBuffers from DRM receiver thread for " << source_rank << " to make sure that things aren't stuck";
252  shm_manager_->CheckPendingBuffers();
253  }
254 
255  usleep(sleep_time);
256  continue; // Receive timeout or other oddness
257  }
258 
259  after_header = std::chrono::steady_clock::now();
260 
261  if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType)
262  {
263  TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp;
264  RawDataType* loc = nullptr;
265  size_t retries = 0;
266  auto latency_s = header.getLatency(true);
267  auto latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
268  while (loc == nullptr) //&& TimeUtils::GetElapsedTimeMicroseconds(after_header)) < receive_timeout_)
269  {
270  loc = shm_manager_->WriteFragmentHeader(header);
271 
272  // Break here and outside of the loop to go to the cleanup steps at the end of runReceiver_
273  if (loc == nullptr && stop_requested_) break;
274 
275  if (loc == nullptr) usleep(sleep_time);
276  retries++;
277  if (non_reliable_mode_enabled_ && retries > max_retries)
278  {
279  loc = shm_manager_->WriteFragmentHeader(header, true);
280  }
281  }
282  // Break here to go to cleanup at the end of runReceiver_
283  if (loc == nullptr && stop_requested_) break;
284  if (loc == nullptr)
285  {
286  // Could not enqueue event!
287  TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << header.sequence_id;
288  continue;
289  }
290  before_body = std::chrono::steady_clock::now();
291 
292  auto hdrLoc = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(loc - artdaq::detail::RawFragmentHeader::num_words());
293  TLOG(16) << "runReceiver_: Calling receiveFragmentData from rank " << source_rank << ", sequence ID " << header.sequence_id << ", timestamp " << header.timestamp;
294  auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
295  TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")";
296 
297  if (ret != ret2)
298  {
299  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
300  TLOG(TLVL_ERROR) << "Error receiving data from rank " << source_rank << ", data has been lost! Event " << header.sequence_id << " will most likely be Incomplete!";
301 
302  // Mark the Fragment as invalid
303  /* \todo Make a RawFragmentHeader field that marks it as invalid while maintaining previous type! */
304  hdrLoc->type = Fragment::ErrorFragmentType;
305 
306  shm_manager_->DoneWritingFragment(header);
307  //throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
308  continue;
309  }
310 
311  shm_manager_->DoneWritingFragment(header);
312  TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << header.sequence_id << " from rank " << source_rank;
313 
314  recv_frag_count_.incSlot(source_rank);
315  recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
316  recv_seq_count_.setSlot(source_rank, header.sequence_id);
317  if (endOfDataCount != static_cast<size_t>(-1))
318  {
319  TLOG(TLVL_DEBUG) << "Received fragment " << header.sequence_id << " from rank " << source_rank
320  << " (" << recv_frag_count_.slotCount(source_rank) << "/" << endOfDataCount << ")";
321  }
322 
323  after_body = std::chrono::steady_clock::now();
324 
325  auto hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
326  auto store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
327  auto data_delta_t = TimeUtils::GetElapsedTime(before_body, after_body);
328  auto delta_t = TimeUtils::GetElapsedTime(start_time, after_body);
329  auto dead_t = TimeUtils::GetElapsedTime(end_time, start_time);
330  auto recv_wait_t = hdr_delta_t - latency;
331 
332  auto data_size = header.word_count * sizeof(RawDataType);
333  auto header_size = header.num_words() * sizeof(RawDataType);
334 
335  if (metricMan)
336  { //&& recv_frag_count_.slotCount(source_rank) % 100 == 0) {
337  TLOG(6) << "runReceiver_: Sending receive stats for rank " << source_rank;
338  metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 5, MetricMode::Accumulate);
339  metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(data_size), "B", 5, MetricMode::Accumulate);
340  metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), data_size / delta_t, "B/s", 5, MetricMode::Average);
341 
342  metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t, "s", 5, MetricMode::Accumulate);
343  metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header_size), "B", 5, MetricMode::Accumulate);
344  metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), header_size / hdr_delta_t, "B/s", 5, MetricMode::Average);
345 
346  auto payloadSize = data_size - header_size;
347  metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t, "s", 5, MetricMode::Accumulate);
348  metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(payloadSize), "B", 5, MetricMode::Accumulate);
349  metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / data_delta_t, "B/s", 5, MetricMode::Average);
350 
351  metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint);
352 
353  metricMan->sendMetric("Total Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 3, MetricMode::Accumulate);
354  metricMan->sendMetric("Avg Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 3, MetricMode::Average);
355  metricMan->sendMetric("Avg Fragment Wait Time From Rank " + std::to_string(source_rank), dead_t, "s", 3, MetricMode::Average);
356 
357  metricMan->sendMetric("Fragment Latency at Receive From Rank " + std::to_string(source_rank), latency, "s", 4, MetricMode::Average | MetricMode::Maximum);
358  metricMan->sendMetric("Header Receive Wait Time From Rank" + std::to_string(source_rank), recv_wait_t, "s", 4, MetricMode::Average | MetricMode::Maximum | MetricMode::Minimum);
359 
360  TLOG(6) << "runReceiver_: Done sending receive stats for rank " << source_rank;
361  }
362 
363  end_time = std::chrono::steady_clock::now();
364  }
365  else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
366  {
367  TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << ".";
368 
369  FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
370  memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
371  auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
372  if (ret3 != source_rank)
373  {
374  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
375  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
376  }
377 
378  switch (header.type)
379  {
380  case Fragment::EndOfDataFragmentType:
381  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
382  if (endOfDataCount == static_cast<size_t>(-1))
383  endOfDataCount = *(frag->dataBegin());
384  else
385  endOfDataCount += *(frag->dataBegin());
386  TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << endOfDataCount << " fragments are expected from rank " << source_rank
387  << " (recvd " << recv_frag_count_.slotCount(source_rank) << ").";
388  break;
389  case Fragment::InitFragmentType:
390  TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << ".";
391  shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
392  shm_manager_->SetInitFragment(std::move(frag));
393  break;
394  case Fragment::EndOfRunFragmentType:
395  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
396  //shm_manager_->endRun();
397  break;
398  case Fragment::EndOfSubrunFragmentType:
399  //shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
400  TLOG(TLVL_DEBUG) << "Received EndOfSubrun Fragment from rank " << source_rank
401  << " with sequence_id " << header.sequence_id << ".";
402  if (header.sequence_id != Fragment::InvalidSequenceID)
403  shm_manager_->rolloverSubrun(header.sequence_id, header.timestamp);
404  else
405  shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank), header.timestamp);
406  break;
407  case Fragment::ShutdownFragmentType:
408  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
409  break;
410  }
411  }
412  }
413 
414  source_plugins_[source_rank]->flush_buffers();
415 
416  TLOG(TLVL_DEBUG) << "runReceiver_ " << source_rank << " receive loop exited";
417  running_sources_[source_rank] = false;
418 }
std::set< int > running_sources() const
Get the list of sources which are still receiving data.
End of Run mode (Used to end request processing on receiver)
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:66
This TransferInterface is a Receiver.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
std::set< int > enabled_sources() const
Get the list of enabled sources.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:46
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:39
void stop_threads()
Stop receiver threads.