artdaq  v3_02_01
DataReceiverManager.cc
1 #include <chrono>
2 
3 #define TRACE_NAME "DataReceiverManager"
4 #include "artdaq/DAQdata/Globals.hh"
5 #include "artdaq/DAQrate/DataReceiverManager.hh"
6 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "cetlib_except/exception.h"
9 #include <iomanip>
10 
11 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
12  : stop_requested_(false)
13  , stop_requested_time_(0)
14  , source_threads_()
15  , source_plugins_()
16  , source_metric_data_()
17  , source_metric_send_time_()
18  , enabled_sources_()
19  , running_sources_()
20  , recv_frag_count_()
21  , recv_frag_size_()
22  , recv_seq_count_()
23  , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
24  , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms", 1500))
25  , shm_manager_(shm)
26  , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
27  , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
28 {
29  TLOG(TLVL_DEBUG) << "Constructor";
30  auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
31  auto enabled_srcs_empty = enabled_srcs.size() == 0;
32 
33  if (non_reliable_mode_enabled_)
34  {
35  TLOG(TLVL_WARNING) << "DataReceiverManager is configured to drop data after " << non_reliable_mode_retry_count_
36  << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!";
37  }
38 
39  if (enabled_srcs_empty)
40  {
41  TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
42  }
43  else
44  {
45  for (auto& s : enabled_srcs)
46  {
47  enabled_sources_[s] = true;
48  }
49  }
50 
51  hostMap_t host_map = MakeHostMap(pset);
52  auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
53  for (auto& s : srcs.get_pset_names())
54  {
55  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
56  host_map = MakeHostMap(src_pset, 0, host_map);
57  }
58  auto host_map_pset = MakeHostMapPset(host_map);
59  fhicl::ParameterSet srcs_mod;
60  for (auto& s : srcs.get_pset_names())
61  {
62  auto src_pset = srcs.get<fhicl::ParameterSet>(s);
63  src_pset.erase("host_map");
64  src_pset.put<std::vector<fhicl::ParameterSet>>("host_map", host_map_pset);
65  srcs_mod.put<fhicl::ParameterSet>(s, src_pset);
66  }
67 
68  for (auto& s : srcs_mod.get_pset_names())
69  {
70  try
71  {
72  auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs_mod, s,
74  auto source_rank = transfer->source_rank();
75  if (enabled_srcs_empty) enabled_sources_[source_rank] = true;
76  else if (!enabled_sources_.count(source_rank)) enabled_sources_[source_rank] = false;
77  running_sources_[source_rank] = false;
78  source_plugins_[source_rank] = std::move(transfer);
79  source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
80  source_metric_data_[source_rank] = source_metric_data();
81  }
82  catch (cet::exception ex)
83  {
84  TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
85  }
86  catch (std::exception ex)
87  {
88  TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
89  }
90  catch (...)
91  {
92  TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
93  }
94  }
95  if (srcs.get_pset_names().size() == 0)
96  {
97  TLOG(TLVL_ERROR) << "No sources configured!";
98  }
99 }
100 
102 {
103  TLOG(TLVL_TRACE) << "~DataReceiverManager: BEGIN";
104  stop_threads();
105  shm_manager_.reset();
106  TLOG(TLVL_TRACE) << "Destructor END";
107 }
108 
109 
111 {
112  stop_requested_ = false;
113  if (shm_manager_) shm_manager_->setRequestMode(artdaq::detail::RequestMessageMode::Normal);
114  for (auto& source : source_plugins_)
115  {
116  auto& rank = source.first;
117  if (enabled_sources_.count(rank) && enabled_sources_[rank].load())
118  {
119  running_sources_[rank] = true;
120  boost::thread::attributes attrs;
121  attrs.set_stack_size(4096 * 500); // 2000 KB
122  source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
123  }
124  }
125 }
126 
128 {
129  TLOG(TLVL_TRACE) << "stop_threads: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount();
130 
131  stop_requested_time_ = TimeUtils::gettimeofday_us();
132  stop_requested_ = true;
133 
134  TLOG(TLVL_TRACE) << "stop_threads: Joining all threads";
135  for (auto& s : source_threads_)
136  {
137  auto& thread = s.second;
138  if (thread.joinable()) thread.join();
139  }
140 }
141 
143 {
144  std::set<int> output;
145  for (auto& src : enabled_sources_)
146  {
147  if (src.second) output.insert(src.first);
148  }
149  return output;
150 }
151 
153 {
154  std::set<int> output;
155  for (auto& src : running_sources_)
156  {
157  if (src.second) output.insert(src.first);
158  }
159  return output;
160 }
161 
162 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
163 {
164  std::chrono::steady_clock::time_point start_time, after_header, before_body;
165  int ret;
166  detail::RawFragmentHeader header;
167  size_t endOfDataCount = -1;
168  auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
169  auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
170 
171  while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
172  {
173  TLOG(16) << "runReceiver_: Begin loop";
174  std::this_thread::yield();
175 
176  // Don't stop receiving until we haven't received anything for 1 second
177  if (endOfDataCount <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
178  {
179  TLOG(TLVL_DEBUG) << "runReceiver_: End of Data conditions met, ending runReceiver loop";
180  break;
181  }
182 
183  start_time = std::chrono::steady_clock::now();
184 
185  TLOG(16) << "runReceiver_: Calling receiveFragmentHeader tmo=" << receive_timeout_;
186  ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
187  TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret=" << ret << " (should be " << source_rank << ")";
188  if (ret != source_rank)
189  {
190  if (ret >= 0) {
191  TLOG(TLVL_WARNING) << "Received Fragment from rank " << ret << ", but was expecting one from rank " << source_rank << "!";
192  }
193  else if(ret == TransferInterface::DATA_END)
194  {
195  TLOG(TLVL_ERROR) << "Transfer Plugin returned DATA_END, ending receive loop!";
196  break;
197  }
198  continue; // Receive timeout or other oddness
199  }
200 
201  after_header = std::chrono::steady_clock::now();
202 
203  if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
204  TLOG(TLVL_TRACE) << "Received Fragment Header from rank " << source_rank << ".";
205  RawDataType* loc = nullptr;
206  size_t retries = 0;
207  while (loc == nullptr)//&& TimeUtils::GetElapsedTimeMicroseconds(after_header)) < receive_timeout_)
208  {
209  loc = shm_manager_->WriteFragmentHeader(header);
210  if (loc == nullptr) usleep(sleep_time);
211  if (stop_requested_) return;
212  retries++;
213  if (non_reliable_mode_enabled_ && retries > max_retries)
214  {
215  loc = shm_manager_->WriteFragmentHeader(header, true);
216  }
217  }
218  if (loc == nullptr)
219  {
220  // Could not enqueue event!
221  TLOG(TLVL_ERROR) << "runReceiver_: Could not get data location for event " << header.sequence_id;
222  continue;
223  }
224  before_body = std::chrono::steady_clock::now();
225 
226  TLOG(16) << "runReceiver_: Calling receiveFragmentData";
227  auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
228  TLOG(16) << "runReceiver_: Done with receiveFragmentData, ret2=" << ret2 << " (should be " << source_rank << ")";
229 
230  if (ret != ret2) {
231  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
232  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
233  }
234 
235  shm_manager_->DoneWritingFragment(header);
236  TLOG(TLVL_TRACE) << "Done receiving fragment with sequence ID " << header.sequence_id << " from rank " << source_rank;
237 
238  recv_frag_count_.incSlot(source_rank);
239  recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
240  recv_seq_count_.setSlot(source_rank, header.sequence_id);
241  if (endOfDataCount != static_cast<size_t>(-1))
242  {
243  TLOG(TLVL_DEBUG) << "Received fragment " << header.sequence_id << " from rank " << source_rank
244  << " (" << recv_frag_count_.slotCount(source_rank) << "/" << endOfDataCount << ")";
245  }
246 
247 
248  source_metric_data_[source_rank].delta_t += TimeUtils::GetElapsedTime(start_time);
249  source_metric_data_[source_rank].hdr_delta_t += TimeUtils::GetElapsedTime(start_time, after_header);
250  source_metric_data_[source_rank].store_delta_t += TimeUtils::GetElapsedTime(after_header, before_body);
251  source_metric_data_[source_rank].data_delta_t += TimeUtils::GetElapsedTime(before_body);
252 
253  source_metric_data_[source_rank].data_size += header.word_count * sizeof(RawDataType);
254  source_metric_data_[source_rank].header_size += header.num_words() * sizeof(RawDataType);
255 
256  if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1)
257  {//&& recv_frag_count_.slotCount(source_rank) % 100 == 0) {
258  TLOG(6) << "runReceiver_: Sending receive stats";
259  metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].delta_t, "s", 5, MetricMode::Accumulate);
260  metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].data_size), "B", 5, MetricMode::Accumulate);
261  metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_size / source_metric_data_[source_rank].delta_t, "B/s", 5, MetricMode::Average);
262 
263  metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].hdr_delta_t, "s", 5, MetricMode::Accumulate);
264  metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].header_size), "B", 5, MetricMode::Accumulate);
265  metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].header_size / source_metric_data_[source_rank].hdr_delta_t, "B/s", 5, MetricMode::Average);
266 
267  auto payloadSize = source_metric_data_[source_rank].data_size - source_metric_data_[source_rank].header_size;
268  metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].data_delta_t, "s", 5, MetricMode::Accumulate);
269  metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(payloadSize), "B", 5, MetricMode::Accumulate);
270  metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), payloadSize / source_metric_data_[source_rank].data_delta_t, "B/s", 5, MetricMode::Average);
271 
272  metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::LastPoint);
273 
274  metricMan->sendMetric("Shared Memory Wait Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].store_delta_t, "s", 5, MetricMode::Accumulate);
275 
276  TLOG(6) << "runReceiver_: Done sending receive stats";
277 
278  source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
279  source_metric_data_[source_rank] = source_metric_data();
280  }
281  }
282  else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
283  {
284  TLOG(TLVL_DEBUG) << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << ".";
285 
286  FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
287  memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
288  auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
289  if (ret3 != source_rank)
290  {
291  TLOG(TLVL_ERROR) << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
292  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
293  }
294 
295  switch (header.type)
296  {
297  case Fragment::EndOfDataFragmentType:
298  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
299  endOfDataCount = *(frag->dataBegin());
300  TLOG(TLVL_DEBUG) << "EndOfData Fragment indicates that " << endOfDataCount << " fragments are expected from rank " << source_rank
301  << " (recvd " << recv_frag_count_.slotCount(source_rank) << ").";
302  break;
303  case Fragment::InitFragmentType:
304  TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << source_rank << ".";
305  shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
306  shm_manager_->SetInitFragment(std::move(frag));
307  break;
308  case Fragment::EndOfRunFragmentType:
309  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
310  //shm_manager_->endRun();
311  break;
312  case Fragment::EndOfSubrunFragmentType:
313  //shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
314  if (header.sequence_id != Fragment::InvalidSequenceID) shm_manager_->rolloverSubrun(header.sequence_id);
315  else shm_manager_->rolloverSubrun(recv_seq_count_.slotCount(source_rank));
316  break;
317  case Fragment::ShutdownFragmentType:
318  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
319  break;
320  }
321  }
322  }
323 
324 
325  running_sources_[source_rank] = false;
326 }
std::set< int > running_sources() const
Get the list of sources which are still receiving data.
End of Run mode (Used to end request processing on receiver)
This TransferInterface is a Receiver.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
std::set< int > enabled_sources() const
Get the list of enabled sources.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, DestinationInfo > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
Definition: HostMap.hh:56
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
std::map< int, DestinationInfo > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Definition: HostMap.hh:49
hostMap_t MakeHostMap(fhicl::ParameterSet pset, int masterPortOffset=0, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:77
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.
void stop_threads()
Stop receiver threads.