artdaq  v3_00_01
DataReceiverManager.cc
1 #include <chrono>
2 
3 #define TRACE_NAME "DataReceiverManager"
4 #include "artdaq/DAQdata/Globals.hh"
5 #include "artdaq/DAQrate/DataReceiverManager.hh"
6 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "cetlib_except/exception.h"
8 #include <iomanip>
9 
10 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
11  : stop_requested_(false)
12  , stop_requested_time_(0)
13  , source_threads_()
14  , source_plugins_()
15  , enabled_sources_()
16  , recv_frag_count_()
17  , recv_frag_size_()
18  , recv_seq_count_()
19  , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
20  , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms",3000))
21  , shm_manager_(shm)
22  , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
23  , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
24 {
25  TLOG_DEBUG("DataReceiverManager") << "Constructor" << TLOG_ENDL;
26  auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
27  auto enabled_srcs_empty = enabled_srcs.size() == 0;
28 
29  if (non_reliable_mode_enabled_)
30  {
31  TLOG_WARNING("DataReceiverManager") << "DataReceiverManager is configured to drop data after " << std::to_string(non_reliable_mode_retry_count_)
32  << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!" << TLOG_ENDL;
33  }
34 
35  if (enabled_srcs_empty)
36  {
37  TLOG_INFO("DataReceiverManager") << "enabled_sources not specified, assuming all sources enabled." << TLOG_ENDL;
38  }
39  else
40  {
41  for (auto& s : enabled_srcs)
42  {
43  enabled_sources_.insert(s);
44  }
45  }
46 
47  auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
48  for (auto& s : srcs.get_pset_names())
49  {
50  try
51  {
52  auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs, s,
54  auto source_rank = transfer->source_rank();
55  if (enabled_srcs_empty) enabled_sources_.insert(source_rank);
56  source_plugins_[source_rank] = std::move(transfer);
57  }
58  catch (cet::exception ex)
59  {
60  TLOG_WARNING("DataReceiverManager") << "cet::exception caught while setting up source " << s << ": " << ex.what() << TLOG_ENDL;
61  }
62  catch (std::exception ex)
63  {
64  TLOG_WARNING("DataReceiverManager") << "std::exception caught while setting up source " << s << ": " << ex.what() << TLOG_ENDL;
65  }
66  catch (...)
67  {
68  TLOG_WARNING("DataReceiverManager") << "Non-cet exception caught while setting up source " << s << "." << TLOG_ENDL;
69  }
70  }
71  if (srcs.get_pset_names().size() == 0)
72  {
73  TLOG_ERROR("DataReceiverManager") << "No sources configured!" << TLOG_ENDL;
74  }
75 }
76 
78 {
79  TLOG_TRACE("DataReceiverManager") << "~DataReceiverManager: BEGIN: Setting stop_requested to true, frags=" << std::to_string(count()) << ", bytes=" << std::to_string(byteCount()) << TLOG_ENDL;
80  stop_requested_time_ = TimeUtils::gettimeofday_us();
81  stop_requested_ = true;
82 
83  TLOG_TRACE("DataReceiverManager") << "~DataReceiverManager: Joining all threads" << TLOG_ENDL;
84  for (auto& s : source_threads_)
85  {
86  auto& thread = s.second;
87  if (thread.joinable()) thread.join();
88  }
89  shm_manager_.reset();
90  TLOG_TRACE("DataReceiverManager") << "Destructor END" << TLOG_ENDL;
91 }
92 
93 
95 {
96  for (auto& source : source_plugins_)
97  {
98  auto& rank = source.first;
99  if (enabled_sources_.count(rank))
100  {
101  running_sources_.insert(rank);
102  boost::thread::attributes attrs;
103  attrs.set_stack_size(4096 * 500); // 2000 KB
104  source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
105  }
106  }
107 }
108 
109 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
110 {
111  std::chrono::steady_clock::time_point start_time, after_header, before_body;
112  int ret;
113  double delta_t, hdr_delta_t, store_delta_t, data_delta_t;
114  detail::RawFragmentHeader header;
115  size_t endOfDataCount = -1;
116  auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
117  auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
118 
119  while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
120  {
121  TRACE(16, "DataReceiverManager::runReceiver_: Begin loop");
122  if (stop_requested_) { receive_timeout_ = stop_timeout_ms_; }
123 
124  start_time = std::chrono::steady_clock::now();
125 
126  TRACE(16, "DataReceiverManager::runReceiver_: Calling receiveFragmentHeader");
127  ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
128  TRACE(16, "DataReceiverManager::runReceiver_: Done with receiveFragmentHeader, ret=%d (should be %d)", ret, source_rank);
129  if (ret != source_rank)
130  {
131  continue; // Receive timeout or other oddness
132  }
133 
134  after_header = std::chrono::steady_clock::now();
135 
136  if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
137  TLOG_TRACE("DataReceiverManager") << "Received Fragment Header from rank " << source_rank << "." << TLOG_ENDL;
138  RawDataType* loc = nullptr;
139  size_t retries = 0;
140  while (loc == nullptr )//&& TimeUtils::GetElapsedTimeMicroseconds(after_header)) < receive_timeout_)
141  {
142  loc = shm_manager_->WriteFragmentHeader(header);
143  if (loc == nullptr) usleep(sleep_time);
144  if (stop_requested_) return;
145  retries++;
146  if (non_reliable_mode_enabled_ && retries > max_retries)
147  {
148  loc = shm_manager_->WriteFragmentHeader(header, true);
149  }
150  }
151  if (loc == nullptr)
152  {
153  // Could not enqueue event!
154  TLOG_ERROR("DataReceiverManager") << "runReceiver_: Could not get data location for event " << std::to_string(header.sequence_id) << TLOG_ENDL;
155  continue;
156  }
157  before_body = std::chrono::steady_clock::now();
158 
159  TRACE(16, "DataReceiverManager::runReceiver_: Calling receiveFragmentData");
160  auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
161  TRACE(16, "DataReceiverManager::runReceiver_: Done with receiveFragmentData, ret2=%d (should be %d)", ret2, source_rank);
162 
163  if (ret != ret2) {
164  TLOG_ERROR("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")" << TLOG_ENDL;
165  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
166  }
167 
168  shm_manager_->DoneWritingFragment(header);
169  TLOG_TRACE("DataReceiverManager") << "Done receiving fragment with sequence ID " << std::to_string(header.sequence_id) << " from rank " << source_rank << TLOG_ENDL;
170 
171  recv_frag_count_.incSlot(source_rank);
172  recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
173  recv_seq_count_.setSlot(source_rank, header.sequence_id);
174  if (endOfDataCount != static_cast<size_t>(-1))
175  {
176  TLOG_DEBUG("DataReceiverManager") << "Received fragment " << std::to_string(header.sequence_id) << " from rank " << source_rank
177  << " (" << std::to_string(recv_frag_count_.slotCount(source_rank)) << "/" << std::to_string(endOfDataCount) << ")" << TLOG_ENDL;
178  }
179 
180  if (metricMan)
181  {//&& recv_frag_count_.slotCount(source_rank) % 100 == 0) {
182  TRACE(6, "DataReceiverManager::runReceiver_: Sending receive stats");
183  delta_t = TimeUtils::GetElapsedTime(start_time);
184  hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
185  store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
186  data_delta_t = TimeUtils::GetElapsedTime(before_body);
187  metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 1, MetricMode::Accumulate);
188  metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.word_count * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
189  metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), header.word_count * sizeof(RawDataType) / delta_t, "B/s", 1, MetricMode::Average);
190 
191  metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t, "s", 1, MetricMode::Accumulate);
192  metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.num_words() * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
193  metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), header.num_words() * sizeof(RawDataType) / hdr_delta_t, "B/s", 1, MetricMode::Average);
194 
195  metricMan->sendMetric("Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 1, MetricMode::Accumulate);
196 
197  metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t, "s", 1, MetricMode::Accumulate);
198  metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>((header.word_count - header.num_words()) * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
199  metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), (header.word_count - header.num_words()) * sizeof(RawDataType) / data_delta_t, "B/s", 1, MetricMode::Average);
200  metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::Accumulate);
201  TRACE(6, "DataReceiverManager::runReceiver_: Done sending receive stats");
202  }
203  }
204  else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
205  {
206  TLOG_DEBUG("DataReceiverManager") << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << "." << TLOG_ENDL;
207 
208  FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
209  memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
210  auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
211  if (ret3 != source_rank)
212  {
213  TLOG_ERROR("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")" << TLOG_ENDL;
214  throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
215  }
216 
217  switch (header.type)
218  {
219  case Fragment::EndOfDataFragmentType:
220  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
221  endOfDataCount = *(frag->dataBegin());
222  TLOG_DEBUG("DataReceiverManager") << "EndOfData Fragment indicates that " << std::to_string(endOfDataCount) << " fragments are expected from rank " << source_rank
223  << " (recvd " << std::to_string(recv_frag_count_.slotCount(source_rank)) << ")." << TLOG_ENDL;
224  break;
225  case Fragment::InitFragmentType:
226  TLOG_DEBUG("DataReceiverManager") << "Received Init Fragment from rank " << source_rank << "." << TLOG_ENDL;
227  shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
228  shm_manager_->SetInitFragment(std::move(frag));
229  break;
230  case Fragment::EndOfRunFragmentType:
231  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
232  break;
233  case Fragment::EndOfSubrunFragmentType:
234  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
235  break;
236  case Fragment::ShutdownFragmentType:
237  shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
238  break;
239  }
240  }
241 
242  if (endOfDataCount <= recv_frag_count_.slotCount(source_rank))
243  {
244  running_sources_.erase(source_rank);
245  return;
246  }
247  }
248 }
End of Run mode (Used to end request processing on receiver)
This TransferInterface is a Receiver.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
DataReceiverManager(const fhicl::ParameterSet &ps, std::shared_ptr< SharedMemoryEventManager > shm)
DataReceiverManager Constructor.
virtual ~DataReceiverManager()
DataReceiverManager Destructor.
void start_threads()
Start receiver threads for all enabled sources.