artdaq  v3_07_01
FragmentReceiverManager.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_FragmentReceiverManager").c_str()
3 
4 #include <chrono>
5 
6 #include "artdaq/DAQrate/FragmentReceiverManager.hh"
7 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
8 #include "cetlib_except/exception.h"
9 
11  : stop_requested_(false)
12  , source_threads_()
13  , source_plugins_()
14  , source_metric_data_()
15  , source_metric_send_time_()
16  , enabled_sources_()
17  , fragment_store_()
18  , recv_frag_count_()
19  , recv_frag_size_()
20  , recv_seq_count_()
21  , suppress_noisy_senders_(pset.get<bool>("auto_suppression_enabled", true))
22  , suppression_threshold_(pset.get<size_t>("max_receive_difference", 50))
23  , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
24  , last_source_(-1)
25 {
 // Configuration keys read in the initializer list (with their defaults):
 //   auto_suppression_enabled (true), max_receive_difference (50), receive_timeout_usec (100000).
26  TLOG(TLVL_DEBUG) << "Constructor";
 // Optional explicit list of enabled source ranks. An empty (or absent) list means that every
 // source found in the "sources" table below is marked enabled.
27  auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
28  auto enabled_srcs_empty = enabled_srcs.size() == 0;
29  if (enabled_srcs_empty)
30  {
31  TLOG(TLVL_INFO) << "enabled_sources not specified, assuming all sources enabled.";
32  }
33  else
34  {
35  for (auto& s : enabled_srcs)
36  {
37  enabled_sources_[s] = true;
38  }
39  }
40 
 // Create one transfer plugin per entry of the "sources" table and set up the per-rank
 // bookkeeping (enabled/running flags, plugin, fragment store, metric accumulators).
41  auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
42  for (auto& s : srcs.get_pset_names())
43  {
44  try
45  {
46  auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs, s,
48  auto source_rank = transfer->source_rank();
 // A configured source that is missing from a non-empty enabled_sources list is recorded
 // with the value false (the key is still inserted into enabled_sources_).
49  if (enabled_srcs_empty)
50  enabled_sources_[source_rank] = true;
51  else if (!enabled_sources_.count(source_rank))
52  enabled_sources_[source_rank] = false;
53  running_sources_[source_rank] = false;
54  source_plugins_[source_rank] = std::move(transfer);
 // The bare subscript presumably default-creates the per-rank store entry so that later
 // lookups find it - confirm against the container type of fragment_store_.
55  fragment_store_[source_rank];
56  source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
57  source_metric_data_[source_rank] = std::pair<size_t, double>();
58  }
 // A failure while setting up one source is only logged as a warning; the loop then
 // continues with the next configured source.
59  catch (const cet::exception& ex)
60  {
61  TLOG(TLVL_WARNING) << "cet::exception caught while setting up source " << s << ": " << ex.what();
62  }
63  catch (const std::exception& ex)
64  {
65  TLOG(TLVL_WARNING) << "std::exception caught while setting up source " << s << ": " << ex.what();
66  }
67  catch (...)
68  {
69  TLOG(TLVL_WARNING) << "Non-cet exception caught while setting up source " << s << ".";
70  }
71  }
 // An empty "sources" table is reported as an error but does not abort construction.
72  if (srcs.get_pset_names().size() == 0)
73  {
74  TLOG(TLVL_ERROR) << "No sources configured!";
75  }
76 }
77 
79 {
 // Teardown sequence: raise the stop flag, wake any receiver thread that is waiting on
 // output_cv_ (the suppression wait in runReceiver_), then join every receiver thread.
80  TLOG(TLVL_DEBUG) << "Destructor";
81  TLOG(5) << "~FragmentReceiverManager: BEGIN: Setting stop_requested to true, frags=" << count() << ", bytes=" << byteCount();
82  stop_requested_ = true;
83 
84  TLOG(5) << "~FragmentReceiverManager: Notifying all threads";
85  output_cv_.notify_all();
86 
87  TLOG(5) << "~FragmentReceiverManager: Joining all threads";
88  for (auto& s : source_threads_)
89  {
90  auto& thread = s.second;
 // Only joinable threads are joined; entries that never started a thread are skipped.
91  if (thread.joinable()) thread.join();
92  }
93  TLOG(5) << "~FragmentReceiverManager: DONE";
94 }
95 
96 bool artdaq::FragmentReceiverManager::fragments_ready_() const
97 {
98  for (auto& it : fragment_store_)
99  {
100  if (!enabled_sources_.count(it.first)) continue;
101  if (!it.second.empty()) { return true; }
102  }
103  return false;
104 }
105 
106 int artdaq::FragmentReceiverManager::get_next_source_() const
107 {
108  //std::unique_lock<std::mutex> lck(fragment_store_mutex_);
109  std::set<int> ready_sources;
110  for (auto& it : fragment_store_)
111  {
112  if (!enabled_sources_.count(it.first)) continue;
113  if (!it.second.empty())
114  {
115  ready_sources.insert(it.first);
116  }
117  }
118 
119  if (ready_sources.size())
120  {
121  auto iter = ready_sources.find(last_source_);
122  if (iter == ready_sources.end() || ++iter == ready_sources.end())
123  {
124  TLOG(10) << "get_next_source returning " << *ready_sources.begin();
125  last_source_ = *ready_sources.begin();
126  return *ready_sources.begin();
127  }
128 
129  TLOG(10) << "get_next_source returning " << *iter;
130  last_source_ = *iter;
131  return *iter;
132  }
133 
134  TLOG(10) << "get_next_source returning -1";
135  return -1;
136 }
137 
139 {
 // Launch one receiver thread (running runReceiver_) for every configured source plugin
 // whose rank has an entry in enabled_sources_.
 // NOTE(review): count() only tests that the key exists; the constructor also stores `false`
 // entries, so this does not look at the enabled flag itself - confirm this is intended.
140  for (auto& source : source_plugins_)
141  {
142  auto& rank = source.first;
143  if (enabled_sources_.count(rank))
144  {
 // The running flag is raised before the thread is created.
145  running_sources_[rank] = true;
146  try
147  {
148  source_threads_[rank] = boost::thread(&FragmentReceiverManager::runReceiver_, this, rank);
149  }
150  catch (const boost::exception& e)
151  {
 // Failure to start a receiver thread is fatal: report it on both the trace log and stderr,
 // then terminate the process with exit code 5.
152  TLOG(TLVL_ERROR) << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
153  std::cerr << "Caught boost::exception starting Receiver " << rank << " thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
154  exit(5);
155  }
156  }
157  }
158 }
159 
// Hands the caller the next stored Fragment, waiting for one to arrive if necessary.
//   rank         - on success, set to the source rank the returned Fragment was taken from
//   timeout_usec - wait budget in microseconds; 0 selects a default of 1000000
// Returns an empty pointer if no Fragment became ready within the wait budget or if no
// source is running any more.
160 artdaq::FragmentPtr artdaq::FragmentReceiverManager::recvFragment(int& rank, size_t timeout_usec)
161 {
162  TLOG(5) << "recvFragment entered tmo=" << timeout_usec << " us";
163 
164  if (timeout_usec == 0) timeout_usec = 1000000;
165 
166  auto ready = fragments_ready_();
167  size_t waited = 0;
 // Wait slice: one thousandth of the timeout, but never less than 1000 us.
168  auto wait_amount = timeout_usec / 1000 > 1000 ? timeout_usec / 1000 : 1000;
169  TLOG(5) << "recvFragment fragment_ready_=" << ready << " before wait";
170  while (!ready && waited < timeout_usec)
171  {
172  {
 // Receiver threads call input_cv_.notify_all() after storing a Fragment.
173  std::unique_lock<std::mutex> lck(input_cv_mutex_);
174  input_cv_.wait_for(lck, std::chrono::microseconds(wait_amount));
175  }
 // NOTE(review): `waited` advances by the full slice even when wait_for() returned early,
 // so the real elapsed time can be shorter than timeout_usec.
176  waited += wait_amount;
177  ready = fragments_ready_();
 // Stop waiting as soon as no source is marked running.
178  if (running_sources().size() == 0) break;
179  }
180  TLOG(5) << "recvFragment fragment_ready_=" << ready << " after waited=" << waited;
181  if (!ready)
182  {
183  TLOG(5) << "recvFragment: No fragments ready, returning empty";
185  return std::unique_ptr<Fragment>{};
186  }
187 
 // NOTE(review): get_next_source_() returns -1 when it finds no ready source; that value is
 // used as a store index here without a check - confirm it cannot happen after `ready` is true.
188  int current_source = get_next_source_();
 // front() presumably removes and returns the oldest stored Fragment - confirm against the
 // store element's definition.
189  FragmentPtr current_fragment = fragment_store_[current_source].front();
 // Wake receiver threads that are waiting on output_cv_ in their suppression loop.
190  output_cv_.notify_all();
191  rank = current_source;
192 
193  if (current_fragment != nullptr)
194  TLOG(5) << "recvFragment: Done rank=" << rank << ", fragment size=" << std::to_string(current_fragment->size()) << " words, seqId=" << current_fragment->sequenceID();
195  return current_fragment;
196 }
197 
199 {
 // Collect the ranks whose running flag is currently true, in ascending order (std::set).
200  std::set<int> output;
201  for (auto& src : running_sources_)
202  {
203  if (src.second) output.insert(src.first);
204  }
205  return output;
206 }
207 
209 {
 // Collect the ranks whose enabled flag is true; entries stored as false are left out.
210  std::set<int> output;
211  for (auto& src : enabled_sources_)
212  {
213  if (src.second) output.insert(src.first);
214  }
215  return output;
216 }
217 
218 void artdaq::FragmentReceiverManager::runReceiver_(int source_rank)
219 {
220  while (!stop_requested_ && enabled_sources_.count(source_rank))
221  {
222  TLOG(16) << "runReceiver_ " << source_rank << ": Begin loop";
223  auto is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount();
224  while (!stop_requested_ && is_suppressed)
225  {
226  TLOG(6) << "runReceiver_: Suppressing receiver rank " << source_rank;
227  if (!is_suppressed)
228  input_cv_.notify_all();
229  else
230  {
231  std::unique_lock<std::mutex> lck(output_cv_mutex_);
232  output_cv_.wait_for(lck, std::chrono::seconds(1));
233  }
234  is_suppressed = suppress_noisy_senders_ && recv_seq_count_.slotCount(source_rank) > suppression_threshold_ + recv_seq_count_.minCount();
235  }
236  if (stop_requested_)
237  {
238  running_sources_[source_rank] = false;
239  return;
240  }
241 
242  if (fragment_store_[source_rank].GetEndOfData() <= recv_frag_count_.slotCount(source_rank) && !source_plugins_[source_rank]->isRunning())
243  {
244  TLOG(TLVL_DEBUG) << "runReceiver_: EndOfData conditions satisfied, ending receive loop";
245  running_sources_[source_rank] = false;
246  return;
247  }
248 
249  auto start_time = std::chrono::steady_clock::now();
250  TLOG(16) << "runReceiver_: Calling receiveFragment";
251  auto fragment = std::unique_ptr<Fragment>(new Fragment());
252 #if 0
253  auto ret = source_plugins_[source_rank]->receiveFragment(*fragment, receive_timeout_);
254  TLOG(16) << "runReceiver_: Done with receiveFragment, ret=" << ret << " (should be " << source_rank << ")";
255  if (ret != source_rank) continue; // Receive timeout or other oddness
256 #else
257  artdaq::detail::RawFragmentHeader hdr;
258  auto ret1 = source_plugins_[source_rank]->receiveFragmentHeader(hdr, receive_timeout_);
259  TLOG(16) << "runReceiver_: Done with receiveFragmentHeader, ret1=" << ret1 << " (should be " << source_rank << ")";
260 
261  if (ret1 != source_rank) continue; // Receive timeout or other oddness
262 
263  fragment->resize(hdr.word_count - hdr.num_words());
264  memcpy(fragment->headerAddress(), &hdr, hdr.num_words() * sizeof(artdaq::RawDataType));
265  auto ret2 = source_plugins_[source_rank]->receiveFragmentData(fragment->headerAddress() + hdr.num_words(), hdr.word_count - hdr.num_words());
266  if (ret2 != ret1)
267  {
268  TLOG(TLVL_ERROR) << "ReceiveFragmentHeader returned " << ret1 << ", but ReceiveFragmentData returned " << ret2;
269  continue;
270  }
271 #endif
272 
273  if (fragment->type() == artdaq::Fragment::EndOfDataFragmentType)
274  {
275  TLOG(TLVL_TRACE) << "runReceiver_: EndOfData Fragment received!";
276  fragment_store_[source_rank].SetEndOfData(*reinterpret_cast<size_t*>(fragment->dataBegin()));
277  }
278  else if (fragment->type() == artdaq::Fragment::DataFragmentType || fragment->type() == artdaq::Fragment::ContainerFragmentType || fragment->isUserFragmentType(fragment->type()))
279  {
280  TLOG(TLVL_TRACE) << "runReceiver_: Data Fragment received!";
281  recv_frag_count_.incSlot(source_rank);
282  recv_frag_size_.incSlot(source_rank, fragment->size() * sizeof(RawDataType));
283  recv_seq_count_.setSlot(source_rank, fragment->sequenceID());
284  }
285  else
286  {
287  continue;
288  }
289 
290  auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
291  source_metric_data_[source_rank].first += fragment->size() * sizeof(RawDataType);
292  source_metric_data_[source_rank].second += delta_t;
293 
294  if (metricMan && TimeUtils::GetElapsedTime(source_metric_send_time_[source_rank]) > 1)
295  {
296  TLOG(6) << "runReceiver_: Sending receive stats";
297  metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].second, "s", 1, MetricMode::Accumulate);
298  metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(source_metric_data_[source_rank].first), "B", 1, MetricMode::Accumulate);
299  metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), source_metric_data_[source_rank].first / source_metric_data_[source_rank].second, "B/s", 1, MetricMode::Average);
300 
301  source_metric_send_time_[source_rank] = std::chrono::steady_clock::now();
302  source_metric_data_[source_rank].first = 0;
303  source_metric_data_[source_rank].second = 0.0;
304  }
305 
306  fragment_store_[source_rank].emplace_back(std::move(fragment));
307  TLOG(TLVL_TRACE) << "runReceiver_: There are now " << fragment_store_[source_rank].size() << " Fragments stored from this source";
308  input_cv_.notify_all();
309  }
310 
311  running_sources_[source_rank] = false;
312 }
void start_threads()
Start receiver threads for all enabled sources.
std::set< int > enabled_sources() const
Get the list of enabled sources.
virtual ~FragmentReceiverManager()
FragmentReceiverManager Destructor.
This TransferInterface is a Receiver.
FragmentReceiverManager(const fhicl::ParameterSet &ps)
FragmentReceiverManager Constructor.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
FragmentPtr recvFragment(int &rank, size_t timeout_usec=0)
Receive a Fragment.
std::set< int > running_sources() const
Get the list of sources which are still receiving data.
Value to be returned upon receive timeout.