artdaq  v2_03_02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Pages
AggregatorCore.cc
1 #pragma GCC diagnostic push
2 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3 #include <xmlrpc-c/client_simple.hpp>
4 #pragma GCC diagnostic pop
5 
6 #include <errno.h>
7 #include <sstream>
8 #include <iomanip>
9 #include <bitset>
10 
11 #include <boost/tokenizer.hpp>
12 #include <boost/filesystem.hpp>
13 #include <boost/algorithm/string.hpp>
14 #include "art/Framework/Art/artapp.h"
15 #include "cetlib/BasicPluginFactory.h"
16 
17 #include "artdaq-core/Core/SimpleQueueReader.hh"
18 #include "artdaq-core/Utilities/ExceptionHandler.hh"
19 #include "artdaq-core/Data/RawEvent.hh"
20 
21 #include "artdaq/Application/AggregatorCore.hh"
22 #include "artdaq/DAQrate/EventStore.hh"
23 #include "artdaq/DAQrate/detail/FragCounter.hh"
24 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
25 
26 
27 namespace BFS = boost::filesystem;
28 
// Definitions of the AggregatorCore statistics keys. Within this file they are
// the names passed to stats_helper_.addSample(...) and to
// StatisticsCollection::getMonitoredQuantity(...) when samples are recorded
// and looked up.
29 const std::string artdaq::AggregatorCore::INPUT_EVENTS_STAT_KEY("AggregatorCoreInputEvents");
30 const std::string artdaq::AggregatorCore::INPUT_WAIT_STAT_KEY("AggregatorCoreInputWaitTime");
31 const std::string artdaq::AggregatorCore::STORE_EVENT_WAIT_STAT_KEY("AggregatorCoreStoreEventWaitTime");
32 const std::string artdaq::AggregatorCore::SHM_COPY_TIME_STAT_KEY("AggregatorCoreShmCopyTime");
33 const std::string artdaq::AggregatorCore::FILE_CHECK_TIME_STAT_KEY("AggregatorCoreFileCheckTime");
34 
35 namespace artdaq
36 {
43  void display_bits(void* memstart, size_t nbytes, std::string sourcename)
44  {
45  std::stringstream bitstr;
46  bitstr << "The " << nbytes << "-byte chunk of memory beginning at " << static_cast<void*>(memstart) << " is : ";
47 
48  for (unsigned int i = 0; i < nbytes; i++)
49  {
50  if (i % 4 == 0)
51  {
52  bitstr << "\n";
53  }
54 
55  bitstr << std::bitset<8>(*((reinterpret_cast<uint8_t*>(memstart)) + i)) << " ";
56  }
57 
58  TLOG_DEBUG(sourcename) << bitstr.str() << TLOG_ENDL;
59  }
60 }
61 
62 
// Constructs an AggregatorCore that is not yet configured: every request and
// state flag starts out false, no previous run duration is known (-1.0), and
// no newly registered transfers are pending.
//
// rank - stored in my_rank.
// name - stored in name_; this file passes name_ to the TLOG_* macros.
63 artdaq::AggregatorCore::AggregatorCore(int rank, std::string name)
64  : name_(name)
65  , art_initialized_(false)
66  , event_queue_(artdaq::getGlobalQueue(10))
67  , stop_requested_(false)
68  , local_pause_requested_(false)
69  , processing_fragments_(false)
70  , system_pause_requested_(false)
71  , previous_run_duration_(-1.0)
72  , new_transfers_(0)
73 {
74  TLOG_DEBUG(name_) << "Constructor" << TLOG_ENDL;
// NOTE(review): metricMan and my_rank are not declared in this file and are
// not part of the initializer list; presumably they are process-wide
// variables - confirm against their declarations.
80  metricMan = &metricMan_;
81  my_rank = rank;
82 }
83 
// NOTE(review): the signature line for this body is missing from the listing;
// judging by the logged text this is presumably the AggregatorCore destructor
// - confirm against the original file.
// The body only logs a debug message; nothing is released explicitly here.
85 {
86  TLOG_DEBUG(name_) << "Destructor" << TLOG_ENDL;
87 }
88 
89 bool artdaq::AggregatorCore::initialize(fhicl::ParameterSet const& pset)
90 {
91  init_string_ = pset.to_string();
92  TLOG_DEBUG(name_) << "initialize method called with DAQ " << "ParameterSet = \"" << init_string_ << "\"." << TLOG_ENDL;
93 
94  // pull out the relevant parts of the ParameterSet
95  fhicl::ParameterSet daq_pset;
96  try
97  {
98  daq_pset = pset.get<fhicl::ParameterSet>("daq");
99  }
100  catch (...)
101  {
102  TLOG_ERROR(name_)
103  << "Unable to find the DAQ parameters in the initialization "
104  << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
105  return false;
106  }
107  fhicl::ParameterSet agg_pset;
108  try
109  {
110  agg_pset = daq_pset.get<fhicl::ParameterSet>("aggregator");
111  data_pset_ = agg_pset;
112  }
113  catch (...)
114  {
115  TLOG_ERROR(name_)
116  << "Unable to find the aggregator parameters in the DAQ "
117  << "initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
118  return false;
119  }
120  try
121  {
122  expected_events_per_bunch_ =
123  agg_pset.get<size_t>("expected_events_per_bunch");
124  }
125  catch (...)
126  {
127  TLOG_ERROR(name_)
128  << "The expected_events_per_bunch parameter was not specified "
129  << "in the aggregator initialization PSet: \"" << pset.to_string()
130  << "\"." << TLOG_ENDL;
131  return false;
132  }
133 
134  enq_timeout_ = static_cast<detail::seconds>(agg_pset.get<size_t>("enq_timeout", 5.0));
135 
136  // 15-Jun-2016, KAB: added ability to specify either is_data_logger or
137  // is_online_monitor in the parameter set. If neither are set in the PSet,
138  // then we default to the old-style of behavior in which the first AG is the
139  // data logger and the second is the online monitor.
140  is_data_logger_ = false;
141  is_online_monitor_ = false;
142  is_dispatcher_ = false;
143  std::string metricsReportingInstanceName = "Data Logger";
144  bool agtype_was_specified = false;
145  if (!agtype_was_specified)
146  {
147  try
148  {
149  is_data_logger_ = agg_pset.get<bool>("is_data_logger");
150  agtype_was_specified = true;
151  }
152  catch (...) {} // leave agtype_was_specified set to false
153  }
154  if (!agtype_was_specified)
155  {
156  try
157  {
158  is_online_monitor_ = agg_pset.get<bool>("is_online_monitor");
159  metricsReportingInstanceName = "Online Monitor";
160  agtype_was_specified = true;
161  }
162  catch (...) {} // leave agtype_was_specified set to false
163  }
164  if (!agtype_was_specified)
165  {
166  try
167  {
168  is_dispatcher_ = agg_pset.get<bool>("is_dispatcher");
169  metricsReportingInstanceName = "Dispatcher";
170  agtype_was_specified = true;
171  }
172  catch (...) {} // leave agtype_was_specified set to false
173  }
174 
175  if (!agtype_was_specified)
176  {
177  throw cet::exception("ConfigurationException", "You must specify one of is_data_logger, is_online_monitor or is_dispatcher");
178  return false;
179  }
180  TLOG_DEBUG(name_) << "Rank " << my_rank
181  << ", is_data_logger = " << is_data_logger_
182  << ", is_online_monitor = " << is_online_monitor_
183  << ", is_dispatcher = " << is_dispatcher_ << TLOG_ENDL;
184 
185  disk_writing_directory_ = "";
186  try
187  {
188  fhicl::ParameterSet output_pset =
189  pset.get<fhicl::ParameterSet>("outputs");
190  fhicl::ParameterSet normalout_pset =
191  output_pset.get<fhicl::ParameterSet>("normalOutput");
192 
193  if (!normalout_pset.is_empty())
194  {
195  std::string filename = normalout_pset.get<std::string>("fileName", "");
196  if (filename.size() > 0)
197  {
198  size_t pos = filename.rfind("/");
199  if (pos != std::string::npos)
200  {
201  disk_writing_directory_ = filename.substr(0, pos);
202  }
203  }
204  else
205  {
206  TLOG_WARNING(name_) << "Problem finding \"fileName\" parameter in \"normalOutput\" RootOutput module FHiCL code" << TLOG_ENDL;
207  }
208  }
209  }
210  catch (...) {}
211 
212  std::string xmlrpcClientString =
213  agg_pset.get<std::string>("xmlrpc_client_list", "");
214  if (xmlrpcClientString.size() > 0)
215  {
216  xmlrpc_client_lists_.clear();
217  boost::char_separator<char> sep1(";");
218  boost::tokenizer<boost::char_separator<char>>
219  primaryTokens(xmlrpcClientString, sep1);
220  boost::tokenizer<boost::char_separator<char>>::iterator iter1;
221  boost::tokenizer<boost::char_separator<char>>::iterator
222  endIter1 = primaryTokens.end();
223  for (iter1 = primaryTokens.begin(); iter1 != endIter1; ++iter1)
224  {
225  boost::char_separator<char> sep2(",");
226  boost::tokenizer<boost::char_separator<char>>
227  secondaryTokens(*iter1, sep2);
228  boost::tokenizer<boost::char_separator<char>>::iterator iter2;
229  boost::tokenizer<boost::char_separator<char>>::iterator
230  endIter2 = secondaryTokens.end();
231  int clientGroup = -1;
232  std::string url = "";
233  int loopCount = 0;
234  for (iter2 = secondaryTokens.begin(); iter2 != endIter2; ++iter2)
235  {
236  switch (loopCount)
237  {
238  case 0:
239  url = *iter2;
240  break;
241  case 1:
242  try
243  {
244  clientGroup = boost::lexical_cast<int>(*iter2);
245  }
246  catch (...) {}
247  break;
248  default:
249  TLOG_WARNING(name_)
250  << "Unexpected XMLRPC client list element, index = "
251  << loopCount << ", value = \"" << *iter2 << "\"" << TLOG_ENDL;
252  }
253  ++loopCount;
254  }
255  if (clientGroup >= 0 && url.size() > 0)
256  {
257  int elementsNeeded = clientGroup + 1 - ((int)xmlrpc_client_lists_.size());
258  for (int idx = 0; idx < elementsNeeded; ++idx)
259  {
260  std::vector<std::string> tmpVec;
261  xmlrpc_client_lists_.push_back(tmpVec);
262  }
263  xmlrpc_client_lists_[clientGroup].push_back(url);
264  }
265  }
266  }
267  double fileSizeMB = agg_pset.get<double>("subrun_size_MB", 0);
268  file_close_threshold_bytes_ = ((size_t)fileSizeMB * 1024.0 * 1024.0);
269  file_close_timeout_secs_ = agg_pset.get<time_t>("subrun_duration", 0);
270  file_close_event_count_ = agg_pset.get<size_t>("subrun_event_count", 0);
271 
272  inrun_recv_timeout_usec_ = agg_pset.get<size_t>("inrun_recv_timeout_usec", 100000);
273  endrun_recv_timeout_usec_ = agg_pset.get<size_t>("endrun_recv_timeout_usec", 20000000);
274  pause_recv_timeout_usec_ = agg_pset.get<size_t>("pause_recv_timeout_usec", 3000000);
275 
276  onmon_event_prescale_ = agg_pset.get<size_t>("onmon_event_prescale", 1);
277 
278  filesize_check_interval_seconds_ = agg_pset.get<int32_t>("filesize_check_interval_seconds", 20);
279  filesize_check_interval_events_ = agg_pset.get<int32_t>("filesize_check_interval_events", 20);
280 
281  // fetch the monitoring parameters and create the MonitoredQuantity instances
282  stats_helper_.createCollectors(agg_pset, 50, 20.0, 60.0, INPUT_EVENTS_STAT_KEY);
283 
284  // initialize the MetricManager and the names of our metrics
285  fhicl::ParameterSet metric_pset;
286 
287  try
288  {
289  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
290  }
291  catch (...) {} // OK if there's no metrics table defined in the FHiCL
292 
293  if (metric_pset.is_empty())
294  {
295  TLOG_INFO(name_) << "No metric plugins appear to be defined" << TLOG_ENDL;
296  }
297  try
298  {
299  metricMan_.initialize(metric_pset, metricsReportingInstanceName);
300  }
301  catch (...)
302  {
303  ExceptionHandler(ExceptionHandlerRethrow::no,
304  "Error loading metrics in AggregatorCore::initialize()");
305  }
306 
307  if (event_store_ptr_ == nullptr)
308  {
309  artdaq::EventStore::ART_CFGSTRING_FCN* reader = &artapp_string_config;
310  size_t desired_events_per_bunch = expected_events_per_bunch_;
311  if (is_online_monitor_ || is_dispatcher_)
312  {
313  desired_events_per_bunch = 1;
314  }
315  TRACE(36, "Creating EventStore and Starting art thread");
316  event_store_ptr_.reset(new artdaq::EventStore(agg_pset, desired_events_per_bunch, 1,
317  init_string_, reader));
318  TRACE(36, "Done Creating EventStore");
319  event_store_ptr_->setSeqIDModulus(desired_events_per_bunch);
320  fhicl::ParameterSet tmp = pset;
321  tmp.erase("daq");
322  previous_pset_ = tmp;
323  }
324  else
325  {
326  fhicl::ParameterSet tmp = pset;
327  tmp.erase("daq");
328  if (tmp != previous_pset_)
329  {
330  TLOG_ERROR(name_)
331  << "The art configuration can not be altered after art "
332  << "has been configured." << TLOG_ENDL;
333  return false;
334  }
335  }
336 
337  return true;
338 }
339 
340 bool artdaq::AggregatorCore::start(art::RunID id)
341 {
342  event_count_in_run_ = 0;
343  event_count_in_subrun_ = 0;
344  subrun_start_time_ = time(0);
345  stats_helper_.resetStatistics();
346  previous_run_duration_ = -1.0;
347 
348  stop_requested_.store(false);
349  local_pause_requested_.store(false);
350  run_id_ = id;
351  metricMan_.do_start();
352  event_store_ptr_->startRun(run_id_.run());
353 
354  logMessage_("Started run " + boost::lexical_cast<std::string>(run_id_.run()));
355  return true;
356 }
357 
// NOTE(review): the signature line for this body is missing from the listing;
// judging by the logged text this is presumably AggregatorCore::stop() -
// confirm against the original file.
// Logs the run number and the number of events received so far, raises the
// stop request flag, and returns true.
359 {
360  logMessage_("Stopping run " + boost::lexical_cast<std::string>(run_id_.run()) +
361  ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
362  " events received so far.");
363 
364  /* Nothing else to do here. The aggregator will clean up after itself once
365  it has received all of the EOD fragments it expects. Higher level code will
366  block until the process_fragments() thread exits. */
367  stop_requested_.store(true);
368  return true;
369 }
370 
// NOTE(review): the signature line for this body is missing from the listing;
// judging by the logged text this is presumably AggregatorCore::pause() -
// confirm against the original file.
// Logs the run number and the number of events received so far, raises the
// local pause request flag, and returns true.
372 {
373  logMessage_("Pausing run " + boost::lexical_cast<std::string>(run_id_.run()) +
374  ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
375  " events received so far.");
376 
377  /* Nothing else to do here besides raising the pause flag. NOTE(review):
378  the original wording of this comment was identical to the one in the
379  stop handler; confirm how the processing thread reacts to a pause. */
380  local_pause_requested_.store(true);
381  return true;
382 }
383 
// NOTE(review): the signature line for this body is missing from the listing;
// judging by the logged text this is presumably AggregatorCore::resume() -
// confirm against the original file.
// Starts a new sub-run after a pause: the sub-run event counter and start
// time are reset, the local pause request is cleared, the metric manager is
// started again and the EventStore is told to start a sub-run. Returns true.
385 {
386  event_count_in_subrun_ = 0;
387  subrun_start_time_ = time(0);
388  local_pause_requested_.store(false);
389 
390  logMessage_("Resuming run " + boost::lexical_cast<std::string>(run_id_.run()));
391  metricMan_.do_start();
392  event_store_ptr_->startSubrun();
393  return true;
394 }
395 
// NOTE(review): the signature line for this body is missing from the listing;
// presumably this is AggregatorCore::shutdown() - confirm against the
// original file.
// Calls EventStore::endOfData(), repeating the call until it succeeds or it
// has been made three times in total, then shuts down the metric manager.
// Returns the result of the last endOfData() call.
397 {
398  int readerReturnValue;
399  bool endSucceeded = false;
// attemptsToEnd counts endOfData() calls, including the first one below.
400  int attemptsToEnd = 1;
401  endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
402  while (!endSucceeded && attemptsToEnd < 3)
403  {
404  ++attemptsToEnd;
405  TLOG_DEBUG(name_) << "Retrying EventStore::endOfData()" << TLOG_ENDL;
406  endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
407  }
// The metric manager is shut down whether or not endOfData() succeeded.
408  metricMan_.shutdown();
409 
410  return endSucceeded;
411 }
412 
413 bool artdaq::AggregatorCore::soft_initialize(fhicl::ParameterSet const& pset)
414 {
415  TLOG_DEBUG(name_) << "soft_initialize method called with DAQ "
416  << "ParameterSet = \"" << pset.to_string()
417  << "\"." << TLOG_ENDL;
418  return true;
419 }
420 
421 bool artdaq::AggregatorCore::reinitialize(fhicl::ParameterSet const& pset)
422 {
423  TLOG_DEBUG(name_) << "reinitialize method called with DAQ "
424  << "ParameterSet = \"" << pset.to_string()
425  << "\"." << TLOG_ENDL;
426  return true;
427 }
428 
430 {
431  processing_fragments_.store(true);
432 
433  size_t eodFragmentsReceived = 0;
434  bool process_fragments = true;
435  int senderSlot;
436  detail::FragCounter fragments_received;
437  detail::FragCounter fragments_sent;
438  artdaq::FragmentPtr endSubRunMsg(nullptr);
439  time_t last_filesize_check_time = subrun_start_time_;
440 
441  // if (is_data_logger_)
442  if (true)
443  {
444  receiver_ptr_.reset(new artdaq::DataReceiverManager(data_pset_));
445  receiver_ptr_->start_threads();
446  }
447 
448  if (is_data_logger_ && data_pset_.has_key("destinations"))
449  {
450  sender_ptr_.reset(new artdaq::DataSenderManager(data_pset_));
451 
452  if (sender_ptr_->destinationCount() == 0) {
453  sender_ptr_.reset(nullptr);
454  }
455  }
456 
457  TLOG_DEBUG(name_) << "Waiting for first fragment." << TLOG_ENDL;
458 
459  artdaq::MonitoredQuantityStats::TIME_POINT_T startTime;
460  while (process_fragments)
461  {
462  artdaq::FragmentPtr fragmentPtr(new artdaq::Fragment);
463 
464  size_t recvTimeout = inrun_recv_timeout_usec_;
465  if (stop_requested_.load()) { recvTimeout = endrun_recv_timeout_usec_; }
466  else if (local_pause_requested_.load()) { recvTimeout = pause_recv_timeout_usec_; }
467 
468  startTime = artdaq::MonitoredQuantity::getCurrentTime();
469 
470  //Removed if statement on different Aggregator types as they all go through DataReceiverManager now
471  fragmentPtr = receiver_ptr_->recvFragment(senderSlot, recvTimeout);
472 
473  stats_helper_.addSample(INPUT_WAIT_STAT_KEY,
474  (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
475  /*if (senderSlot == MPI_ANY_SOURCE) // Use RECV_TIMEOUT now to indicate that no senders have sent anything
476  {
477  if (endSubRunMsg != nullptr)
478  {
479  TLOG_INFO(name_)
480  << "There appears to be no more data to receive - ending the run." << TLOG_ENDL;
481  event_store_ptr_->flushData();
482  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
483  subRunEvent->insertFragment(std::move(endSubRunMsg));
484 
485  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
486 
487  if (!enqStatus)
488  {
489  TLOG_ERROR(name_) << "Attempt to send EndOfSubRun fragment to art timed out after " <<
490  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
491  }
492  }
493  else
494  {
495  TLOG_ERROR(name_)
496  << "There appears to be no more data to receive, but the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
497  }
498 
499  process_fragments = false;
500  continue;
501  }*/
502  if (senderSlot == artdaq::TransferInterface::RECV_TIMEOUT)
503  {
504  if (stop_requested_.load() &&
505  recvTimeout == endrun_recv_timeout_usec_)
506  {
507  if (endSubRunMsg != nullptr)
508  {
509  TLOG_WARNING(name_)
510  << "Timeout occurred in attempt to receive data, but as a stop has been requested, will forcibly end the run." << TLOG_ENDL;
511  event_store_ptr_->flushData();
512  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
513  subRunEvent->insertFragment(std::move(endSubRunMsg));
514 
515  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
516  if (!enqStatus)
517  {
518  TLOG_ERROR(name_) << "Attempt to send EndOfSubRun fragment to art timed out after " <<
519  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
520  }
521  }
522  else
523  {
524  if (event_count_in_subrun_ > 0)
525  {
526  TLOG_ERROR(name_)
527  << "Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
528  }
529  else
530  {
531  std::string msg("Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art;");
532  msg.append("DAQ may need to be returned to the \"Stopped\" state before further datataking");
533  logMessage_(msg);
534  }
535  }
536  process_fragments = false;
537  }
538  else if (local_pause_requested_.load() &&
539  recvTimeout == pause_recv_timeout_usec_)
540  {
541  if (endSubRunMsg != nullptr)
542  {
543  TLOG_WARNING(name_)
544  << "Timeout occurred in attempt to receive data, but as a pause has been requested, will forcibly pause the run." << TLOG_ENDL;
545  event_store_ptr_->flushData();
546  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
547  subRunEvent->insertFragment(std::move(endSubRunMsg));
548 
549  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
550  if (!enqStatus)
551  {
552  TLOG_ERROR(name_) << "Attempt to send EndOfSubRun fragment to art timed out after " <<
553  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
554  }
555  }
556  else
557  {
558  TLOG_ERROR(name_) <<
559  "Timeout receiving data after pause request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
560  }
561  process_fragments = false;
562  }
563 
564  continue;
565  }
566  else if (!fragmentPtr)
567  {
568  TLOG_ERROR(name_) << "Received invalid fragment from " << senderSlot << ". This is usually the case when a timeout has occurred, but sender was not set to RECV_TIMEOUT as expected." << TLOG_ENDL;
569  continue;
570  }
571  if (!receiver_ptr_->enabled_sources().count(senderSlot))
572  {
573  TLOG_ERROR(name_)
574  << "Invalid senderSlot received from recvFragment: "
575  << senderSlot << TLOG_ENDL;
576  continue;
577  }
578  fragments_received.incSlot(senderSlot);
579  if (artdaq::Fragment::isSystemFragmentType(fragmentPtr->type()) &&
580  fragmentPtr->type() != artdaq::Fragment::DataFragmentType)
581  {
582  TLOG_DEBUG(name_)
583  << "Sender slot = " << senderSlot
584  << ", fragment type = " << static_cast<int>(fragmentPtr->type())
585  << ", sequence ID = " << fragmentPtr->sequenceID() << TLOG_ENDL;
586  }
587 
588  // 11-Sep-2013, KAB - protect against invalid fragments
589  if (fragmentPtr->type() == artdaq::Fragment::InvalidFragmentType)
590  {
591  size_t fragSize = fragmentPtr->size() * sizeof(artdaq::RawDataType);
592  TLOG_ERROR(name_) << "Fragment received with type of "
593  << "INVALID. Size = " << fragSize
594  << ", sequence ID = " << fragmentPtr->sequenceID()
595  << ", fragment ID = " << fragmentPtr->fragmentID()
596  << ", and type = " << static_cast<int>(fragmentPtr->type()) << TLOG_ENDL;
597  continue;
598  }
599 
600  if (artdaq::Fragment::isUserFragmentType(fragmentPtr->type()) ||
601  fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
602  {
603  ++event_count_in_run_;
604  ++event_count_in_subrun_;
605  if (event_count_in_run_ == 1)
606  {
607  logMessage_("Received event " +
608  boost::lexical_cast<std::string>(event_count_in_run_) +
609  " with sequence id " +
610  boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
611  ".");
612  }
613  stats_helper_.addSample(INPUT_EVENTS_STAT_KEY, fragmentPtr->size());
614  if (stats_helper_.readyToReport(event_count_in_run_))
615  {
616  std::string statString = buildStatisticsString_();
617  logMessage_(statString);
618  logMessage_("Received event " +
619  boost::lexical_cast<std::string>(event_count_in_run_) +
620  " with sequence id " +
621  boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
622  " (run " +
623  boost::lexical_cast<std::string>(run_id_.run()) +
624  ", subrun " +
625  boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
626  ").");
627  }
628  }
629  if (stats_helper_.statsRollingWindowHasMoved()) { sendMetrics_(); }
630 
631  startTime = artdaq::MonitoredQuantity::getCurrentTime();
632 
633  if (is_data_logger_ && fragmentPtr->type() == artdaq::Fragment::DataFragmentType
634  && (event_count_in_run_ % onmon_event_prescale_) == 0 && sender_ptr_)
635  {
636  try
637  {
638  auto fragCopy = *fragmentPtr;
639  sender_ptr_->sendFragment(std::move(fragCopy));
640  }
641  catch (...)
642  {
643  ExceptionHandler(ExceptionHandlerRethrow::no,
644  "Exception thrown during data logger copy of event to dispatcher");
645  }
646  }
647  else if (is_dispatcher_)
648  {
649  if (fragmentPtr->type() != artdaq::Fragment::EndOfDataFragmentType)
650  {
651  if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
652  {
653  init_fragment_ptr_ = std::make_unique<artdaq::Fragment>(*fragmentPtr);
654  }
655 
656  std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
657 
658  if (new_transfers_ == 0)
659  {
660  // So as to not flood log files/viewers with messages...
661  if (dispatcher_transfers_.size() > 0 && fragmentPtr->sequenceID() % 100 == 0)
662  {
663  TLOG_DEBUG(name_) << "Dispatcher: broadcasting seqID = " << fragmentPtr->sequenceID() << ", type = " <<
664  static_cast<size_t>(fragmentPtr->type()) << " to " << dispatcher_transfers_.size()
665  << " registered monitors" << TLOG_ENDL;
666  }
667  for (auto& transfer : dispatcher_transfers_)
668  {
669  transfer->copyFragment(*fragmentPtr, 0);
670  }
671  }
672  else
673  {
674  for (size_t i_q = dispatcher_transfers_.size() - new_transfers_; i_q < dispatcher_transfers_.size(); ++i_q)
675  {
676  TLOG_INFO(name_) << "Copying out init fragment, type " << static_cast<int>(init_fragment_ptr_->type()) <<
677  ", size " << init_fragment_ptr_->sizeBytes() << TLOG_ENDL;
678  dispatcher_transfers_[i_q]->copyFragment(*init_fragment_ptr_, 500000);
679  }
680  new_transfers_ = 0;
681  }
682  }
683  }
684 
685  stats_helper_.addSample(SHM_COPY_TIME_STAT_KEY,
686  (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
687 
688  //----------------------------------------------------------------------------
689 
690  artdaq::Fragment::sequence_id_t seq = fragmentPtr->sequenceID();
691  TRACE(21, "%s::process_fragments seq=%lu isLogger=%d type=%d"
692  , name_.c_str(), seq, is_data_logger_, fragmentPtr->type());
693  startTime = artdaq::MonitoredQuantity::getCurrentTime();
694  if (!art_initialized_)
695  {
696  /* The init fragment should always be the first fragment out of the
697  EventBuilder. */
698  if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
699  {
700  TLOG_DEBUG(name_) << "Init" << TLOG_ENDL;
701 
702  if (is_data_logger_ && sender_ptr_)
703  {
704  auto fragCopy = *fragmentPtr;
705  sender_ptr_->sendFragment(std::move(fragCopy));
706 
707  }
708 
709  artdaq::RawEvent_ptr initEvent(new artdaq::RawEvent(run_id_.run(), 1, fragmentPtr->sequenceID()));
710  initEvent->insertFragment(std::move(fragmentPtr));
711 
712  bool enqStatus = event_queue_.enqTimedWait(initEvent, enq_timeout_);
713 
714  if (!enqStatus)
715  {
716  TLOG_ERROR(name_) << "Attempt to send Init event to art timed out after " <<
717  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
718  }
719  art_initialized_ = true;
720  }
721  else
722  {
723  TLOG_ERROR(name_) << "Didn't receive an Init event with which to initialize art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
724  }
725  }
726  else
727  {
728  /* Note that in the currently implementation of the NetMon output/input
729  modules there are no EndOfRun or Shutdown fragments. */
730  if (fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
731  {
732  if (is_data_logger_)
733  {
734  artdaq::FragmentPtr rejectedFragment;
735  auto seqId = fragmentPtr->sequenceID();
736  bool try_again = true;
737  while (try_again)
738  {
739  auto ret = event_store_ptr_->insert(std::move(fragmentPtr), rejectedFragment);
741  {
742  receiver_ptr_->unsuppressAll();
743  try_again = false;
744  }
746  {
747  try_again = false;
748  }
749  else if (stop_requested_.load())
750  {
751  try_again = false;
752  process_fragments = false;
753  receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
754  TLOG_WARNING(name_)
755  << "Unable to process event " << seqId
756  << " because of back-pressure - forcibly ending the run." << TLOG_ENDL;
757  }
758  else if (local_pause_requested_.load())
759  {
760  try_again = false;
761  process_fragments = false;
762  receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
763  TLOG_WARNING(name_)
764  << "Unable to process event " << seqId
765  << " because of back-pressure - forcibly pausing the run." << TLOG_ENDL;
766  }
768  {
769  fragmentPtr = std::move(rejectedFragment);
770  TLOG_WARNING(name_)
771  << "Unable to process event " << seqId
772  << " because of back-pressure from art - retrying..." << TLOG_ENDL;
773  }
774  else
775  {
776  try_again = false;
777  receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
778  TLOG_WARNING(name_)
779  << "Unable to process event " << seqId
780  << " because the EventStore has reached the maximum number of incomplete bunches." << std::endl
781  << " Will retry when the EventStore is ready for new events." << TLOG_ENDL;
782  }
783  }
784  }
785  else
786  {
787  event_store_ptr_->insert(std::move(fragmentPtr), false);
788  }
789  }
790  else if (fragmentPtr->type() == artdaq::Fragment::EndOfSubrunFragmentType)
791  {
792  if (is_data_logger_ && sender_ptr_)
793  {
794  auto fragCopy = *fragmentPtr;
795  sender_ptr_->sendFragment(std::move(fragCopy));
796  }
797  else if (is_dispatcher_)
798  {
799  for (auto& transfer : dispatcher_transfers_)
800  {
801  transfer->copyFragment(*fragmentPtr, 0);
802  }
803  }
804 
805  /* We inject the EndSubrun fragment after all other data has been
806  received. The SHandles and RHandles classes do not guarantee that
807  data will be received in the same order it is sent. We'll hold on to
808  this fragment and inject it once we've received all EOD fragments. */
809  endSubRunMsg = std::move(fragmentPtr);
810  }
811  else if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
812  {
813  eodFragmentsReceived++;
814  /* We count the EOD fragment as a fragment received but the SHandles class
815  does not count it as a fragment sent which means we need to add one to
816  the total expected fragments. */
817  fragments_sent.setSlot(senderSlot, *fragmentPtr->dataBegin() + 1);
818  }
819  }
820  float delta = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
821  stats_helper_.addSample(STORE_EVENT_WAIT_STAT_KEY, delta);
822  TRACE((delta > 3.0) ? 0 : 22, "%s::process_fragments seq=%lu isLogger=%d delta=%f start=%f"
823  , name_.c_str(), seq, is_data_logger_, delta, startTime);
824 
825  // 27-Sep-2013, KAB - added automatic file closing
826  startTime = artdaq::MonitoredQuantity::getCurrentTime();
827  if (is_data_logger_ && disk_writing_directory_.size() > 0 &&
828  !stop_requested_.load() && !system_pause_requested_.load())
829  {
830  bool threshold_reached = false;
831  if (file_close_event_count_ > 0 &&
832  event_count_in_subrun_ >= file_close_event_count_)
833  {
834  threshold_reached = true;
835  }
836  else
837  {
838  time_t now = time(0);
839  if (file_close_timeout_secs_ > 0 &&
840  (now - subrun_start_time_) >= file_close_timeout_secs_)
841  {
842  threshold_reached = true;
843  }
844  else
845  {
846  if (filesize_check_interval_seconds_ > 0 &&
847  filesize_check_interval_events_ > 0 &&
848  (now - last_filesize_check_time) >= filesize_check_interval_seconds_ &&
849  (event_count_in_run_ % filesize_check_interval_events_) == 0)
850  {
851  if (file_close_threshold_bytes_ > 0 &&
852  getLatestFileSize_() >= file_close_threshold_bytes_)
853  {
854  threshold_reached = true;
855  }
856  last_filesize_check_time = now;
857  }
858  }
859  }
860  if (threshold_reached)
861  {
862  system_pause_requested_.store(true);
863  if (pause_thread_.get() != 0)
864  {
865  pause_thread_->join();
866  }
867  TLOG_DEBUG(name_) << "Starting sendPauseAndResume thread "
868  << ", event count in subrun = "
869  << event_count_in_subrun_ << TLOG_ENDL;
870  pause_thread_.reset(new std::thread(&AggregatorCore::sendPauseAndResume_, this));
871  }
872  }
873  stats_helper_.addSample(FILE_CHECK_TIME_STAT_KEY,
874  (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
875 
876  /* If we've received EOD fragments from all of the EventBuilders we can
877  verify that we've also received every fragment that they have sent. If
878  all fragments are accounted for we can flush the EventStoreand exit out
879  of this thread.*/
880 
881  size_t source_count = 0;
882  if (is_data_logger_) source_count = receiver_ptr_->enabled_sources().size();
883  else source_count = 1;
884 
885  if (eodFragmentsReceived >= source_count && endSubRunMsg != nullptr)
886  {
887  bool fragmentsOutstanding = false;
888  if (is_data_logger_)
889  {
890  for (auto& i : receiver_ptr_->enabled_sources())
891  {
892  if (fragments_received[i] != fragments_sent[i])
893  {
894  fragmentsOutstanding = true;
895  break;
896  }
897  }
898  }
899 
900  if (!fragmentsOutstanding)
901  {
902  event_store_ptr_->flushData();
903  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
904  subRunEvent->insertFragment(std::move(endSubRunMsg));
905 
906  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
907 
908  if (!enqStatus)
909  {
910  TLOG_ERROR(name_) << "All data appears to have been received but attempt to send EndOfSubRun fragment to art timed out after " <<
911  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
912  }
913  process_fragments = false;
914  }
915  else
916  {
917  TLOG_WARNING(name_) << "EndOfSubRun fragment and all EndOfData fragments received but more data expected" << TLOG_ENDL;
918  }
919  }
920  }
921 
922  logMessage_("Subrun " +
923  boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
924  " in run " + boost::lexical_cast<std::string>(run_id_.run()) +
925  " has ended. There were " +
926  boost::lexical_cast<std::string>(event_count_in_subrun_) +
927  " events in this subrun, and there have been " +
928  boost::lexical_cast<std::string>(event_count_in_run_) +
929  " events so far in this run.");
930 
931  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
932  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
933  if (mqPtr.get() != 0)
934  {
935  artdaq::MonitoredQuantityStats stats;
936  mqPtr->getStats(stats);
937  std::ostringstream oss;
938  oss << "Run " << run_id_.run() << " has an overall event rate of ";
939  oss << std::fixed << std::setprecision(1) << stats.fullSampleRate;
940  oss << " events/sec.";
941  logMessage_(oss.str());
942  previous_run_duration_ = stats.fullDuration;
943  }
944 
945  // 11-May-2015, KAB: call MetricManager::do_stop whenever we exit the
946  // processing fragments loop so that metrics correctly go to zero when
947  // there is no data flowing
948  metricMan_.do_stop();
949 
950  receiver_ptr_.reset(nullptr);
951  sender_ptr_.reset(nullptr);
952 
953  processing_fragments_.store(false);
954  return 0;
955 }
956 
957 std::string artdaq::AggregatorCore::report(std::string const& which) const
958 {
959  if (which == "event_count")
960  {
961  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
962  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
963  if (mqPtr.get() != 0)
964  {
965  return boost::lexical_cast<std::string>(mqPtr->getFullSampleCount());
966  }
967  else
968  {
969  return "-1";
970  }
971  }
972 
973  if (which == "run_duration")
974  {
975  // 17-Jan-2014, KAB: if we are not processing fragments, return
976  // the previous run duration
977  double duration = previous_run_duration_;
978  if (processing_fragments_.load())
979  {
980  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
981  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
982  if (mqPtr.get() != 0)
983  {
984  duration = mqPtr->getFullDuration();
985  }
986  }
987  std::ostringstream oss;
988  oss << std::fixed << std::setprecision(1) << duration;
989  return oss.str();
990  }
991 
992  if (which == "file_size")
993  {
994  size_t latestFileSize = getLatestFileSize_();
995  return boost::lexical_cast<std::string>(latestFileSize);
996  }
997 
998  if (which == "subrun_number")
999  {
1000  if (event_store_ptr_.get() != nullptr)
1001  {
1002  return boost::lexical_cast<std::string>(event_store_ptr_->subrunID());
1003  }
1004  else
1005  {
1006  return "-1";
1007  }
1008  }
1009 
1010  if (which == "incomplete_event_count")
1011  {
1012  if (event_store_ptr_ != nullptr)
1013  {
1014  return boost::lexical_cast<std::string>(event_store_ptr_->incompleteEventCount());
1015  }
1016  else
1017  {
1018  return "-1";
1019  }
1020  }
1021 
1022  // lots of cool stuff that we can do here
1023  // - report on the number of fragments received and the number
1024  // of events built (in the current or previous run
1025  // - report on the number of incomplete events in the EventStore
1026  // (if running)
1027  std::string tmpString = name_ + " run number = ";
1028  tmpString.append(boost::lexical_cast<std::string>(run_id_.run()));
1029  tmpString.append(". Command=\"" + which + "\" is not currently supported.");
1030  return tmpString;
1031 }
1032 
1033 std::string artdaq::AggregatorCore::register_monitor(fhicl::ParameterSet const& pset)
1034 {
1035  TLOG_DEBUG(name_) << "AggregatorCore::register_monitor called with argument \"" << pset.to_string() << "\"" << TLOG_ENDL;
1036  std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1037 
1038  try
1039  {
1040  auto transfer = MakeTransferPlugin(pset, "transfer_plugin", TransferInterface::Role::kSend);
1041 
1042  for (auto& existing_transfer_ : dispatcher_transfers_)
1043  {
1044  if (existing_transfer_->uniqueLabel() == transfer->uniqueLabel())
1045  {
1046  std::stringstream errmsg;
1047  errmsg << "Attempt to register newly-created monitor with label \"" <<
1048  transfer->uniqueLabel() << "\" failed; a monitor with that label already exists";
1049  return errmsg.str();
1050  }
1051  }
1052 
1053  dispatcher_transfers_.emplace_back(std::move(transfer));
1054 
1055  TLOG_INFO(name_) << "Successfully registered monitor with label \"" << dispatcher_transfers_.back()->uniqueLabel() << "\"" << TLOG_ENDL;
1056 
1057  new_transfers_++;
1058  }
1059  catch (...)
1060  {
1061  std::stringstream errmsg;
1062  errmsg << "Unable to create a Transfer plugin with the FHiCL code \"" << pset.to_string() << "\", a new monitor has not been registered";
1063  return errmsg.str();
1064  }
1065 
1066  return "Success";
1067 }
1068 
1069 std::string artdaq::AggregatorCore::unregister_monitor(std::string const& label)
1070 {
1071  TLOG_DEBUG(name_) << "AggregatorCore::unregister_monitor called with argument \"" << label << "\"" << TLOG_ENDL;
1072  std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1073 
1074  try
1075  {
1076  auto r_i_end = std::remove_if(dispatcher_transfers_.begin(),
1077  dispatcher_transfers_.end(),
1078  [label](const std::unique_ptr<TransferInterface>& transfer)
1079  {
1080  return transfer->uniqueLabel() == label;
1081  });
1082 
1083  auto nfound = dispatcher_transfers_.end() - r_i_end;
1084 
1085  TLOG_INFO(name_) << "Request from monitor with label \"" << label << "\" to unregister received" << TLOG_ENDL;
1086 
1087  if (nfound == 1)
1088  {
1089  dispatcher_transfers_.pop_back();
1090  return "Success";
1091  }
1092  else if (nfound == 0)
1093  {
1094  std::stringstream errmsg;
1095  errmsg << "Warning in AggregatorCore::unregister_monitor: unable to find requested transfer plugin with "
1096  << "label \"" << label << "\"";
1097  TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1098  return errmsg.str();
1099  }
1100  else
1101  {
1102  std::stringstream errmsg;
1103  errmsg << "Warning in AggregatorCore::unregister_monitor: found more than one (" << nfound <<
1104  ") transfer plugins with label \"" << label << "\", will unregister all of them";
1105  TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1106  dispatcher_transfers_.erase(r_i_end, dispatcher_transfers_.end());
1107  return errmsg.str();
1108  }
1109  }
1110  catch (...)
1111  {
1112  std::stringstream errmsg;
1113  errmsg << "Unable to unregister transfer plugin with label \"" << label << "\"";
1114  return errmsg.str();
1115  }
1116 
1117  return "Success";
1118 }
1119 
1120 
1121 size_t artdaq::AggregatorCore::getLatestFileSize_() const
1122 {
1123  if (disk_writing_directory_.size() == 0)
1124  {
1125  TLOG_DEBUG(name_) << "Latest file size = 0 (no directory)" << TLOG_ENDL;
1126  return 0;
1127  }
1128  BFS::path outputDir(disk_writing_directory_);
1129  BFS::directory_iterator endIter;
1130 
1131  std::time_t latestFileTime = 0;
1132  size_t latestFileSize = 0;
1133  if (BFS::exists(outputDir) && BFS::is_directory(outputDir))
1134  {
1135  for (BFS::directory_iterator dirIter(outputDir); dirIter != endIter; ++dirIter)
1136  {
1137  BFS::path pathObj = dirIter->path();
1138  if (pathObj.filename().string().find("RootOutput") != std::string::npos &&
1139  pathObj.filename().string().find("root") != std::string::npos)
1140  {
1141  if (BFS::last_write_time(pathObj) >= latestFileTime)
1142  {
1143  latestFileTime = BFS::last_write_time(pathObj);
1144  latestFileSize = BFS::file_size(pathObj);
1145  }
1146  }
1147  }
1148  }
1149  time_t now = time(0);
1150  if ((now - latestFileTime) < 60)
1151  {
1152  TLOG_DEBUG(name_) << "Latest file size = "
1153  << latestFileSize << TLOG_ENDL;
1154  return latestFileSize;
1155  }
1156  else
1157  {
1158  TLOG_DEBUG(name_) << "Latest file size = 0 (too old)" << TLOG_ENDL;
1159  return 0;
1160  }
1161 }
1162 
1163 bool artdaq::AggregatorCore::sendPauseAndResume_()
1164 {
1165  xmlrpc_c::clientSimple myClient;
1166  TLOG_INFO(name_) << "Starting automatic pause..." << TLOG_ENDL;
1167  for (size_t igrp = 0; igrp < xmlrpc_client_lists_.size(); ++igrp)
1168  {
1169  for (size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
1170  {
1171  for (size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1172  {
1173  xmlrpc_c::value result;
1174  myClient.call((xmlrpc_client_lists_[igrp])[idx], "daq.pause", &result);
1175  std::string const resultString = xmlrpc_c::value_string(result);
1176  TLOG_DEBUG(name_) << "Pause: "
1177  << (xmlrpc_client_lists_[igrp])[idx]
1178  << " " << resultString << TLOG_ENDL;
1179  if (std::string::npos !=
1180  boost::algorithm::to_lower_copy(resultString).find("success"))
1181  {
1182  break;
1183  }
1184  else
1185  {
1186  sleep(2);
1187  TLOG_WARNING(name_) << "Retrying pause command to "
1188  << (xmlrpc_client_lists_[igrp])[idx]
1189  << " (" << resultString << ")" << TLOG_ENDL;
1190  }
1191  }
1192  }
1193  }
1194  TLOG_INFO(name_) << "Starting automatic resume..." << TLOG_ENDL;
1195  for (int igrp = (xmlrpc_client_lists_.size() - 1); igrp >= 0; --igrp)
1196  {
1197  for (size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
1198  {
1199  for (size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1200  {
1201  xmlrpc_c::value result;
1202  myClient.call((xmlrpc_client_lists_[igrp])[idx], "daq.resume", &result);
1203  std::string const resultString = xmlrpc_c::value_string(result);
1204  TLOG_DEBUG(name_) << "Resume: "
1205  << (xmlrpc_client_lists_[igrp])[idx]
1206  << " " << resultString << TLOG_ENDL;
1207  if (std::string::npos !=
1208  boost::algorithm::to_lower_copy(resultString).find("success"))
1209  {
1210  break;
1211  }
1212  else
1213  {
1214  sleep(2);
1215  TLOG_WARNING(name_) << "Retrying resume command to "
1216  << (xmlrpc_client_lists_[igrp])[idx]
1217  << " (" << resultString << ")" << TLOG_ENDL;
1218  }
1219  }
1220  }
1221  }
1222  TLOG_INFO(name_) << "Done with automatic resume..." << TLOG_ENDL;
1223  system_pause_requested_.store(false);
1224  return true;
1225 }
1226 
1227 void artdaq::AggregatorCore::logMessage_(std::string const& text)
1228 {
1229  if (is_data_logger_)
1230  {
1231  TLOG_INFO(name_) << text << TLOG_ENDL;
1232  }
1233  else
1234  {
1235  TLOG_DEBUG(name_) << text << TLOG_ENDL;
1236  }
1237 }
1238 
1239 std::string artdaq::AggregatorCore::buildStatisticsString_()
1240 {
1241  std::ostringstream oss;
1242  double eventCount = 1.0;
1243  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1244  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1245  if (mqPtr.get() != 0)
1246  {
1247  //mqPtr->waitUntilAccumulatorsHaveBeenFlushed(3.0);
1248  artdaq::MonitoredQuantityStats stats;
1249  mqPtr->getStats(stats);
1250  oss << "Input statistics: "
1251  << stats.recentSampleCount << " events received at "
1252  << stats.recentSampleRate << " events/sec, data rate = "
1253  << (stats.recentValueRate * sizeof(artdaq::RawDataType)
1254  / 1024.0 / 1024.0) << " MB/sec, monitor window = "
1255  << stats.recentDuration << " sec, min::max event size = "
1256  << (stats.recentValueMin * sizeof(artdaq::RawDataType)
1257  / 1024.0 / 1024.0)
1258  << "::"
1259  << (stats.recentValueMax * sizeof(artdaq::RawDataType)
1260  / 1024.0 / 1024.0)
1261  << " MB" << std::endl;
1262  eventCount = std::max(double(stats.recentSampleCount), 1.0);
1263  oss << "Average times per event: ";
1264  if (stats.recentSampleRate > 0.0)
1265  {
1266  oss << " elapsed time = "
1267  << (1.0 / stats.recentSampleRate) << " sec";
1268  }
1269  }
1270 
1271  // 13-Jan-2015, KAB - Just a reminder that using "eventCount" in the
1272  // denominator of the calculations below is important so that the sum
1273  // of the different "average" times adds up to the overall average time
1274  // per event. In some (but not all) cases, using recentValueAverage()
1275  // would be equivalent.
1276 
1277  mqPtr = artdaq::StatisticsCollection::getInstance().
1278  getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1279  if (mqPtr.get() != 0)
1280  {
1281  oss << ", input wait time = "
1282  << (mqPtr->getRecentValueSum() / eventCount) << " sec";
1283  }
1284 
1285  mqPtr = artdaq::StatisticsCollection::getInstance().
1286  getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1287  if (mqPtr.get() != 0)
1288  {
1289  artdaq::MonitoredQuantityStats stats;
1290  mqPtr->getStats(stats);
1291  oss << ", avg::max event store wait time = "
1292  << (stats.recentValueSum / eventCount)
1293  << "::" << stats.recentValueMax
1294  << " sec";
1295  }
1296 
1297  mqPtr = artdaq::StatisticsCollection::getInstance().
1298  getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1299  if (mqPtr.get() != 0)
1300  {
1301  oss << ", shared memory copy time = "
1302  << (mqPtr->getRecentValueSum() / eventCount) << " sec";
1303  }
1304 
1305  mqPtr = artdaq::StatisticsCollection::getInstance().
1306  getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1307  if (mqPtr.get() != 0)
1308  {
1309  oss << ", file size test time = "
1310  << (mqPtr->getRecentValueSum() / eventCount) << " sec";
1311  }
1312 
1313  return oss.str();
1314 }
1315 
1316 void artdaq::AggregatorCore::sendMetrics_()
1317 {
1318  //TLOG_DEBUG("AggregatorCore") << "Sending metrics " << TLOG_ENDL;
1319  double eventCount = 1.0;
1320  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1321  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1322  if (mqPtr.get() != 0)
1323  {
1324  artdaq::MonitoredQuantityStats stats;
1325  mqPtr->getStats(stats);
1326  eventCount = std::max(double(stats.recentSampleCount), 1.0);
1327  metricMan_.sendMetric("Event Rate",
1328  stats.recentSampleRate, "events/sec", 1);
1329  metricMan_.sendMetric("Average Event Size",
1330  (stats.recentValueAverage * sizeof(artdaq::RawDataType)
1331  ), "bytes/event", 2);
1332  metricMan_.sendMetric("Data Rate",
1333  (stats.recentValueRate * sizeof(artdaq::RawDataType)
1334  ), "bytes/sec", 2);
1335  }
1336 
1337  // 13-Jan-2015, KAB - Just a reminder that using "eventCount" in the
1338  // denominator of the calculations below is important so that the sum
1339  // of the different "average" times adds up to the overall average time
1340  // per event. In some (but not all) cases, using recentValueAverage()
1341  // would be equivalent.
1342 
1343  mqPtr = artdaq::StatisticsCollection::getInstance().
1344  getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1345  if (mqPtr.get() != 0)
1346  {
1347  metricMan_.sendMetric("Average Input Wait Time",
1348  (mqPtr->getRecentValueSum() / eventCount),
1349  "seconds/event", 3);
1350  }
1351 
1352  mqPtr = artdaq::StatisticsCollection::getInstance().
1353  getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1354  if (mqPtr.get() != 0)
1355  {
1356  metricMan_.sendMetric("Avg art Queue Wait Time",
1357  (mqPtr->getRecentValueSum() / eventCount),
1358  "seconds/event", 3);
1359  }
1360 
1361  mqPtr = artdaq::StatisticsCollection::getInstance().
1362  getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1363  if (mqPtr.get() != 0)
1364  {
1365  metricMan_.sendMetric("Avg Shared Memory Copy Time",
1366  (mqPtr->getRecentValueSum() / eventCount),
1367  "seconds/event", 4);
1368  }
1369 
1370  mqPtr = artdaq::StatisticsCollection::getInstance().
1371  getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1372  if (mqPtr.get() != 0)
1373  {
1374  metricMan_.sendMetric("Average File Check Time",
1375  (mqPtr->getRecentValueSum() / eventCount),
1376  "seconds/event", 4);
1377  }
1378 }
static const std::string INPUT_EVENTS_STAT_KEY
Key for the Input Events MonitoredQuantity.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
static const std::string SHM_COPY_TIME_STAT_KEY
Key for the Shared Memory Copy Time MonitoredQuantity.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured, otherwise will Round-Robin Fragments to the destinations.
bool reinitialize(fhicl::ParameterSet const &pset)
Reinitializes the AggregatorCore. No-Op.
bool stop()
Stops the AggregatorCore.
bool shutdown()
Shuts Down the AggregatorCore.
Keep track of the count of Fragments received from a set of sources.
Definition: FragCounter.hh:20
bool start(art::RunID id)
Start the AggregatorCore.
The Fragment was successfully inserted.
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::string unregister_monitor(std::string const &label)
Delete the TransferInterface having the given unique label.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
AggregatorCore(int rank, std::string name)
AggregatorCore Constructor.
static const std::string INPUT_WAIT_STAT_KEY
Key for the Input Wait MonitoredQuantity.
bool soft_initialize(fhicl::ParameterSet const &pset)
Soft-Initializes the AggregatorCore. No-Op.
bool resume()
Resumes the AggregatorCore.
std::string register_monitor(fhicl::ParameterSet const &pset)
Create a new TransferInterface instance using the given configuration.
static const std::string FILE_CHECK_TIME_STAT_KEY
Key for the File Check Time MonitoredQuantity.
void setSlot(size_t slot, size_t val)
Set the given slot to the given value.
Definition: FragCounter.hh:110
void display_bits(void *memstart, size_t nbytes, std::string sourcename)
Write out memory, for debugging purposes.
The EventStore class collects Fragment objects, until it receives a complete event, at which point the event is handed over to the art thread.
Definition: EventStore.hh:49
This TransferInterface is a Sender.
void incSlot(size_t slot)
Increment the given slot by one.
Definition: FragCounter.hh:93
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
std::string report(std::string const &which) const
Send a report on a given run-time quantity.
The EventStore is full, but the Fragment was accepted as it is for an already-open event...
bool initialize(fhicl::ParameterSet const &pset)
Processes the initialize request.
bool pause()
Pauses the AggregatorCore.
int( ART_CFGSTRING_FCN)(const std::string &)
An art function that accepts a fhicl::ParameterSet as a string.
Definition: EventStore.hh:62
The Fragment was rejected, because the RawEventQueue is full.
static const std::string STORE_EVENT_WAIT_STAT_KEY
Key for the EventStore Event Wait MonitoredQuantity.
size_t process_fragments()
The main working loop of the AggregatorCore. Receives events from DataReceiverManager and processes t...