artdaq  v2_03_03
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Pages
AggregatorCore.cc
1 #pragma GCC diagnostic push
2 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3 #include <xmlrpc-c/client_simple.hpp>
4 #pragma GCC diagnostic pop
5 
6 #include <errno.h>
7 #include <sstream>
8 #include <iomanip>
9 #include <bitset>
10 
11 #include <boost/tokenizer.hpp>
12 #include <boost/filesystem.hpp>
13 #include <boost/algorithm/string.hpp>
14 #include "art/Framework/Art/artapp.h"
15 #include "cetlib/BasicPluginFactory.h"
16 
17 #include "artdaq-core/Core/SimpleQueueReader.hh"
18 #include "artdaq-core/Utilities/ExceptionHandler.hh"
19 #include "artdaq-core/Data/RawEvent.hh"
20 
21 #include "artdaq/Application/AggregatorCore.hh"
22 #include "artdaq/DAQrate/EventStore.hh"
23 #include "artdaq/DAQrate/detail/FragCounter.hh"
24 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
25 
26 
27 namespace BFS = boost::filesystem;
28 
29 const std::string artdaq::AggregatorCore::INPUT_EVENTS_STAT_KEY("AggregatorCoreInputEvents");
30 const std::string artdaq::AggregatorCore::INPUT_WAIT_STAT_KEY("AggregatorCoreInputWaitTime");
31 const std::string artdaq::AggregatorCore::STORE_EVENT_WAIT_STAT_KEY("AggregatorCoreStoreEventWaitTime");
32 const std::string artdaq::AggregatorCore::SHM_COPY_TIME_STAT_KEY("AggregatorCoreShmCopyTime");
33 const std::string artdaq::AggregatorCore::FILE_CHECK_TIME_STAT_KEY("AggregatorCoreFileCheckTime");
34 
35 namespace artdaq
36 {
43  void display_bits(void* memstart, size_t nbytes, std::string sourcename)
44  {
45  std::stringstream bitstr;
46  bitstr << "The " << nbytes << "-byte chunk of memory beginning at " << static_cast<void*>(memstart) << " is : ";
47 
48  for (unsigned int i = 0; i < nbytes; i++)
49  {
50  if (i % 4 == 0)
51  {
52  bitstr << "\n";
53  }
54 
55  bitstr << std::bitset<8>(*((reinterpret_cast<uint8_t*>(memstart)) + i)) << " ";
56  }
57 
58  TLOG_DEBUG(sourcename) << bitstr.str() << TLOG_ENDL;
59  }
60 }
61 
62 
63 artdaq::AggregatorCore::AggregatorCore(int rank, std::string name)
64  : name_(name)
65  , art_initialized_(false)
66  , event_queue_(artdaq::getGlobalQueue(10))
67  , stop_requested_(false)
68  , local_pause_requested_(false)
69  , processing_fragments_(false)
70  , system_pause_requested_(false)
71  , previous_run_duration_(-1.0)
72  , new_transfers_(0)
73 {
74  TLOG_DEBUG(name_) << "Constructor" << TLOG_ENDL;
80  metricMan = &metricMan_;
81  my_rank = rank;
82 }
83 
85 {
86  TLOG_DEBUG(name_) << "Destructor" << TLOG_ENDL;
87 }
88 
89 bool artdaq::AggregatorCore::initialize(fhicl::ParameterSet const& pset)
90 {
91  init_string_ = pset.to_string();
92  TLOG_DEBUG(name_) << "initialize method called with DAQ " << "ParameterSet = \"" << init_string_ << "\"." << TLOG_ENDL;
93 
94  // pull out the relevant parts of the ParameterSet
95  fhicl::ParameterSet daq_pset;
96  try
97  {
98  daq_pset = pset.get<fhicl::ParameterSet>("daq");
99  }
100  catch (...)
101  {
102  TLOG_ERROR(name_)
103  << "Unable to find the DAQ parameters in the initialization "
104  << "ParameterSet: \"" + pset.to_string() + "\"." << TLOG_ENDL;
105  return false;
106  }
107  fhicl::ParameterSet agg_pset;
108  try
109  {
110  agg_pset = daq_pset.get<fhicl::ParameterSet>("aggregator");
111  data_pset_ = agg_pset;
112  }
113  catch (...)
114  {
115  TLOG_ERROR(name_)
116  << "Unable to find the aggregator parameters in the DAQ "
117  << "initialization ParameterSet: \"" + daq_pset.to_string() + "\"." << TLOG_ENDL;
118  return false;
119  }
120  try
121  {
122  expected_events_per_bunch_ =
123  agg_pset.get<size_t>("expected_events_per_bunch");
124  }
125  catch (...)
126  {
127  TLOG_ERROR(name_)
128  << "The expected_events_per_bunch parameter was not specified "
129  << "in the aggregator initialization PSet: \"" << pset.to_string()
130  << "\"." << TLOG_ENDL;
131  return false;
132  }
133 
134  enq_timeout_ = static_cast<detail::seconds>(agg_pset.get<size_t>("enq_timeout", 5.0));
135 
136  // 15-Jun-2016, KAB: added ability to specify either is_data_logger or
137  // is_online_monitor in the parameter set. If neither are set in the PSet,
138  // then we default to the old-style of behavior in which the first AG is the
139  // data logger and the second is the online monitor.
140  is_data_logger_ = false;
141  is_online_monitor_ = false;
142  is_dispatcher_ = false;
143  std::string metricsReportingInstanceName = "Data Logger";
144  bool agtype_was_specified = false;
145  if (!agtype_was_specified)
146  {
147  try
148  {
149  is_data_logger_ = agg_pset.get<bool>("is_data_logger");
150  agtype_was_specified = true;
151  }
152  catch (...) {} // leave agtype_was_specified set to false
153  }
154  if (!agtype_was_specified)
155  {
156  try
157  {
158  is_online_monitor_ = agg_pset.get<bool>("is_online_monitor");
159  metricsReportingInstanceName = "Online Monitor";
160  agtype_was_specified = true;
161  }
162  catch (...) {} // leave agtype_was_specified set to false
163  }
164  if (!agtype_was_specified)
165  {
166  try
167  {
168  is_dispatcher_ = agg_pset.get<bool>("is_dispatcher");
169  metricsReportingInstanceName = "Dispatcher";
170  agtype_was_specified = true;
171  }
172  catch (...) {} // leave agtype_was_specified set to false
173  }
174 
175  if (!agtype_was_specified)
176  {
177  throw cet::exception("ConfigurationException", "You must specify one of is_data_logger, is_online_monitor or is_dispatcher");
178  return false;
179  }
180  TLOG_DEBUG(name_) << "Rank " << my_rank
181  << ", is_data_logger = " << is_data_logger_
182  << ", is_online_monitor = " << is_online_monitor_
183  << ", is_dispatcher = " << is_dispatcher_ << TLOG_ENDL;
184 
185  disk_writing_directory_ = "";
186  try
187  {
188  fhicl::ParameterSet output_pset =
189  pset.get<fhicl::ParameterSet>("outputs");
190  fhicl::ParameterSet normalout_pset =
191  output_pset.get<fhicl::ParameterSet>("normalOutput");
192 
193  if (!normalout_pset.is_empty())
194  {
195  std::string filename = normalout_pset.get<std::string>("fileName", "");
196  if (filename.size() > 0)
197  {
198  size_t pos = filename.rfind("/");
199  if (pos != std::string::npos)
200  {
201  disk_writing_directory_ = filename.substr(0, pos);
202  }
203  }
204  else
205  {
206  TLOG_WARNING(name_) << "Problem finding \"fileName\" parameter in \"normalOutput\" RootOutput module FHiCL code" << TLOG_ENDL;
207  }
208  }
209  }
210  catch (...) {}
211 
212  std::string xmlrpcClientString =
213  agg_pset.get<std::string>("xmlrpc_client_list", "");
214  if (xmlrpcClientString.size() > 0)
215  {
216  xmlrpc_client_lists_.clear();
217  boost::char_separator<char> sep1(";");
218  boost::tokenizer<boost::char_separator<char>>
219  primaryTokens(xmlrpcClientString, sep1);
220  boost::tokenizer<boost::char_separator<char>>::iterator iter1;
221  boost::tokenizer<boost::char_separator<char>>::iterator
222  endIter1 = primaryTokens.end();
223  for (iter1 = primaryTokens.begin(); iter1 != endIter1; ++iter1)
224  {
225  boost::char_separator<char> sep2(",");
226  boost::tokenizer<boost::char_separator<char>>
227  secondaryTokens(*iter1, sep2);
228  boost::tokenizer<boost::char_separator<char>>::iterator iter2;
229  boost::tokenizer<boost::char_separator<char>>::iterator
230  endIter2 = secondaryTokens.end();
231  int clientGroup = -1;
232  std::string url = "";
233  int loopCount = 0;
234  for (iter2 = secondaryTokens.begin(); iter2 != endIter2; ++iter2)
235  {
236  switch (loopCount)
237  {
238  case 0:
239  url = *iter2;
240  break;
241  case 1:
242  try
243  {
244  clientGroup = boost::lexical_cast<int>(*iter2);
245  }
246  catch (...) {}
247  break;
248  default:
249  TLOG_WARNING(name_)
250  << "Unexpected XMLRPC client list element, index = "
251  << loopCount << ", value = \"" << *iter2 << "\"" << TLOG_ENDL;
252  }
253  ++loopCount;
254  }
255  if (clientGroup >= 0 && url.size() > 0)
256  {
257  int elementsNeeded = clientGroup + 1 - ((int)xmlrpc_client_lists_.size());
258  for (int idx = 0; idx < elementsNeeded; ++idx)
259  {
260  std::vector<std::string> tmpVec;
261  xmlrpc_client_lists_.push_back(tmpVec);
262  }
263  xmlrpc_client_lists_[clientGroup].push_back(url);
264  }
265  }
266  }
267  double fileSizeMB = agg_pset.get<double>("subrun_size_MB", 0);
268  file_close_threshold_bytes_ = ((size_t)fileSizeMB * 1024.0 * 1024.0);
269  file_close_timeout_secs_ = agg_pset.get<time_t>("subrun_duration", 0);
270  file_close_event_count_ = agg_pset.get<size_t>("subrun_event_count", 0);
271 
272  inrun_recv_timeout_usec_ = agg_pset.get<size_t>("inrun_recv_timeout_usec", 100000);
273  endrun_recv_timeout_usec_ = agg_pset.get<size_t>("endrun_recv_timeout_usec", 20000000);
274  pause_recv_timeout_usec_ = agg_pset.get<size_t>("pause_recv_timeout_usec", 3000000);
275 
276  onmon_event_prescale_ = agg_pset.get<size_t>("onmon_event_prescale", 1);
277 
278  filesize_check_interval_seconds_ = agg_pset.get<int32_t>("filesize_check_interval_seconds", 20);
279  filesize_check_interval_events_ = agg_pset.get<int32_t>("filesize_check_interval_events", 20);
280 
281  // fetch the monitoring parameters and create the MonitoredQuantity instances
282  stats_helper_.createCollectors(agg_pset, 50, 20.0, 60.0, INPUT_EVENTS_STAT_KEY);
283 
284  // initialize the MetricManager and the names of our metrics
285  fhicl::ParameterSet metric_pset;
286 
287  try
288  {
289  metric_pset = daq_pset.get<fhicl::ParameterSet>("metrics");
290  }
291  catch (...) {} // OK if there's no metrics table defined in the FHiCL
292 
293  if (metric_pset.is_empty())
294  {
295  TLOG_INFO(name_) << "No metric plugins appear to be defined" << TLOG_ENDL;
296  }
297  try
298  {
299  metricMan_.initialize(metric_pset, metricsReportingInstanceName);
300  }
301  catch (...)
302  {
303  ExceptionHandler(ExceptionHandlerRethrow::no,
304  "Error loading metrics in AggregatorCore::initialize()");
305  }
306 
307  if (event_store_ptr_ == nullptr)
308  {
309  artdaq::EventStore::ART_CFGSTRING_FCN* reader = &artapp_string_config;
310  size_t desired_events_per_bunch = expected_events_per_bunch_;
311  if (is_online_monitor_ || is_dispatcher_)
312  {
313  desired_events_per_bunch = 1;
314  }
315  TRACE(36, "Creating EventStore and Starting art thread");
316  event_store_ptr_.reset(new artdaq::EventStore(agg_pset, desired_events_per_bunch, 1,
317  init_string_, reader));
318  TRACE(36, "Done Creating EventStore");
319  event_store_ptr_->setSeqIDModulus(desired_events_per_bunch);
320  fhicl::ParameterSet tmp = pset;
321  tmp.erase("daq");
322  previous_pset_ = tmp;
323  }
324  else
325  {
326  fhicl::ParameterSet tmp = pset;
327  tmp.erase("daq");
328  if (tmp != previous_pset_)
329  {
330  TLOG_ERROR(name_)
331  << "The art configuration can not be altered after art "
332  << "has been configured." << TLOG_ENDL;
333  return false;
334  }
335  }
336 
337  return true;
338 }
339 
340 bool artdaq::AggregatorCore::start(art::RunID id)
341 {
342  event_count_in_run_ = 0;
343  event_count_in_subrun_ = 0;
344  subrun_start_time_ = time(0);
345  stats_helper_.resetStatistics();
346  previous_run_duration_ = -1.0;
347 
348  stop_requested_.store(false);
349  local_pause_requested_.store(false);
350  run_id_ = id;
351  metricMan_.do_start();
352  event_store_ptr_->startRun(run_id_.run());
353 
354  logMessage_("Started run " + boost::lexical_cast<std::string>(run_id_.run()));
355  return true;
356 }
357 
359 {
360  logMessage_("Stopping run " + boost::lexical_cast<std::string>(run_id_.run()) +
361  ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
362  " events received so far.");
363 
364  /* Nothing to do here. The aggregator we clean up after itself once it has
365  received all of the EOD fragments it expects. Higher level code will block
366  until the process_fragments() thread exits. */
367  stop_requested_.store(true);
368  return true;
369 }
370 
372 {
373  logMessage_("Pausing run " + boost::lexical_cast<std::string>(run_id_.run()) +
374  ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
375  " events received so far.");
376 
377  /* Nothing to do here. The aggregator we clean up after itself once it has
378  received all of the EOD fragments it expects. Higher level code will block
379  until the process_fragments() thread exits. */
380  local_pause_requested_.store(true);
381  return true;
382 }
383 
385 {
386  event_count_in_subrun_ = 0;
387  subrun_start_time_ = time(0);
388  local_pause_requested_.store(false);
389 
390  logMessage_("Resuming run " + boost::lexical_cast<std::string>(run_id_.run()));
391  metricMan_.do_start();
392  event_store_ptr_->startSubrun();
393  return true;
394 }
395 
397 {
398  int readerReturnValue;
399  bool endSucceeded = false;
400  int attemptsToEnd = 1;
401  endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
402  while (!endSucceeded && attemptsToEnd < 3)
403  {
404  ++attemptsToEnd;
405  TLOG_DEBUG(name_) << "Retrying EventStore::endOfData()" << TLOG_ENDL;
406  endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
407  }
408  metricMan_.shutdown();
409 
410  return endSucceeded;
411 }
412 
413 bool artdaq::AggregatorCore::soft_initialize(fhicl::ParameterSet const& pset)
414 {
415  TLOG_DEBUG(name_) << "soft_initialize method called with DAQ "
416  << "ParameterSet = \"" << pset.to_string()
417  << "\"." << TLOG_ENDL;
418  return true;
419 }
420 
421 bool artdaq::AggregatorCore::reinitialize(fhicl::ParameterSet const& pset)
422 {
423  TLOG_DEBUG(name_) << "reinitialize method called with DAQ "
424  << "ParameterSet = \"" << pset.to_string()
425  << "\"." << TLOG_ENDL;
426  return true;
427 }
428 
430 {
431  processing_fragments_.store(true);
432 
433  size_t eodFragmentsReceived = 0;
434  bool process_fragments = true;
435  int senderSlot;
436  detail::FragCounter fragments_received;
437  detail::FragCounter fragments_sent;
438  artdaq::FragmentPtr endSubRunMsg(nullptr);
439  time_t last_filesize_check_time = subrun_start_time_;
440 
441  // if (is_data_logger_)
442  if (true)
443  {
444  receiver_ptr_.reset(new artdaq::DataReceiverManager(data_pset_));
445  receiver_ptr_->start_threads();
446  }
447 
448  if (is_data_logger_ && data_pset_.has_key("destinations"))
449  {
450  sender_ptr_.reset(new artdaq::DataSenderManager(data_pset_));
451 
452  if (sender_ptr_->destinationCount() == 0)
453  {
454  sender_ptr_.reset(nullptr);
455  }
456  }
457 
458  TLOG_DEBUG(name_) << "Waiting for first fragment." << TLOG_ENDL;
459 
460  artdaq::MonitoredQuantityStats::TIME_POINT_T startTime;
461  while (process_fragments)
462  {
463  artdaq::FragmentPtr fragmentPtr(new artdaq::Fragment);
464 
465  size_t recvTimeout = inrun_recv_timeout_usec_;
466  if (stop_requested_.load()) { recvTimeout = endrun_recv_timeout_usec_; }
467  else if (local_pause_requested_.load()) { recvTimeout = pause_recv_timeout_usec_; }
468 
469  startTime = artdaq::MonitoredQuantity::getCurrentTime();
470 
471  //Removed if statement on different Aggregator types as they all go through DataReceiverManager now
472  fragmentPtr = receiver_ptr_->recvFragment(senderSlot, recvTimeout);
473 
474  stats_helper_.addSample(INPUT_WAIT_STAT_KEY,
475  (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
476  /*if (senderSlot == MPI_ANY_SOURCE) // Use RECV_TIMEOUT now to indicate that no senders have sent anything
477  {
478  if (endSubRunMsg != nullptr)
479  {
480  TLOG_INFO(name_)
481  << "There appears to be no more data to receive - ending the run." << TLOG_ENDL;
482  event_store_ptr_->flushData();
483  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
484  subRunEvent->insertFragment(std::move(endSubRunMsg));
485 
486  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
487 
488  if (!enqStatus)
489  {
490  TLOG_ERROR(name_) << "Attempt to send EndOfSubRun fragment to art timed out after " <<
491  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
492  }
493  }
494  else
495  {
496  TLOG_ERROR(name_)
497  << "There appears to be no more data to receive, but the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
498  }
499 
500  process_fragments = false;
501  continue;
502  }*/
503  if (senderSlot == artdaq::TransferInterface::RECV_TIMEOUT)
504  {
505  if (stop_requested_.load() &&
506  recvTimeout == endrun_recv_timeout_usec_)
507  {
508  if (endSubRunMsg != nullptr)
509  {
510  TLOG_WARNING(name_)
511  << "Timeout occurred in attempt to receive data, but as a stop has been requested, will forcibly end the run." << TLOG_ENDL;
512  event_store_ptr_->flushData();
513  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
514  subRunEvent->insertFragment(std::move(endSubRunMsg));
515 
516  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
517  if (!enqStatus)
518  {
519  TLOG_ERROR(name_) << "Attempt to send EndOfSubRun fragment to art timed out after " <<
520  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
521  }
522  }
523  else
524  {
525  if (event_count_in_subrun_ > 0)
526  {
527  TLOG_ERROR(name_)
528  << "Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
529  }
530  else
531  {
532  std::string msg("Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art;");
533  msg.append("DAQ may need to be returned to the \"Stopped\" state before further datataking");
534  logMessage_(msg);
535  }
536  }
537  process_fragments = false;
538  }
539  else if (local_pause_requested_.load() &&
540  recvTimeout == pause_recv_timeout_usec_)
541  {
542  if (endSubRunMsg != nullptr)
543  {
544  TLOG_WARNING(name_)
545  << "Timeout occurred in attempt to receive data, but as a pause has been requested, will forcibly pause the run." << TLOG_ENDL;
546  event_store_ptr_->flushData();
547  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
548  subRunEvent->insertFragment(std::move(endSubRunMsg));
549 
550  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
551  if (!enqStatus)
552  {
553  TLOG_ERROR(name_) << "Attempt to send EndOfSubRun fragment to art timed out after " <<
554  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
555  }
556  }
557  else
558  {
559  TLOG_ERROR(name_) <<
560  "Timeout receiving data after pause request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
561  }
562  process_fragments = false;
563  }
564 
565  continue;
566  }
567  else if (!fragmentPtr)
568  {
569  TLOG_ERROR(name_) << "Received invalid fragment from " << senderSlot << ". This is usually the case when a timeout has occurred, but sender was not set to RECV_TIMEOUT as expected." << TLOG_ENDL;
570  continue;
571  }
572  if (!receiver_ptr_->enabled_sources().count(senderSlot))
573  {
574  TLOG_ERROR(name_)
575  << "Invalid senderSlot received from recvFragment: "
576  << senderSlot << TLOG_ENDL;
577  continue;
578  }
579  fragments_received.incSlot(senderSlot);
580  if (artdaq::Fragment::isSystemFragmentType(fragmentPtr->type()) &&
581  fragmentPtr->type() != artdaq::Fragment::DataFragmentType)
582  {
583  TLOG_DEBUG(name_)
584  << "Sender slot = " << senderSlot
585  << ", fragment type = " << static_cast<int>(fragmentPtr->type())
586  << ", sequence ID = " << fragmentPtr->sequenceID() << TLOG_ENDL;
587  }
588 
589  // 11-Sep-2013, KAB - protect against invalid fragments
590  if (fragmentPtr->type() == artdaq::Fragment::InvalidFragmentType)
591  {
592  size_t fragSize = fragmentPtr->size() * sizeof(artdaq::RawDataType);
593  TLOG_ERROR(name_) << "Fragment received with type of "
594  << "INVALID. Size = " << fragSize
595  << ", sequence ID = " << fragmentPtr->sequenceID()
596  << ", fragment ID = " << fragmentPtr->fragmentID()
597  << ", and type = " << static_cast<int>(fragmentPtr->type()) << TLOG_ENDL;
598  continue;
599  }
600 
601  if (artdaq::Fragment::isUserFragmentType(fragmentPtr->type()) ||
602  fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
603  {
604  ++event_count_in_run_;
605  ++event_count_in_subrun_;
606  if (event_count_in_run_ == 1)
607  {
608  logMessage_("Received event " +
609  boost::lexical_cast<std::string>(event_count_in_run_) +
610  " with sequence id " +
611  boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
612  ".");
613  }
614  stats_helper_.addSample(INPUT_EVENTS_STAT_KEY, fragmentPtr->size());
615  if (stats_helper_.readyToReport(event_count_in_run_))
616  {
617  std::string statString = buildStatisticsString_();
618  logMessage_(statString);
619  logMessage_("Received event " +
620  boost::lexical_cast<std::string>(event_count_in_run_) +
621  " with sequence id " +
622  boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
623  " (run " +
624  boost::lexical_cast<std::string>(run_id_.run()) +
625  ", subrun " +
626  boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
627  ").");
628  }
629  }
630  if (stats_helper_.statsRollingWindowHasMoved()) { sendMetrics_(); }
631 
632  startTime = artdaq::MonitoredQuantity::getCurrentTime();
633 
634  if (is_data_logger_ && fragmentPtr->type() == artdaq::Fragment::DataFragmentType
635  && (event_count_in_run_ % onmon_event_prescale_) == 0 && sender_ptr_)
636  {
637  try
638  {
639  auto fragCopy = *fragmentPtr;
640  sender_ptr_->sendFragment(std::move(fragCopy));
641  }
642  catch (...)
643  {
644  ExceptionHandler(ExceptionHandlerRethrow::no,
645  "Exception thrown during data logger copy of event to dispatcher");
646  }
647  }
648  else if (is_dispatcher_)
649  {
650  if (fragmentPtr->type() != artdaq::Fragment::EndOfDataFragmentType)
651  {
652  if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
653  {
654  init_fragment_ptr_ = std::make_unique<artdaq::Fragment>(*fragmentPtr);
655  }
656 
657  std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
658 
659  if (new_transfers_ == 0)
660  {
661  // So as to not flood log files/viewers with messages...
662  if (dispatcher_transfers_.size() > 0 && fragmentPtr->sequenceID() % 100 == 0)
663  {
664  TLOG_DEBUG(name_) << "Dispatcher: broadcasting seqID = " << fragmentPtr->sequenceID() << ", type = " <<
665  static_cast<size_t>(fragmentPtr->type()) << " to " << dispatcher_transfers_.size()
666  << " registered monitors" << TLOG_ENDL;
667  }
668  for (auto& transfer : dispatcher_transfers_)
669  {
670  transfer->copyFragment(*fragmentPtr, 0);
671  }
672  }
673  else
674  {
675  for (size_t i_q = dispatcher_transfers_.size() - new_transfers_; i_q < dispatcher_transfers_.size(); ++i_q)
676  {
677  TLOG_INFO(name_) << "Copying out init fragment, type " << static_cast<int>(init_fragment_ptr_->type()) <<
678  ", size " << init_fragment_ptr_->sizeBytes() << TLOG_ENDL;
679  dispatcher_transfers_[i_q]->copyFragment(*init_fragment_ptr_, 500000);
680  }
681  new_transfers_ = 0;
682  }
683  }
684  }
685 
686  stats_helper_.addSample(SHM_COPY_TIME_STAT_KEY,
687  (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
688 
689  //----------------------------------------------------------------------------
690 
691  artdaq::Fragment::sequence_id_t seq = fragmentPtr->sequenceID();
692  TRACE(21, "%s::process_fragments seq=%lu isLogger=%d type=%d"
693  , name_.c_str(), seq, is_data_logger_, fragmentPtr->type());
694  startTime = artdaq::MonitoredQuantity::getCurrentTime();
695  if (!art_initialized_)
696  {
697  /* The init fragment should always be the first fragment out of the
698  EventBuilder. */
699  if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
700  {
701  TLOG_DEBUG(name_) << "Init" << TLOG_ENDL;
702 
703  if (is_data_logger_ && sender_ptr_)
704  {
705  auto fragCopy = *fragmentPtr;
706  sender_ptr_->sendFragment(std::move(fragCopy));
707 
708  }
709 
710  artdaq::RawEvent_ptr initEvent(new artdaq::RawEvent(run_id_.run(), 1, fragmentPtr->sequenceID()));
711  initEvent->insertFragment(std::move(fragmentPtr));
712 
713  bool enqStatus = event_queue_.enqTimedWait(initEvent, enq_timeout_);
714 
715  if (!enqStatus)
716  {
717  TLOG_ERROR(name_) << "Attempt to send Init event to art timed out after " <<
718  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
719  }
720  art_initialized_ = true;
721  }
722  else
723  {
724  TLOG_ERROR(name_) << "Didn't receive an Init event with which to initialize art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
725  }
726  }
727  else
728  {
729  /* Note that in the currently implementation of the NetMon output/input
730  modules there are no EndOfRun or Shutdown fragments. */
731  if (fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
732  {
733  if (is_data_logger_)
734  {
735  artdaq::FragmentPtr rejectedFragment;
736  auto seqId = fragmentPtr->sequenceID();
737  bool try_again = true;
738  while (try_again)
739  {
740  auto ret = event_store_ptr_->insert(std::move(fragmentPtr), rejectedFragment);
742  {
743  receiver_ptr_->unsuppressAll();
744  try_again = false;
745  }
747  {
748  try_again = false;
749  }
750  else if (stop_requested_.load())
751  {
752  try_again = false;
753  process_fragments = false;
754  receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
755  TLOG_WARNING(name_)
756  << "Unable to process event " << seqId
757  << " because of back-pressure - forcibly ending the run." << TLOG_ENDL;
758  }
759  else if (local_pause_requested_.load())
760  {
761  try_again = false;
762  process_fragments = false;
763  receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
764  TLOG_WARNING(name_)
765  << "Unable to process event " << seqId
766  << " because of back-pressure - forcibly pausing the run." << TLOG_ENDL;
767  }
769  {
770  fragmentPtr = std::move(rejectedFragment);
771  TLOG_WARNING(name_)
772  << "Unable to process event " << seqId
773  << " because of back-pressure from art - retrying..." << TLOG_ENDL;
774  }
775  else
776  {
777  try_again = false;
778  receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
779  TLOG_WARNING(name_)
780  << "Unable to process event " << seqId
781  << " because the EventStore has reached the maximum number of incomplete bunches." << std::endl
782  << " Will retry when the EventStore is ready for new events." << TLOG_ENDL;
783  }
784  }
785  }
786  else
787  {
788  event_store_ptr_->insert(std::move(fragmentPtr), false);
789  }
790  }
791  else if (fragmentPtr->type() == artdaq::Fragment::EndOfSubrunFragmentType)
792  {
793  if (is_data_logger_ && sender_ptr_)
794  {
795  auto fragCopy = *fragmentPtr;
796  sender_ptr_->sendFragment(std::move(fragCopy));
797  }
798  else if (is_dispatcher_)
799  {
800  for (auto& transfer : dispatcher_transfers_)
801  {
802  transfer->copyFragment(*fragmentPtr, 0);
803  }
804  }
805 
806  /* We inject the EndSubrun fragment after all other data has been
807  received. The SHandles and RHandles classes do not guarantee that
808  data will be received in the same order it is sent. We'll hold on to
809  this fragment and inject it once we've received all EOD fragments. */
810  endSubRunMsg = std::move(fragmentPtr);
811  }
812  else if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
813  {
814  eodFragmentsReceived++;
815  /* We count the EOD fragment as a fragment received but the SHandles class
816  does not count it as a fragment sent which means we need to add one to
817  the total expected fragments. */
818  fragments_sent.setSlot(senderSlot, *fragmentPtr->dataBegin() + 1);
819  }
820  }
821  float delta = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
822  stats_helper_.addSample(STORE_EVENT_WAIT_STAT_KEY, delta);
823  TRACE((delta > 3.0) ? 0 : 22, "%s::process_fragments seq=%lu isLogger=%d delta=%f start=%f"
824  , name_.c_str(), seq, is_data_logger_, delta, startTime);
825 
826  // 27-Sep-2013, KAB - added automatic file closing
827  startTime = artdaq::MonitoredQuantity::getCurrentTime();
828  if (is_data_logger_ && disk_writing_directory_.size() > 0 &&
829  !stop_requested_.load() && !system_pause_requested_.load())
830  {
831  bool threshold_reached = false;
832  if (file_close_event_count_ > 0 &&
833  event_count_in_subrun_ >= file_close_event_count_)
834  {
835  threshold_reached = true;
836  }
837  else
838  {
839  time_t now = time(0);
840  if (file_close_timeout_secs_ > 0 &&
841  (now - subrun_start_time_) >= file_close_timeout_secs_)
842  {
843  threshold_reached = true;
844  }
845  else
846  {
847  if (filesize_check_interval_seconds_ > 0 &&
848  filesize_check_interval_events_ > 0 &&
849  (now - last_filesize_check_time) >= filesize_check_interval_seconds_ &&
850  (event_count_in_run_ % filesize_check_interval_events_) == 0)
851  {
852  if (file_close_threshold_bytes_ > 0 &&
853  getLatestFileSize_() >= file_close_threshold_bytes_)
854  {
855  threshold_reached = true;
856  }
857  last_filesize_check_time = now;
858  }
859  }
860  }
861  if (threshold_reached)
862  {
863  system_pause_requested_.store(true);
864  if (pause_thread_.get() != 0)
865  {
866  pause_thread_->join();
867  }
868  TLOG_DEBUG(name_) << "Starting sendPauseAndResume thread "
869  << ", event count in subrun = "
870  << event_count_in_subrun_ << TLOG_ENDL;
871  pause_thread_.reset(new std::thread(&AggregatorCore::sendPauseAndResume_, this));
872  }
873  }
874  stats_helper_.addSample(FILE_CHECK_TIME_STAT_KEY,
875  (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
876 
877  /* If we've received EOD fragments from all of the EventBuilders we can
878  verify that we've also received every fragment that they have sent. If
879  all fragments are accounted for we can flush the EventStoreand exit out
880  of this thread.*/
881 
882  size_t source_count = 0;
883  if (is_data_logger_) source_count = receiver_ptr_->enabled_sources().size();
884  else source_count = 1;
885 
886  if (eodFragmentsReceived >= source_count && endSubRunMsg != nullptr)
887  {
888  bool fragmentsOutstanding = false;
889  if (is_data_logger_)
890  {
891  for (auto& i : receiver_ptr_->enabled_sources())
892  {
893  if (fragments_received[i] != fragments_sent[i])
894  {
895  fragmentsOutstanding = true;
896  break;
897  }
898  }
899  }
900 
901  if (!fragmentsOutstanding)
902  {
903  event_store_ptr_->flushData();
904  artdaq::RawEvent_ptr subRunEvent(new artdaq::RawEvent(run_id_.run(), 1, 0));
905  subRunEvent->insertFragment(std::move(endSubRunMsg));
906 
907  bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
908 
909  if (!enqStatus)
910  {
911  TLOG_ERROR(name_) << "All data appears to have been received but attempt to send EndOfSubRun fragment to art timed out after " <<
912  enq_timeout_.count() << " seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
913  }
914  process_fragments = false;
915  }
916  else
917  {
918  TLOG_WARNING(name_) << "EndOfSubRun fragment and all EndOfData fragments received but more data expected" << TLOG_ENDL;
919  }
920  }
921  }
922 
923  logMessage_("Subrun " +
924  boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
925  " in run " + boost::lexical_cast<std::string>(run_id_.run()) +
926  " has ended. There were " +
927  boost::lexical_cast<std::string>(event_count_in_subrun_) +
928  " events in this subrun, and there have been " +
929  boost::lexical_cast<std::string>(event_count_in_run_) +
930  " events so far in this run.");
931 
932  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
933  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
934  if (mqPtr.get() != 0)
935  {
936  artdaq::MonitoredQuantityStats stats;
937  mqPtr->getStats(stats);
938  std::ostringstream oss;
939  oss << "Run " << run_id_.run() << " has an overall event rate of ";
940  oss << std::fixed << std::setprecision(1) << stats.fullSampleRate;
941  oss << " events/sec.";
942  logMessage_(oss.str());
943  previous_run_duration_ = stats.fullDuration;
944  }
945 
946  // 11-May-2015, KAB: call MetricManager::do_stop whenever we exit the
947  // processing fragments loop so that metrics correctly go to zero when
948  // there is no data flowing
949  metricMan_.do_stop();
950 
951  receiver_ptr_.reset(nullptr);
952  sender_ptr_.reset(nullptr);
953 
954  processing_fragments_.store(false);
955  return 0;
956 }
957 
958 std::string artdaq::AggregatorCore::report(std::string const& which) const
959 {
960  if (which == "event_count")
961  {
962  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
963  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
964  if (mqPtr.get() != 0)
965  {
966  return boost::lexical_cast<std::string>(mqPtr->getFullSampleCount());
967  }
968  else
969  {
970  return "-1";
971  }
972  }
973 
974  if (which == "run_duration")
975  {
976  // 17-Jan-2014, KAB: if we are not processing fragments, return
977  // the previous run duration
978  double duration = previous_run_duration_;
979  if (processing_fragments_.load())
980  {
981  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
982  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
983  if (mqPtr.get() != 0)
984  {
985  duration = mqPtr->getFullDuration();
986  }
987  }
988  std::ostringstream oss;
989  oss << std::fixed << std::setprecision(1) << duration;
990  return oss.str();
991  }
992 
993  if (which == "file_size")
994  {
995  size_t latestFileSize = getLatestFileSize_();
996  return boost::lexical_cast<std::string>(latestFileSize);
997  }
998 
999  if (which == "subrun_number")
1000  {
1001  if (event_store_ptr_.get() != nullptr)
1002  {
1003  return boost::lexical_cast<std::string>(event_store_ptr_->subrunID());
1004  }
1005  else
1006  {
1007  return "-1";
1008  }
1009  }
1010 
1011  if (which == "incomplete_event_count")
1012  {
1013  if (event_store_ptr_ != nullptr)
1014  {
1015  return boost::lexical_cast<std::string>(event_store_ptr_->incompleteEventCount());
1016  }
1017  else
1018  {
1019  return "-1";
1020  }
1021  }
1022 
1023  // lots of cool stuff that we can do here
1024  // - report on the number of fragments received and the number
1025  // of events built (in the current or previous run
1026  // - report on the number of incomplete events in the EventStore
1027  // (if running)
1028  std::string tmpString = name_ + " run number = ";
1029  tmpString.append(boost::lexical_cast<std::string>(run_id_.run()));
1030  tmpString.append(". Command=\"" + which + "\" is not currently supported.");
1031  return tmpString;
1032 }
1033 
1034 std::string artdaq::AggregatorCore::register_monitor(fhicl::ParameterSet const& pset)
1035 {
1036  TLOG_DEBUG(name_) << "AggregatorCore::register_monitor called with argument \"" << pset.to_string() << "\"" << TLOG_ENDL;
1037  std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1038 
1039  try
1040  {
1041  auto transfer = MakeTransferPlugin(pset, "transfer_plugin", TransferInterface::Role::kSend);
1042 
1043  for (auto& existing_transfer_ : dispatcher_transfers_)
1044  {
1045  if (existing_transfer_->uniqueLabel() == transfer->uniqueLabel())
1046  {
1047  std::stringstream errmsg;
1048  errmsg << "Attempt to register newly-created monitor with label \"" <<
1049  transfer->uniqueLabel() << "\" failed; a monitor with that label already exists";
1050  return errmsg.str();
1051  }
1052  }
1053 
1054  dispatcher_transfers_.emplace_back(std::move(transfer));
1055 
1056  TLOG_INFO(name_) << "Successfully registered monitor with label \"" << dispatcher_transfers_.back()->uniqueLabel() << "\"" << TLOG_ENDL;
1057 
1058  new_transfers_++;
1059  }
1060  catch (...)
1061  {
1062  std::stringstream errmsg;
1063  errmsg << "Unable to create a Transfer plugin with the FHiCL code \"" << pset.to_string() << "\", a new monitor has not been registered";
1064  return errmsg.str();
1065  }
1066 
1067  return "Success";
1068 }
1069 
1070 std::string artdaq::AggregatorCore::unregister_monitor(std::string const& label)
1071 {
1072  TLOG_DEBUG(name_) << "AggregatorCore::unregister_monitor called with argument \"" << label << "\"" << TLOG_ENDL;
1073  std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1074 
1075  try
1076  {
1077  auto r_i_end = std::remove_if(dispatcher_transfers_.begin(),
1078  dispatcher_transfers_.end(),
1079  [label](const std::unique_ptr<TransferInterface>& transfer)
1080  {
1081  return transfer->uniqueLabel() == label;
1082  });
1083 
1084  auto nfound = dispatcher_transfers_.end() - r_i_end;
1085 
1086  TLOG_INFO(name_) << "Request from monitor with label \"" << label << "\" to unregister received" << TLOG_ENDL;
1087 
1088  if (nfound == 1)
1089  {
1090  dispatcher_transfers_.pop_back();
1091  return "Success";
1092  }
1093  else if (nfound == 0)
1094  {
1095  std::stringstream errmsg;
1096  errmsg << "Warning in AggregatorCore::unregister_monitor: unable to find requested transfer plugin with "
1097  << "label \"" << label << "\"";
1098  TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1099  return errmsg.str();
1100  }
1101  else
1102  {
1103  std::stringstream errmsg;
1104  errmsg << "Warning in AggregatorCore::unregister_monitor: found more than one (" << nfound <<
1105  ") transfer plugins with label \"" << label << "\", will unregister all of them";
1106  TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1107  dispatcher_transfers_.erase(r_i_end, dispatcher_transfers_.end());
1108  return errmsg.str();
1109  }
1110  }
1111  catch (...)
1112  {
1113  std::stringstream errmsg;
1114  errmsg << "Unable to unregister transfer plugin with label \"" << label << "\"";
1115  return errmsg.str();
1116  }
1117 
1118  return "Success";
1119 }
1120 
1121 
1122 size_t artdaq::AggregatorCore::getLatestFileSize_() const
1123 {
1124  if (disk_writing_directory_.size() == 0)
1125  {
1126  TLOG_DEBUG(name_) << "Latest file size = 0 (no directory)" << TLOG_ENDL;
1127  return 0;
1128  }
1129  BFS::path outputDir(disk_writing_directory_);
1130  BFS::directory_iterator endIter;
1131 
1132  std::time_t latestFileTime = 0;
1133  size_t latestFileSize = 0;
1134  if (BFS::exists(outputDir) && BFS::is_directory(outputDir))
1135  {
1136  for (BFS::directory_iterator dirIter(outputDir); dirIter != endIter; ++dirIter)
1137  {
1138  BFS::path pathObj = dirIter->path();
1139  if (pathObj.filename().string().find("RootOutput") != std::string::npos &&
1140  pathObj.filename().string().find("root") != std::string::npos)
1141  {
1142  if (BFS::last_write_time(pathObj) >= latestFileTime)
1143  {
1144  latestFileTime = BFS::last_write_time(pathObj);
1145  latestFileSize = BFS::file_size(pathObj);
1146  }
1147  }
1148  }
1149  }
1150  time_t now = time(0);
1151  if ((now - latestFileTime) < 60)
1152  {
1153  TLOG_DEBUG(name_) << "Latest file size = "
1154  << latestFileSize << TLOG_ENDL;
1155  return latestFileSize;
1156  }
1157  else
1158  {
1159  TLOG_DEBUG(name_) << "Latest file size = 0 (too old)" << TLOG_ENDL;
1160  return 0;
1161  }
1162 }
1163 
1164 bool artdaq::AggregatorCore::sendPauseAndResume_()
1165 {
1166  xmlrpc_c::clientSimple myClient;
1167  TLOG_INFO(name_) << "Starting automatic pause..." << TLOG_ENDL;
1168  for (size_t igrp = 0; igrp < xmlrpc_client_lists_.size(); ++igrp)
1169  {
1170  for (size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
1171  {
1172  for (size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1173  {
1174  xmlrpc_c::value result;
1175  myClient.call((xmlrpc_client_lists_[igrp])[idx], "daq.pause", &result);
1176  std::string const resultString = xmlrpc_c::value_string(result);
1177  TLOG_DEBUG(name_) << "Pause: "
1178  << (xmlrpc_client_lists_[igrp])[idx]
1179  << " " << resultString << TLOG_ENDL;
1180  if (std::string::npos !=
1181  boost::algorithm::to_lower_copy(resultString).find("success"))
1182  {
1183  break;
1184  }
1185  else
1186  {
1187  sleep(2);
1188  TLOG_WARNING(name_) << "Retrying pause command to "
1189  << (xmlrpc_client_lists_[igrp])[idx]
1190  << " (" << resultString << ")" << TLOG_ENDL;
1191  }
1192  }
1193  }
1194  }
1195  TLOG_INFO(name_) << "Starting automatic resume..." << TLOG_ENDL;
1196  for (int igrp = (xmlrpc_client_lists_.size() - 1); igrp >= 0; --igrp)
1197  {
1198  for (size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
1199  {
1200  for (size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1201  {
1202  xmlrpc_c::value result;
1203  myClient.call((xmlrpc_client_lists_[igrp])[idx], "daq.resume", &result);
1204  std::string const resultString = xmlrpc_c::value_string(result);
1205  TLOG_DEBUG(name_) << "Resume: "
1206  << (xmlrpc_client_lists_[igrp])[idx]
1207  << " " << resultString << TLOG_ENDL;
1208  if (std::string::npos !=
1209  boost::algorithm::to_lower_copy(resultString).find("success"))
1210  {
1211  break;
1212  }
1213  else
1214  {
1215  sleep(2);
1216  TLOG_WARNING(name_) << "Retrying resume command to "
1217  << (xmlrpc_client_lists_[igrp])[idx]
1218  << " (" << resultString << ")" << TLOG_ENDL;
1219  }
1220  }
1221  }
1222  }
1223  TLOG_INFO(name_) << "Done with automatic resume..." << TLOG_ENDL;
1224  system_pause_requested_.store(false);
1225  return true;
1226 }
1227 
1228 void artdaq::AggregatorCore::logMessage_(std::string const& text)
1229 {
1230  if (is_data_logger_)
1231  {
1232  TLOG_INFO(name_) << text << TLOG_ENDL;
1233  }
1234  else
1235  {
1236  TLOG_DEBUG(name_) << text << TLOG_ENDL;
1237  }
1238 }
1239 
1240 std::string artdaq::AggregatorCore::buildStatisticsString_()
1241 {
1242  std::ostringstream oss;
1243  double eventCount = 1.0;
1244  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1245  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1246  if (mqPtr.get() != 0)
1247  {
1248  //mqPtr->waitUntilAccumulatorsHaveBeenFlushed(3.0);
1249  artdaq::MonitoredQuantityStats stats;
1250  mqPtr->getStats(stats);
1251  oss << "Input statistics: "
1252  << stats.recentSampleCount << " events received at "
1253  << stats.recentSampleRate << " events/sec, data rate = "
1254  << (stats.recentValueRate * sizeof(artdaq::RawDataType)
1255  / 1024.0 / 1024.0) << " MB/sec, monitor window = "
1256  << stats.recentDuration << " sec, min::max event size = "
1257  << (stats.recentValueMin * sizeof(artdaq::RawDataType)
1258  / 1024.0 / 1024.0)
1259  << "::"
1260  << (stats.recentValueMax * sizeof(artdaq::RawDataType)
1261  / 1024.0 / 1024.0)
1262  << " MB" << std::endl;
1263  eventCount = std::max(double(stats.recentSampleCount), 1.0);
1264  oss << "Average times per event: ";
1265  if (stats.recentSampleRate > 0.0)
1266  {
1267  oss << " elapsed time = "
1268  << (1.0 / stats.recentSampleRate) << " sec";
1269  }
1270  }
1271 
1272  // 13-Jan-2015, KAB - Just a reminder that using "eventCount" in the
1273  // denominator of the calculations below is important so that the sum
1274  // of the different "average" times adds up to the overall average time
1275  // per event. In some (but not all) cases, using recentValueAverage()
1276  // would be equivalent.
1277 
1278  mqPtr = artdaq::StatisticsCollection::getInstance().
1279  getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1280  if (mqPtr.get() != 0)
1281  {
1282  oss << ", input wait time = "
1283  << (mqPtr->getRecentValueSum() / eventCount) << " sec";
1284  }
1285 
1286  mqPtr = artdaq::StatisticsCollection::getInstance().
1287  getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1288  if (mqPtr.get() != 0)
1289  {
1290  artdaq::MonitoredQuantityStats stats;
1291  mqPtr->getStats(stats);
1292  oss << ", avg::max event store wait time = "
1293  << (stats.recentValueSum / eventCount)
1294  << "::" << stats.recentValueMax
1295  << " sec";
1296  }
1297 
1298  mqPtr = artdaq::StatisticsCollection::getInstance().
1299  getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1300  if (mqPtr.get() != 0)
1301  {
1302  oss << ", shared memory copy time = "
1303  << (mqPtr->getRecentValueSum() / eventCount) << " sec";
1304  }
1305 
1306  mqPtr = artdaq::StatisticsCollection::getInstance().
1307  getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1308  if (mqPtr.get() != 0)
1309  {
1310  oss << ", file size test time = "
1311  << (mqPtr->getRecentValueSum() / eventCount) << " sec";
1312  }
1313 
1314  return oss.str();
1315 }
1316 
1317 void artdaq::AggregatorCore::sendMetrics_()
1318 {
1319  //TLOG_DEBUG("AggregatorCore") << "Sending metrics " << TLOG_ENDL;
1320  double eventCount = 1.0;
1321  artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1322  getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1323  if (mqPtr.get() != 0)
1324  {
1325  artdaq::MonitoredQuantityStats stats;
1326  mqPtr->getStats(stats);
1327  eventCount = std::max(double(stats.recentSampleCount), 1.0);
1328  metricMan_.sendMetric("Event Rate", stats.recentSampleRate, "events/sec", 1, MetricMode::Average);
1329  metricMan_.sendMetric("Average Event Size", (stats.recentValueAverage * sizeof(artdaq::RawDataType)), "bytes/event", 2, MetricMode::Average);
1330  metricMan_.sendMetric("Data Rate", (stats.recentValueRate * sizeof(artdaq::RawDataType)), "bytes/sec", 2, MetricMode::Average);
1331  }
1332 
1333  // 13-Jan-2015, KAB - Just a reminder that using "eventCount" in the
1334  // denominator of the calculations below is important so that the sum
1335  // of the different "average" times adds up to the overall average time
1336  // per event. In some (but not all) cases, using recentValueAverage()
1337  // would be equivalent.
1338 
1339  mqPtr = artdaq::StatisticsCollection::getInstance().
1340  getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1341  if (mqPtr.get() != 0)
1342  {
1343  metricMan_.sendMetric("Average Input Wait Time", (mqPtr->getRecentValueSum() / eventCount), "seconds/event", 3, MetricMode::Average);
1344  }
1345 
1346  mqPtr = artdaq::StatisticsCollection::getInstance().
1347  getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1348  if (mqPtr.get() != 0)
1349  {
1350  metricMan_.sendMetric("Avg art Queue Wait Time", (mqPtr->getRecentValueSum() / eventCount), "seconds/event", 3, MetricMode::Average);
1351  }
1352 
1353  mqPtr = artdaq::StatisticsCollection::getInstance().
1354  getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1355  if (mqPtr.get() != 0)
1356  {
1357  metricMan_.sendMetric("Avg Shared Memory Copy Time", (mqPtr->getRecentValueSum() / eventCount), "seconds/event", 4, MetricMode::Average);
1358  }
1359 
1360  mqPtr = artdaq::StatisticsCollection::getInstance().
1361  getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1362  if (mqPtr.get() != 0)
1363  {
1364  metricMan_.sendMetric("Average File Check Time", (mqPtr->getRecentValueSum() / eventCount), "seconds/event", 4, MetricMode::Average);
1365  }
1366 }
static const std::string INPUT_EVENTS_STAT_KEY
Key for the Input Events MonitoredQuantity.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
static const std::string SHM_COPY_TIME_STAT_KEY
Key for the Shared Memory Copy Time MonitoredQuantity.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured, otherwise will Round-Robin Fragments to the destinations.
bool reinitialize(fhicl::ParameterSet const &pset)
Reinitializes the AggregatorCore. No-Op.
bool stop()
Stops the AggregatorCore.
bool shutdown()
Shuts Down the AggregatorCore.
Keep track of the count of Fragments received from a set of sources.
Definition: FragCounter.hh:20
bool start(art::RunID id)
Start the AggregatorCore.
The Fragment was successfully inserted.
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::string unregister_monitor(std::string const &label)
Delete the TransferInterface having the given unique label.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
AggregatorCore(int rank, std::string name)
AggregatorCore Constructor.
static const std::string INPUT_WAIT_STAT_KEY
Key for the Input Wait MonitoredQuantity.
bool soft_initialize(fhicl::ParameterSet const &pset)
Soft-Initializes the AggregatorCore. No-Op.
bool resume()
Resumes the AggregatorCore.
std::string register_monitor(fhicl::ParameterSet const &pset)
Create a new TransferInterface instance using the given configuration.
static const std::string FILE_CHECK_TIME_STAT_KEY
Key for the File Check Time MonitoredQuantity.
void setSlot(size_t slot, size_t val)
Set the given slot to the given value.
Definition: FragCounter.hh:110
void display_bits(void *memstart, size_t nbytes, std::string sourcename)
Write out memory, for debugging purposes.
The EventStore class collects Fragment objects, until it receives a complete event, at which point the event is handed over to the art thread.
Definition: EventStore.hh:49
This TransferInterface is a Sender.
void incSlot(size_t slot)
Increment the given slot by one.
Definition: FragCounter.hh:93
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
std::string report(std::string const &which) const
Send a report on a given run-time quantity.
The EventStore is full, but the Fragment was accepted as it is for an already-open event...
bool initialize(fhicl::ParameterSet const &pset)
Processes the initialize request.
bool pause()
Pauses the AggregatorCore.
int( ART_CFGSTRING_FCN)(const std::string &)
An art function that accepts a fhicl::ParameterSet as a string.
Definition: EventStore.hh:62
The Fragment was rejected, because the RawEventQueue is full.
static const std::string STORE_EVENT_WAIT_STAT_KEY
Key for the EventStore Event Wait MonitoredQuantity.
size_t process_fragments()
The main working loop of the AggregatorCore. Receives events from DataReceiverManager and processes t...