1 #include "canvas/Utilities/Exception.h"
2 #include "art/Framework/Art/artapp.h"
4 #include "artdaq-core/Core/SimpleQueueReader.hh"
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
7 #include "artdaq/Application/EventBuilderCore.hh"
8 #include "artdaq/TransferPlugins/TransferInterface.hh"
9 #include "artdaq/DAQdata/Globals.hh"
10 #include "artdaq/DAQrate/EventStore.hh"
11 #define TRACE_NAME "EventBuilderCore"
21 , art_initialized_(false)
22 , stop_requested_(false)
23 , pause_requested_(false)
24 , run_is_paused_(false)
25 , processing_fragments_(false)
27 TLOG_DEBUG(name_) <<
"Constructor" << TLOG_ENDL;
32 metricMan = &metricMan_;
37 TLOG_DEBUG(name_) <<
"Destructor" << TLOG_ENDL;
// Sets up the EventStore for this EventBuilderCore from the supplied ParameterSet
// (per the trace messages below, one path also starts the art thread).
// NOTE(review): this listing omits several source lines -- the condition that
// selects between the two paths and the heads of both constructor calls are not
// visible -- so the comments here describe only what the visible lines show.
40 void artdaq::EventBuilderCore::initializeEventStore(fhicl::ParameterSet pset)
// First visible path: the construction is bracketed by level-36 TRACE messages,
// and the call receives init_string_ and a `reader` callable as its last arguments.
45 TRACE(36,
"Creating EventStore and Starting art thread");
47 init_string_, reader));
48 TRACE(36,
"Done Creating EventStore");
// Among the visible lines, art_initialized_ is set only on this path.
49 art_initialized_ =
true;
// Second visible path: a one-element array naming "SimpleQueueReader" is passed
// (with const cast away) preceded by the count 1 -- presumably an argc/argv pair
// for a command-line style reader function; confirm against the EventStore API.
53 const char* dummyArgs[1]{
"SimpleQueueReader" };
56 1, const_cast<char**>(dummyArgs), reader));
62 init_string_ = pset.to_string();
63 TLOG_DEBUG(name_) <<
"initialize method called with DAQ "
64 <<
"ParameterSet = \"" << init_string_ <<
"\"." << TLOG_ENDL;
67 fhicl::ParameterSet daq_pset;
70 daq_pset = pset.get<fhicl::ParameterSet>(
"daq");
75 <<
"Unable to find the DAQ parameters in the initialization "
76 <<
"ParameterSet: \"" + pset.to_string() +
"\"." << TLOG_ENDL;
79 fhicl::ParameterSet evb_pset;
82 evb_pset = daq_pset.get<fhicl::ParameterSet>(
"event_builder");
83 data_pset_ = evb_pset;
88 <<
"Unable to find the event_builder parameters in the DAQ "
89 <<
"initialization ParameterSet: \"" + daq_pset.to_string() +
"\"." << TLOG_ENDL;
94 expected_fragments_per_event_ =
95 evb_pset.get<
size_t>(
"expected_fragments_per_event");
100 <<
"The expected_fragments_per_event parameter was not specified "
101 <<
"in the event_builder initialization PSet: \"" << pset.to_string()
102 <<
"\"." << TLOG_ENDL;
107 try { use_art_ = evb_pset.get<
bool>(
"use_art"); }
111 <<
"The use_art parameter was not specified "
112 <<
"in the event_builder initialization PSet: \""
113 << evb_pset.to_string() <<
"\"." << TLOG_ENDL;
116 inrun_recv_timeout_usec_ = evb_pset.get<
size_t>(
"inrun_recv_timeout_usec", 100000);
117 endrun_recv_timeout_usec_ = evb_pset.get<
size_t>(
"endrun_recv_timeout_usec", 20000000);
118 pause_recv_timeout_usec_ = evb_pset.get<
size_t>(
"pause_recv_timeout_usec", 3000000);
119 verbose_ = evb_pset.get<
bool>(
"verbose",
false);
122 statsHelper_.createCollectors(evb_pset, 100, 20.0, 60.0, INPUT_FRAGMENTS_STAT_KEY);
125 std::string metricsReportingInstanceName =
"EventBuilder." +
126 boost::lexical_cast<std::string>(my_rank);
127 fhicl::ParameterSet metric_pset;
130 metric_pset = daq_pset.get<fhicl::ParameterSet>(
"metrics");
134 if (metric_pset.is_empty())
136 TLOG_INFO(name_) <<
"No metric plugins appear to be defined" << TLOG_ENDL;
140 metricMan_.initialize(metric_pset, metricsReportingInstanceName);
144 ExceptionHandler(ExceptionHandlerRethrow::no,
145 "Error loading metrics in EventBuilderCore::initialize()");
154 if (art_initialized_ ==
false)
156 this->initializeEventStore(evb_pset);
157 fhicl::ParameterSet tmp = pset;
159 previous_pset_ = tmp;
163 fhicl::ParameterSet tmp = pset;
165 if (tmp != previous_pset_)
168 <<
"The art configuration can not be altered after art "
169 <<
"has been configured." << TLOG_ENDL;
179 stop_requested_.store(
false);
180 pause_requested_.store(
false);
181 run_is_paused_.store(
false);
183 eod_fragments_received_ = 0;
184 fragment_count_in_run_ = 0;
185 statsHelper_.resetStatistics();
187 metricMan_.do_start();
188 event_store_ptr_->startRun(
id.run());
190 logMessage_(
"Started run " + boost::lexical_cast<std::string>(run_id_.run()));
196 logMessage_(
"Stopping run " + boost::lexical_cast<std::string>(run_id_.run()) +
197 ", subrun " + boost::lexical_cast<std::string>(event_store_ptr_->subrunID()));
205 stop_requested_.store(
true);
208 if (!run_is_paused_.load())
210 endSucceeded =
false;
212 endSucceeded = event_store_ptr_->endSubrun();
213 while (!endSucceeded && attemptsToEnd < 3)
216 TLOG_DEBUG(name_) <<
"Retrying EventStore::endSubrun()" << TLOG_ENDL;
217 endSucceeded = event_store_ptr_->endSubrun();
222 <<
"EventStore::endSubrun in stop method failed after three tries." << TLOG_ENDL;
226 endSucceeded =
false;
228 endSucceeded = event_store_ptr_->endRun();
229 while (!endSucceeded && attemptsToEnd < 3)
232 TLOG_DEBUG(name_) <<
"Retrying EventStore::endRun()" << TLOG_ENDL;
233 endSucceeded = event_store_ptr_->endRun();
238 <<
"EventStore::endRun in stop method failed after three tries." << TLOG_ENDL;
241 flush_mutex_.unlock();
242 run_is_paused_.store(
false);
248 logMessage_(
"Pausing run " + boost::lexical_cast<std::string>(run_id_.run()) +
249 ", subrun " + boost::lexical_cast<std::string>(event_store_ptr_->subrunID()));
250 pause_requested_.store(
true);
253 bool endSucceeded =
false;
254 int attemptsToEnd = 1;
255 endSucceeded = event_store_ptr_->endSubrun();
256 while (!endSucceeded && attemptsToEnd < 3)
259 TLOG_DEBUG(name_) <<
"Retrying EventStore::endSubrun()" << TLOG_ENDL;
260 endSucceeded = event_store_ptr_->endSubrun();
265 <<
"EventStore::endSubrun in pause method failed after three tries." << TLOG_ENDL;
268 flush_mutex_.unlock();
269 run_is_paused_.store(
true);
275 logMessage_(
"Resuming run " + boost::lexical_cast<std::string>(run_id_.run()));
276 eod_fragments_received_ = 0;
277 pause_requested_.store(
false);
279 metricMan_.do_start();
280 event_store_ptr_->startSubrun();
281 run_is_paused_.store(
false);
291 int readerReturnValue;
292 bool endSucceeded =
false;
293 int attemptsToEnd = 1;
294 TRACE(4,
"EventBuilderCore::shutdown: Calling EventStore::endOfData");
295 endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
296 while (!endSucceeded && attemptsToEnd < 3)
299 TRACE(4,
"EventBuilderCore::shutdown: Retrying endOfData call");
300 TLOG_DEBUG(name_) <<
"Retrying EventStore::endOfData()" << TLOG_ENDL;
301 endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
303 TRACE(4,
"EventBuilderCore::shutdown: Shutting down MetricManager");
304 metricMan_.shutdown();
305 TRACE(4,
"EventBuilderCore::shutdown: Complete");
311 TLOG_DEBUG(name_) <<
"soft_initialize method called with DAQ "
312 <<
"ParameterSet = \"" << pset.to_string()
313 <<
"\"." << TLOG_ENDL;
319 TLOG_DEBUG(name_) <<
"reinitialize method called with DAQ "
320 <<
"ParameterSet = \"" << pset.to_string()
321 <<
"\"." << TLOG_ENDL;
322 event_store_ptr_.reset(
nullptr);
323 art_initialized_ =
false;
324 return initialize(pset);
329 processing_fragments_.store(
true);
330 bool process_fragments =
true;
336 receiver_ptr_->start_threads();
340 TLOG_DEBUG(name_) <<
"Waiting for first fragment." << TLOG_ENDL;
341 artdaq::MonitoredQuantityStats::TIME_POINT_T startTime;
342 while (process_fragments)
344 size_t recvTimeout = inrun_recv_timeout_usec_;
345 if (stop_requested_.load()) { recvTimeout = endrun_recv_timeout_usec_; }
346 else if (pause_requested_.load()) { recvTimeout = pause_recv_timeout_usec_; }
347 startTime = artdaq::MonitoredQuantity::getCurrentTime();
348 artdaq::FragmentPtr pfragment = receiver_ptr_->recvFragment(senderSlot, recvTimeout);
349 statsHelper_.addSample(INPUT_WAIT_STAT_KEY,
350 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
353 if (stop_requested_.load() &&
354 recvTimeout == endrun_recv_timeout_usec_)
357 <<
"Timeout occurred in attempt to receive data, but as a stop has been requested, will forcibly end the run." << TLOG_ENDL;
358 event_store_ptr_->flushData();
359 flush_mutex_.unlock();
360 process_fragments =
false;
362 else if (pause_requested_.load() &&
363 recvTimeout == pause_recv_timeout_usec_)
366 <<
"Timeout occurred in attempt to receive data, but as a pause has been requested, will forcibly pause the run." << TLOG_ENDL;
367 event_store_ptr_->flushData();
368 flush_mutex_.unlock();
369 process_fragments =
false;
375 TLOG_ERROR(name_) <<
"Received invalid fragment from " << senderSlot <<
". This is usually the case when a timeout has occurred, but sender was not set to RECV_TIMEOUT as expected." << TLOG_ENDL;
378 if (!receiver_ptr_->enabled_sources().count(senderSlot))
381 <<
"Invalid senderSlot received from recvFragment: "
382 << senderSlot << TLOG_ENDL;
385 fragments_received.
incSlot(senderSlot);
386 if (artdaq::Fragment::isSystemFragmentType(pfragment->type()))
389 <<
"Sender slot = " << senderSlot
390 <<
", fragment type = " << ((int)pfragment->type())
391 <<
", sequence ID = " << pfragment->sequenceID() << TLOG_ENDL;
394 ++fragment_count_in_run_;
395 TRACE(18,
"process_fragments %lu=fragment_count_in_run_ %lu=pfragment->size()"
396 , fragment_count_in_run_, pfragment->size());
397 statsHelper_.addSample(INPUT_FRAGMENTS_STAT_KEY, pfragment->size());
398 if (statsHelper_.readyToReport(fragment_count_in_run_))
400 std::string statString = buildStatisticsString_();
401 logMessage_(statString);
402 logMessage_(
"Received fragment " +
403 boost::lexical_cast<std::string>(fragment_count_in_run_) +
404 " with sequence ID " +
405 boost::lexical_cast<std::string>(pfragment->sequenceID()) +
407 boost::lexical_cast<std::string>(run_id_.run()) +
409 boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
412 if (statsHelper_.statsRollingWindowHasMoved())
415 event_store_ptr_->sendMetrics();
418 startTime = artdaq::MonitoredQuantity::getCurrentTime();
419 if (pfragment->type() != artdaq::Fragment::EndOfDataFragmentType)
421 artdaq::FragmentPtr rejectedFragment;
422 auto seqId = pfragment->sequenceID();
423 auto fragId = pfragment->fragmentID();
424 bool try_again =
true;
427 auto ret = event_store_ptr_->insert(std::move(pfragment), rejectedFragment);
430 receiver_ptr_->unsuppressAll();
437 else if (stop_requested_.load())
440 flush_mutex_.unlock();
441 process_fragments =
false;
442 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
444 <<
"Unable to process fragment " << fragId
445 <<
" in event " << seqId
446 <<
" because of back-pressure - forcibly ending the run." << TLOG_ENDL;
448 else if (pause_requested_.load())
451 flush_mutex_.unlock();
452 process_fragments =
false;
453 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
455 <<
"Unable to process fragment " << fragId
456 <<
" in event " << seqId
457 <<
" because of back-pressure - forcibly pausing the run." << TLOG_ENDL;
461 pfragment = std::move(rejectedFragment);
463 <<
"Unable to process fragment " << fragId
464 <<
" in event " << seqId
465 <<
" because of back-pressure from art - retrying..." << TLOG_ENDL;
470 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
472 <<
"Unable to process fragment " << fragId
473 <<
" in event " << seqId
474 <<
" because the EventStore has reached the maximum number of incomplete events." << std::endl
475 <<
" Will retry when the EventStore is ready for new events." << TLOG_ENDL;
481 eod_fragments_received_++;
482 event_store_ptr_->setRequestMode(detail::RequestMessageMode::EndOfRun);
486 fragments_sent.
setSlot(senderSlot, *pfragment->dataBegin() + 1);
488 statsHelper_.addSample(STORE_EVENT_WAIT_STAT_KEY,
489 artdaq::MonitoredQuantity::getCurrentTime() - startTime);
495 if (eod_fragments_received_ == receiver_ptr_->enabled_sources().size())
497 bool fragmentsOutstanding =
false;
498 for (
auto& i : receiver_ptr_->enabled_sources())
500 if (fragments_received[i] != fragments_sent[i])
502 fragmentsOutstanding =
true;
507 if (!fragmentsOutstanding)
509 event_store_ptr_->flushData();
510 flush_mutex_.unlock();
511 process_fragments =
false;
515 TLOG_WARNING(name_) <<
"All EndOfData fragments received but more data expected" << TLOG_ENDL;
523 metricMan_.do_stop();
525 receiver_ptr_.reset(
nullptr);
526 processing_fragments_.store(
false);
532 if (which ==
"incomplete_event_count")
534 if (event_store_ptr_ !=
nullptr)
536 return boost::lexical_cast<std::string>(event_store_ptr_->incompleteEventCount());
543 if (which ==
"event_count")
545 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
546 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
547 if (mqPtr.get() != 0)
549 return boost::lexical_cast<std::string>(mqPtr->getFullSampleCount());
557 if (which ==
"run_duration")
561 if (processing_fragments_.load())
563 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
564 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
565 if (mqPtr.get() != 0)
567 duration = mqPtr->getFullDuration();
570 std::ostringstream oss;
571 oss << std::fixed << std::setprecision(1) << duration;
580 std::string tmpString = name_ +
" run number = ";
581 tmpString.append(boost::lexical_cast<std::string>(run_id_.run()));
582 tmpString.append(
". Command \"" + which +
"\" is not currently supported.");
// Formats the current input-fragment, input-wait and event-store-wait statistics
// into a text report accumulated in an ostringstream.
// Returns: std::string (the return statement itself is on a line missing from this
// listing; presumably oss.str() -- confirm).
// NOTE(review): listing lines 605-606, 608 and 616-624 are not visible here.
587 std::string artdaq::EventBuilderCore::buildStatisticsString_()
589 std::ostringstream oss;
// Divisor for the per-fragment averages below; starts at 1.0 so the divisions are
// safe even when the input-fragment quantity is unavailable.
590 double eventCount = 1.0;
591 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
592 getMonitoredQuantity(INPUT_FRAGMENTS_STAT_KEY);
// Each section is emitted only if its MonitoredQuantity is registered (non-null).
593 if (mqPtr.get() != 0)
596 artdaq::MonitoredQuantityStats stats;
597 mqPtr->getStats(stats);
// Sample values are multiplied by sizeof(RawDataType) before being scaled to MB --
// presumably because fragment sizes are recorded in RawDataType words; confirm.
598 oss <<
"Input statistics: "
599 << stats.recentSampleCount <<
" fragments received at "
600 << stats.recentSampleRate <<
" fragments/sec, data rate = "
601 << (stats.recentValueRate *
sizeof(artdaq::RawDataType)
602 / 1024.0 / 1024.0) <<
" MB/sec, monitor window = "
603 << stats.recentDuration <<
" sec, min::max fragment size = "
604 << (stats.recentValueMin *
sizeof(artdaq::RawDataType)
607 << (stats.recentValueMax *
sizeof(artdaq::RawDataType)
609 <<
" MB" << std::endl;
// Clamp to at least 1.0 so the averages below never divide by zero.
610 eventCount = std::max(
double(stats.recentSampleCount), 1.0);
611 oss <<
"Average times per fragment: ";
// Guard the reciprocal: elapsed time per fragment is printed only for a positive rate.
612 if (stats.recentSampleRate > 0.0)
614 oss <<
" elapsed time = "
615 << (1.0 / stats.recentSampleRate) <<
" sec";
// Average time spent waiting for input, per fragment in the recent window.
625 mqPtr = artdaq::StatisticsCollection::getInstance().
626 getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
627 if (mqPtr.get() != 0)
629 oss <<
", input wait time = "
630 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
// Average time spent handing fragments to the event store, per fragment.
633 mqPtr = artdaq::StatisticsCollection::getInstance().
634 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
635 if (mqPtr.get() != 0)
637 oss <<
", event store wait time = "
638 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
// Publishes the current fragment and wait-time statistics through metricMan_.
// NOTE(review): listing lines 657 and 665-673 are not visible here, so the tails of
// the "Fragment Count" and "Data Rate" calls (and anything between them and the
// input-wait section) cannot be described.
// The final integer passed to each sendMetric call (1, 2, 3) is presumably a
// reporting level -- confirm against the MetricManager API.
644 void artdaq::EventBuilderCore::sendMetrics_()
// Divisor for the per-fragment wait-time averages; starts at 1.0 so the divisions
// are safe even when the input-fragment quantity is unavailable.
647 double fragmentCount = 1.0;
648 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
649 getMonitoredQuantity(INPUT_FRAGMENTS_STAT_KEY);
// Metrics for a quantity are sent only if that MonitoredQuantity is registered.
650 if (mqPtr.get() != 0)
652 artdaq::MonitoredQuantityStats stats;
653 mqPtr->getStats(stats);
// Clamp to at least 1.0 so the averages below never divide by zero.
654 fragmentCount = std::max(
double(stats.recentSampleCount), 1.0);
// Count uses the full-run sample count; the rate and size metrics use the recent window.
655 metricMan_.sendMetric(
"Fragment Count",
656 static_cast<unsigned long>(stats.fullSampleCount),
658 metricMan_.sendMetric(
"Fragment Rate",
659 stats.recentSampleRate,
"fragments/sec", 1);
// Values are multiplied by sizeof(RawDataType) to report bytes -- presumably
// because fragment sizes are recorded in RawDataType words; confirm.
660 metricMan_.sendMetric(
"Average Fragment Size",
661 (stats.recentValueAverage *
sizeof(artdaq::RawDataType)
662 ),
"bytes/fragment", 2);
663 metricMan_.sendMetric(
"Data Rate",
664 (stats.recentValueRate *
sizeof(artdaq::RawDataType)
// Recent input-wait time averaged over the recent fragment count.
674 mqPtr = artdaq::StatisticsCollection::getInstance().
675 getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
676 if (mqPtr.get() != 0)
678 metricMan_.sendMetric(
"Average Input Wait Time",
679 (mqPtr->getRecentValueSum() / fragmentCount),
680 "seconds/fragment", 3);
// Recent event-store wait time averaged over the recent fragment count.
683 mqPtr = artdaq::StatisticsCollection::getInstance().
684 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
685 if (mqPtr.get() != 0)
687 metricMan_.sendMetric(
"Avg Event Store Wait Time",
688 (mqPtr->getRecentValueSum() / fragmentCount),
689 "seconds/fragment", 3);
// Writes `text` to the log under this object's name_, at either INFO or DEBUG level.
// NOTE(review): the condition that chooses between the two statements is on lines
// missing from this listing -- presumably the verbose_ flag read in initialize();
// confirm against the full source.
693 void artdaq::EventBuilderCore::logMessage_(std::string
const& text)
// Visible INFO-level variant.
697 TLOG_INFO(name_) << text << TLOG_ENDL;
// Visible DEBUG-level variant.
701 TLOG_DEBUG(name_) << text << TLOG_ENDL;
std::string report(std::string const &which) const
Send a report on a given run-time quantity.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
static const std::string INPUT_FRAGMENTS_STAT_KEY
Key for the Input Fragments MonitoredQuantity.
bool resume()
Resumes the EventBuilderCore.
Keep track of the count of Fragments received from a set of sources.
The Fragment was successfully inserted.
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
bool initialize(fhicl::ParameterSet const &pset)
Processes the initialize request.
static const std::string STORE_EVENT_WAIT_STAT_KEY
Key for the Store Event Wait MonitoredQuantity.
static const std::string INPUT_WAIT_STAT_KEY
Key for the Input Wait MonitoredQuantity.
void setSlot(size_t slot, size_t val)
Set the given slot to the given value.
EventBuilderCore(int rank, std::string name)
EventBuilderCore Constructor.
size_t process_fragments()
The main loop of the EventBuilderCore. Receives Fragment objects from DataReceiverManager and enqueue...
The EventStore class collects Fragment objects, until it receives a complete event, at which point the event is handed over to the art thread.
void incSlot(size_t slot)
Increment the given slot by one.
int( ART_CMDLINE_FCN)(int, char **)
An art function that accepts standard C main arguments.
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
The EventStore is full, but the Fragment was accepted as it is for an already-open event...
bool start(art::RunID id)
Start the EventBuilderCore.
bool stop()
Stops the EventBuilderCore.
bool pause()
Pauses the EventBuilderCore.
int( ART_CFGSTRING_FCN)(const std::string &)
An art function that accepts a fhicl::ParameterSet as a string.
bool reinitialize(fhicl::ParameterSet const &pset)
Reinitializes the EventBuilderCore.
bool shutdown()
Shuts Down the EventBuilderCore.
bool soft_initialize(fhicl::ParameterSet const &pset)
Soft-Initializes the EventBuilderCore. No-Op.
The Fragment was rejected, because the RawEventQueue is full.