1 #pragma GCC diagnostic push
2 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3 #include "xmlrpc-c/client_simple.hpp"
4 #pragma GCC diagnostic pop
11 #include <boost/tokenizer.hpp>
12 #include <boost/filesystem.hpp>
13 #include <boost/algorithm/string.hpp>
14 #include "art/Framework/Art/artapp.h"
15 #include "cetlib/BasicPluginFactory.h"
17 #include "artdaq-core/Core/SimpleQueueReader.hh"
18 #include "artdaq-core/Utilities/ExceptionHandler.hh"
19 #include "artdaq-core/Data/RawEvent.hh"
21 #include "artdaq/Application/AggregatorCore.hh"
22 #include "artdaq/DAQrate/EventStore.hh"
23 #include "artdaq/DAQrate/detail/FragCounter.hh"
24 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
27 namespace BFS = boost::filesystem;
/// Write the bit pattern of a chunk of memory to the debug log.
///
/// @param memstart   Address of the first byte to display.
/// @param nbytes     Number of bytes to display, starting at memstart.
/// @param sourcename Passed to TLOG_DEBUG as the label of the emitted message.
43 void display_bits(
void* memstart,
size_t nbytes, std::string sourcename)
45 std::stringstream bitstr;
// Header text: the chunk size followed by its starting address.
46 bitstr <<
"The " << nbytes <<
"-byte chunk of memory beginning at " <<
static_cast<void*
>(memstart) <<
" is : ";
// Walk the chunk one byte at a time.
// NOTE(review): listing lines 49-54 are not visible here; the loop body may do more than the
// single append shown below -- confirm against the full source.
48 for (
unsigned int i = 0; i < nbytes; i++)
// Append byte i as an 8-character std::bitset rendering, followed by a space.
55 bitstr << std::bitset<8>(*((
reinterpret_cast<uint8_t*
>(memstart)) + i)) <<
" ";
// Emit the accumulated text as a single debug message.
58 TLOG_DEBUG(sourcename) << bitstr.str() << TLOG_ENDL;
65 , art_initialized_(false)
66 , event_queue_(artdaq::getGlobalQueue(10))
67 , stop_requested_(false)
68 , local_pause_requested_(false)
69 , processing_fragments_(false)
70 , system_pause_requested_(false)
71 , previous_run_duration_(-1.0)
74 TLOG_DEBUG(name_) <<
"Constructor" << TLOG_ENDL;
80 metricMan = &metricMan_;
86 TLOG_DEBUG(name_) <<
"Destructor" << TLOG_ENDL;
91 init_string_ = pset.to_string();
92 TLOG_DEBUG(name_) <<
"initialize method called with DAQ " <<
"ParameterSet = \"" << init_string_ <<
"\"." << TLOG_ENDL;
95 fhicl::ParameterSet daq_pset;
98 daq_pset = pset.get<fhicl::ParameterSet>(
"daq");
103 <<
"Unable to find the DAQ parameters in the initialization "
104 <<
"ParameterSet: \"" + pset.to_string() +
"\"." << TLOG_ENDL;
107 fhicl::ParameterSet agg_pset;
110 agg_pset = daq_pset.get<fhicl::ParameterSet>(
"aggregator");
111 data_pset_ = agg_pset;
116 <<
"Unable to find the aggregator parameters in the DAQ "
117 <<
"initialization ParameterSet: \"" + daq_pset.to_string() +
"\"." << TLOG_ENDL;
122 expected_events_per_bunch_ =
123 agg_pset.get<
size_t>(
"expected_events_per_bunch");
128 <<
"The expected_events_per_bunch parameter was not specified "
129 <<
"in the aggregator initialization PSet: \"" << pset.to_string()
130 <<
"\"." << TLOG_ENDL;
134 enq_timeout_ =
static_cast<detail::seconds
>(agg_pset.get<
size_t>(
"enq_timeout", 5.0));
140 is_data_logger_ =
false;
141 is_online_monitor_ =
false;
142 is_dispatcher_ =
false;
143 std::string metricsReportingInstanceName =
"Data Logger";
144 bool agtype_was_specified =
false;
145 if (!agtype_was_specified)
149 is_data_logger_ = agg_pset.get<
bool>(
"is_data_logger");
150 agtype_was_specified =
true;
154 if (!agtype_was_specified)
158 is_online_monitor_ = agg_pset.get<
bool>(
"is_online_monitor");
159 metricsReportingInstanceName =
"Online Monitor";
160 agtype_was_specified =
true;
164 if (!agtype_was_specified)
168 is_dispatcher_ = agg_pset.get<
bool>(
"is_dispatcher");
169 metricsReportingInstanceName =
"Dispatcher";
170 agtype_was_specified =
true;
175 if (!agtype_was_specified)
177 throw cet::exception(
"ConfigurationException",
"You must specify one of is_data_logger, is_online_monitor or is_dispatcher");
180 TLOG_DEBUG(name_) <<
"Rank " << my_rank
181 <<
", is_data_logger = " << is_data_logger_
182 <<
", is_online_monitor = " << is_online_monitor_
183 <<
", is_dispatcher = " << is_dispatcher_ << TLOG_ENDL;
185 disk_writing_directory_ =
"";
188 fhicl::ParameterSet output_pset =
189 pset.get<fhicl::ParameterSet>(
"outputs");
190 fhicl::ParameterSet normalout_pset =
191 output_pset.get<fhicl::ParameterSet>(
"normalOutput");
193 if (!normalout_pset.is_empty())
195 std::string filename = normalout_pset.get<std::string>(
"fileName",
"");
196 if (filename.size() > 0)
198 size_t pos = filename.rfind(
"/");
199 if (pos != std::string::npos)
201 disk_writing_directory_ = filename.substr(0, pos);
206 TLOG_WARNING(name_) <<
"Problem finding \"fileName\" parameter in \"normalOutput\" RootOutput module FHiCL code" << TLOG_ENDL;
212 std::string xmlrpcClientString =
213 agg_pset.get<std::string>(
"xmlrpc_client_list",
"");
214 if (xmlrpcClientString.size() > 0)
216 xmlrpc_client_lists_.clear();
217 boost::char_separator<char> sep1(
";");
218 boost::tokenizer<boost::char_separator<char>>
219 primaryTokens(xmlrpcClientString, sep1);
220 boost::tokenizer<boost::char_separator<char>>::iterator iter1;
221 boost::tokenizer<boost::char_separator<char>>::iterator
222 endIter1 = primaryTokens.end();
223 for (iter1 = primaryTokens.begin(); iter1 != endIter1; ++iter1)
225 boost::char_separator<char> sep2(
",");
226 boost::tokenizer<boost::char_separator<char>>
227 secondaryTokens(*iter1, sep2);
228 boost::tokenizer<boost::char_separator<char>>::iterator iter2;
229 boost::tokenizer<boost::char_separator<char>>::iterator
230 endIter2 = secondaryTokens.end();
231 int clientGroup = -1;
232 std::string url =
"";
234 for (iter2 = secondaryTokens.begin(); iter2 != endIter2; ++iter2)
244 clientGroup = boost::lexical_cast<
int>(*iter2);
250 <<
"Unexpected XMLRPC client list element, index = "
251 << loopCount <<
", value = \"" << *iter2 <<
"\"" << TLOG_ENDL;
255 if (clientGroup >= 0 && url.size() > 0)
257 int elementsNeeded = clientGroup + 1 - ((int)xmlrpc_client_lists_.size());
258 for (
int idx = 0; idx < elementsNeeded; ++idx)
260 std::vector<std::string> tmpVec;
261 xmlrpc_client_lists_.push_back(tmpVec);
263 xmlrpc_client_lists_[clientGroup].push_back(url);
267 double fileSizeMB = agg_pset.get<
double>(
"subrun_size_MB", 0);
268 file_close_threshold_bytes_ = ((size_t)fileSizeMB * 1024.0 * 1024.0);
269 file_close_timeout_secs_ = agg_pset.get<time_t>(
"subrun_duration", 0);
270 file_close_event_count_ = agg_pset.get<
size_t>(
"subrun_event_count", 0);
272 inrun_recv_timeout_usec_ = agg_pset.get<
size_t>(
"inrun_recv_timeout_usec", 100000);
273 endrun_recv_timeout_usec_ = agg_pset.get<
size_t>(
"endrun_recv_timeout_usec", 20000000);
274 pause_recv_timeout_usec_ = agg_pset.get<
size_t>(
"pause_recv_timeout_usec", 3000000);
276 onmon_event_prescale_ = agg_pset.get<
size_t>(
"onmon_event_prescale", 1);
278 filesize_check_interval_seconds_ = agg_pset.get<int32_t>(
"filesize_check_interval_seconds", 20);
279 filesize_check_interval_events_ = agg_pset.get<int32_t>(
"filesize_check_interval_events", 20);
282 stats_helper_.createCollectors(agg_pset, 50, 20.0, 60.0, INPUT_EVENTS_STAT_KEY);
285 fhicl::ParameterSet metric_pset;
289 metric_pset = daq_pset.get<fhicl::ParameterSet>(
"metrics");
293 if (metric_pset.is_empty())
295 TLOG_INFO(name_) <<
"No metric plugins appear to be defined" << TLOG_ENDL;
299 metricMan_.initialize(metric_pset, metricsReportingInstanceName);
303 ExceptionHandler(ExceptionHandlerRethrow::no,
304 "Error loading metrics in AggregatorCore::initialize()");
307 if (event_store_ptr_ ==
nullptr)
310 size_t desired_events_per_bunch = expected_events_per_bunch_;
311 if (is_online_monitor_ || is_dispatcher_)
313 desired_events_per_bunch = 1;
315 TRACE(36,
"Creating EventStore and Starting art thread");
317 init_string_, reader));
318 TRACE(36,
"Done Creating EventStore");
319 event_store_ptr_->setSeqIDModulus(desired_events_per_bunch);
320 fhicl::ParameterSet tmp = pset;
322 previous_pset_ = tmp;
326 fhicl::ParameterSet tmp = pset;
328 if (tmp != previous_pset_)
331 <<
"The art configuration can not be altered after art "
332 <<
"has been configured." << TLOG_ENDL;
342 event_count_in_run_ = 0;
343 event_count_in_subrun_ = 0;
344 subrun_start_time_ = time(0);
345 stats_helper_.resetStatistics();
346 previous_run_duration_ = -1.0;
348 stop_requested_.store(
false);
349 local_pause_requested_.store(
false);
351 metricMan_.do_start();
352 event_store_ptr_->startRun(run_id_.run());
354 logMessage_(
"Started run " + boost::lexical_cast<std::string>(run_id_.run()));
360 logMessage_(
"Stopping run " + boost::lexical_cast<std::string>(run_id_.run()) +
361 ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
362 " events received so far.");
367 stop_requested_.store(
true);
373 logMessage_(
"Pausing run " + boost::lexical_cast<std::string>(run_id_.run()) +
374 ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
375 " events received so far.");
380 local_pause_requested_.store(
true);
386 event_count_in_subrun_ = 0;
387 subrun_start_time_ = time(0);
388 local_pause_requested_.store(
false);
390 logMessage_(
"Resuming run " + boost::lexical_cast<std::string>(run_id_.run()));
391 metricMan_.do_start();
392 event_store_ptr_->startSubrun();
398 int readerReturnValue;
399 bool endSucceeded =
false;
400 int attemptsToEnd = 1;
401 endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
402 while (!endSucceeded && attemptsToEnd < 3)
405 TLOG_DEBUG(name_) <<
"Retrying EventStore::endOfData()" << TLOG_ENDL;
406 endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
408 metricMan_.shutdown();
415 TLOG_DEBUG(name_) <<
"soft_initialize method called with DAQ "
416 <<
"ParameterSet = \"" << pset.to_string()
417 <<
"\"." << TLOG_ENDL;
423 TLOG_DEBUG(name_) <<
"reinitialize method called with DAQ "
424 <<
"ParameterSet = \"" << pset.to_string()
425 <<
"\"." << TLOG_ENDL;
431 processing_fragments_.store(
true);
433 size_t eodFragmentsReceived = 0;
434 bool process_fragments =
true;
438 artdaq::FragmentPtr endSubRunMsg(
nullptr);
439 time_t last_filesize_check_time = subrun_start_time_;
445 receiver_ptr_->start_threads();
448 if (is_data_logger_ && data_pset_.has_key(
"destinations"))
452 if (sender_ptr_->destinationCount() == 0) {
453 sender_ptr_.reset(
nullptr);
457 TLOG_DEBUG(name_) <<
"Waiting for first fragment." << TLOG_ENDL;
459 artdaq::MonitoredQuantityStats::TIME_POINT_T startTime;
460 while (process_fragments)
462 artdaq::FragmentPtr fragmentPtr(
new artdaq::Fragment);
464 size_t recvTimeout = inrun_recv_timeout_usec_;
465 if (stop_requested_.load()) { recvTimeout = endrun_recv_timeout_usec_; }
466 else if (local_pause_requested_.load()) { recvTimeout = pause_recv_timeout_usec_; }
468 startTime = artdaq::MonitoredQuantity::getCurrentTime();
471 fragmentPtr = receiver_ptr_->recvFragment(senderSlot, recvTimeout);
473 stats_helper_.addSample(INPUT_WAIT_STAT_KEY,
474 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
504 if (stop_requested_.load() &&
505 recvTimeout == endrun_recv_timeout_usec_)
507 if (endSubRunMsg !=
nullptr)
510 <<
"Timeout occurred in attempt to receive data, but as a stop has been requested, will forcibly end the run." << TLOG_ENDL;
511 event_store_ptr_->flushData();
512 artdaq::RawEvent_ptr subRunEvent(
new artdaq::RawEvent(run_id_.run(), 1, 0));
513 subRunEvent->insertFragment(std::move(endSubRunMsg));
515 bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
518 TLOG_ERROR(name_) <<
"Attempt to send EndOfSubRun fragment to art timed out after " <<
519 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
524 if (event_count_in_subrun_ > 0)
527 <<
"Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
531 std::string msg(
"Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art;");
532 msg.append(
"DAQ may need to be returned to the \"Stopped\" state before further datataking");
536 process_fragments =
false;
538 else if (local_pause_requested_.load() &&
539 recvTimeout == pause_recv_timeout_usec_)
541 if (endSubRunMsg !=
nullptr)
544 <<
"Timeout occurred in attempt to receive data, but as a pause has been requested, will forcibly pause the run." << TLOG_ENDL;
545 event_store_ptr_->flushData();
546 artdaq::RawEvent_ptr subRunEvent(
new artdaq::RawEvent(run_id_.run(), 1, 0));
547 subRunEvent->insertFragment(std::move(endSubRunMsg));
549 bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
552 TLOG_ERROR(name_) <<
"Attempt to send EndOfSubRun fragment to art timed out after " <<
553 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
559 "Timeout receiving data after pause request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
561 process_fragments =
false;
566 else if (!fragmentPtr)
568 TLOG_ERROR(name_) <<
"Received invalid fragment from " << senderSlot <<
". This is usually the case when a timeout has occurred, but sender was not set to RECV_TIMEOUT as expected." << TLOG_ENDL;
571 if (!receiver_ptr_->enabled_sources().count(senderSlot))
574 <<
"Invalid senderSlot received from recvFragment: "
575 << senderSlot << TLOG_ENDL;
578 fragments_received.
incSlot(senderSlot);
579 if (artdaq::Fragment::isSystemFragmentType(fragmentPtr->type()) &&
580 fragmentPtr->type() != artdaq::Fragment::DataFragmentType)
583 <<
"Sender slot = " << senderSlot
584 <<
", fragment type = " <<
static_cast<int>(fragmentPtr->type())
585 <<
", sequence ID = " << fragmentPtr->sequenceID() << TLOG_ENDL;
589 if (fragmentPtr->type() == artdaq::Fragment::InvalidFragmentType)
591 size_t fragSize = fragmentPtr->size() *
sizeof(artdaq::RawDataType);
592 TLOG_ERROR(name_) <<
"Fragment received with type of "
593 <<
"INVALID. Size = " << fragSize
594 <<
", sequence ID = " << fragmentPtr->sequenceID()
595 <<
", fragment ID = " << fragmentPtr->fragmentID()
596 <<
", and type = " <<
static_cast<int>(fragmentPtr->type()) << TLOG_ENDL;
600 if (artdaq::Fragment::isUserFragmentType(fragmentPtr->type()) ||
601 fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
603 ++event_count_in_run_;
604 ++event_count_in_subrun_;
605 if (event_count_in_run_ == 1)
607 logMessage_(
"Received event " +
608 boost::lexical_cast<std::string>(event_count_in_run_) +
609 " with sequence id " +
610 boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
613 stats_helper_.addSample(INPUT_EVENTS_STAT_KEY, fragmentPtr->size());
614 if (stats_helper_.readyToReport(event_count_in_run_))
616 std::string statString = buildStatisticsString_();
617 logMessage_(statString);
618 logMessage_(
"Received event " +
619 boost::lexical_cast<std::string>(event_count_in_run_) +
620 " with sequence id " +
621 boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
623 boost::lexical_cast<std::string>(run_id_.run()) +
625 boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
629 if (stats_helper_.statsRollingWindowHasMoved()) { sendMetrics_(); }
631 startTime = artdaq::MonitoredQuantity::getCurrentTime();
633 if (is_data_logger_ && fragmentPtr->type() == artdaq::Fragment::DataFragmentType
634 && (event_count_in_run_ % onmon_event_prescale_) == 0 && sender_ptr_)
638 auto fragCopy = *fragmentPtr;
639 sender_ptr_->sendFragment(std::move(fragCopy));
643 ExceptionHandler(ExceptionHandlerRethrow::no,
644 "Exception thrown during data logger copy of event to dispatcher");
647 else if (is_dispatcher_)
649 if (fragmentPtr->type() != artdaq::Fragment::EndOfDataFragmentType)
651 if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
653 init_fragment_ptr_ = std::make_unique<artdaq::Fragment>(*fragmentPtr);
656 std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
658 if (new_transfers_ == 0)
661 if (dispatcher_transfers_.size() > 0 && fragmentPtr->sequenceID() % 100 == 0)
663 TLOG_DEBUG(name_) <<
"Dispatcher: broadcasting seqID = " << fragmentPtr->sequenceID() <<
", type = " <<
664 static_cast<size_t>(fragmentPtr->type()) <<
" to " << dispatcher_transfers_.size()
665 <<
" registered monitors" << TLOG_ENDL;
667 for (
auto& transfer : dispatcher_transfers_)
669 transfer->copyFragment(*fragmentPtr, 0);
674 for (
size_t i_q = dispatcher_transfers_.size() - new_transfers_; i_q < dispatcher_transfers_.size(); ++i_q)
676 TLOG_INFO(name_) <<
"Copying out init fragment, type " <<
static_cast<int>(init_fragment_ptr_->type()) <<
677 ", size " << init_fragment_ptr_->sizeBytes() << TLOG_ENDL;
678 dispatcher_transfers_[i_q]->copyFragment(*init_fragment_ptr_, 500000);
685 stats_helper_.addSample(SHM_COPY_TIME_STAT_KEY,
686 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
690 artdaq::Fragment::sequence_id_t seq = fragmentPtr->sequenceID();
691 TRACE(21,
"%s::process_fragments seq=%lu isLogger=%d type=%d"
692 , name_.c_str(), seq, is_data_logger_, fragmentPtr->type());
693 startTime = artdaq::MonitoredQuantity::getCurrentTime();
694 if (!art_initialized_)
698 if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
700 TLOG_DEBUG(name_) <<
"Init" << TLOG_ENDL;
702 if (is_data_logger_ && sender_ptr_)
704 auto fragCopy = *fragmentPtr;
705 sender_ptr_->sendFragment(std::move(fragCopy));
709 artdaq::RawEvent_ptr initEvent(
new artdaq::RawEvent(run_id_.run(), 1, fragmentPtr->sequenceID()));
710 initEvent->insertFragment(std::move(fragmentPtr));
712 bool enqStatus = event_queue_.enqTimedWait(initEvent, enq_timeout_);
716 TLOG_ERROR(name_) <<
"Attempt to send Init event to art timed out after " <<
717 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
719 art_initialized_ =
true;
723 TLOG_ERROR(name_) <<
"Didn't receive an Init event with which to initialize art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
730 if (fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
734 artdaq::FragmentPtr rejectedFragment;
735 auto seqId = fragmentPtr->sequenceID();
736 bool try_again =
true;
739 auto ret = event_store_ptr_->insert(std::move(fragmentPtr), rejectedFragment);
742 receiver_ptr_->unsuppressAll();
749 else if (stop_requested_.load())
752 process_fragments =
false;
753 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
755 <<
"Unable to process event " << seqId
756 <<
" because of back-pressure - forcibly ending the run." << TLOG_ENDL;
758 else if (local_pause_requested_.load())
761 process_fragments =
false;
762 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
764 <<
"Unable to process event " << seqId
765 <<
" because of back-pressure - forcibly pausing the run." << TLOG_ENDL;
769 fragmentPtr = std::move(rejectedFragment);
771 <<
"Unable to process event " << seqId
772 <<
" because of back-pressure from art - retrying..." << TLOG_ENDL;
777 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
779 <<
"Unable to process event " << seqId
780 <<
" because the EventStore has reached the maximum number of incomplete bunches." << std::endl
781 <<
" Will retry when the EventStore is ready for new events." << TLOG_ENDL;
787 event_store_ptr_->insert(std::move(fragmentPtr),
false);
790 else if (fragmentPtr->type() == artdaq::Fragment::EndOfSubrunFragmentType)
792 if (is_data_logger_ && sender_ptr_)
794 auto fragCopy = *fragmentPtr;
795 sender_ptr_->sendFragment(std::move(fragCopy));
797 else if (is_dispatcher_)
799 for (
auto& transfer : dispatcher_transfers_)
801 transfer->copyFragment(*fragmentPtr, 0);
809 endSubRunMsg = std::move(fragmentPtr);
811 else if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
813 eodFragmentsReceived++;
817 fragments_sent.
setSlot(senderSlot, *fragmentPtr->dataBegin() + 1);
820 float delta = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
821 stats_helper_.addSample(STORE_EVENT_WAIT_STAT_KEY, delta);
822 TRACE((delta > 3.0) ? 0 : 22,
"%s::process_fragments seq=%lu isLogger=%d delta=%f start=%f"
823 , name_.c_str(), seq, is_data_logger_, delta, startTime);
826 startTime = artdaq::MonitoredQuantity::getCurrentTime();
827 if (is_data_logger_ && disk_writing_directory_.size() > 0 &&
828 !stop_requested_.load() && !system_pause_requested_.load())
830 bool threshold_reached =
false;
831 if (file_close_event_count_ > 0 &&
832 event_count_in_subrun_ >= file_close_event_count_)
834 threshold_reached =
true;
838 time_t now = time(0);
839 if (file_close_timeout_secs_ > 0 &&
840 (now - subrun_start_time_) >= file_close_timeout_secs_)
842 threshold_reached =
true;
846 if (filesize_check_interval_seconds_ > 0 &&
847 filesize_check_interval_events_ > 0 &&
848 (now - last_filesize_check_time) >= filesize_check_interval_seconds_ &&
849 (event_count_in_run_ % filesize_check_interval_events_) == 0)
851 if (file_close_threshold_bytes_ > 0 &&
852 getLatestFileSize_() >= file_close_threshold_bytes_)
854 threshold_reached =
true;
856 last_filesize_check_time = now;
860 if (threshold_reached)
862 system_pause_requested_.store(
true);
863 if (pause_thread_.get() != 0)
865 pause_thread_->join();
867 TLOG_DEBUG(name_) <<
"Starting sendPauseAndResume thread "
868 <<
", event count in subrun = "
869 << event_count_in_subrun_ << TLOG_ENDL;
870 pause_thread_.reset(
new std::thread(&AggregatorCore::sendPauseAndResume_,
this));
873 stats_helper_.addSample(FILE_CHECK_TIME_STAT_KEY,
874 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
881 size_t source_count = 0;
882 if (is_data_logger_) source_count = receiver_ptr_->enabled_sources().size();
883 else source_count = 1;
885 if (eodFragmentsReceived >= source_count && endSubRunMsg !=
nullptr)
887 bool fragmentsOutstanding =
false;
890 for (
auto& i : receiver_ptr_->enabled_sources())
892 if (fragments_received[i] != fragments_sent[i])
894 fragmentsOutstanding =
true;
900 if (!fragmentsOutstanding)
902 event_store_ptr_->flushData();
903 artdaq::RawEvent_ptr subRunEvent(
new artdaq::RawEvent(run_id_.run(), 1, 0));
904 subRunEvent->insertFragment(std::move(endSubRunMsg));
906 bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
910 TLOG_ERROR(name_) <<
"All data appears to have been received but attempt to send EndOfSubRun fragment to art timed out after " <<
911 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
913 process_fragments =
false;
917 TLOG_WARNING(name_) <<
"EndOfSubRun fragment and all EndOfData fragments received but more data expected" << TLOG_ENDL;
922 logMessage_(
"Subrun " +
923 boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
924 " in run " + boost::lexical_cast<std::string>(run_id_.run()) +
925 " has ended. There were " +
926 boost::lexical_cast<std::string>(event_count_in_subrun_) +
927 " events in this subrun, and there have been " +
928 boost::lexical_cast<std::string>(event_count_in_run_) +
929 " events so far in this run.");
931 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
932 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
933 if (mqPtr.get() != 0)
935 artdaq::MonitoredQuantityStats stats;
936 mqPtr->getStats(stats);
937 std::ostringstream oss;
938 oss <<
"Run " << run_id_.run() <<
" has an overall event rate of ";
939 oss << std::fixed << std::setprecision(1) << stats.fullSampleRate;
940 oss <<
" events/sec.";
941 logMessage_(oss.str());
942 previous_run_duration_ = stats.fullDuration;
948 metricMan_.do_stop();
950 receiver_ptr_.reset(
nullptr);
951 sender_ptr_.reset(
nullptr);
953 processing_fragments_.store(
false);
959 if (which ==
"event_count")
961 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
962 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
963 if (mqPtr.get() != 0)
965 return boost::lexical_cast<std::string>(mqPtr->getFullSampleCount());
973 if (which ==
"run_duration")
977 double duration = previous_run_duration_;
978 if (processing_fragments_.load())
980 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
981 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
982 if (mqPtr.get() != 0)
984 duration = mqPtr->getFullDuration();
987 std::ostringstream oss;
988 oss << std::fixed << std::setprecision(1) << duration;
992 if (which ==
"file_size")
994 size_t latestFileSize = getLatestFileSize_();
995 return boost::lexical_cast<std::string>(latestFileSize);
998 if (which ==
"subrun_number")
1000 if (event_store_ptr_.get() !=
nullptr)
1002 return boost::lexical_cast<std::string>(event_store_ptr_->subrunID());
1010 if (which ==
"incomplete_event_count")
1012 if (event_store_ptr_ !=
nullptr)
1014 return boost::lexical_cast<std::string>(event_store_ptr_->incompleteEventCount());
1027 std::string tmpString = name_ +
" run number = ";
1028 tmpString.append(boost::lexical_cast<std::string>(run_id_.run()));
1029 tmpString.append(
". Command=\"" + which +
"\" is not currently supported.");
1035 TLOG_DEBUG(name_) <<
"AggregatorCore::register_monitor called with argument \"" << pset.to_string() <<
"\"" << TLOG_ENDL;
1036 std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1042 for (
auto& existing_transfer_ : dispatcher_transfers_)
1044 if (existing_transfer_->uniqueLabel() == transfer->uniqueLabel())
1046 std::stringstream errmsg;
1047 errmsg <<
"Attempt to register newly-created monitor with label \"" <<
1048 transfer->uniqueLabel() <<
"\" failed; a monitor with that label already exists";
1049 return errmsg.str();
1053 dispatcher_transfers_.emplace_back(std::move(transfer));
1055 TLOG_INFO(name_) <<
"Successfully registered monitor with label \"" << dispatcher_transfers_.back()->uniqueLabel() <<
"\"" << TLOG_ENDL;
1061 std::stringstream errmsg;
1062 errmsg <<
"Unable to create a Transfer plugin with the FHiCL code \"" << pset.to_string() <<
"\", a new monitor has not been registered";
1063 return errmsg.str();
1071 TLOG_DEBUG(name_) <<
"AggregatorCore::unregister_monitor called with argument \"" << label <<
"\"" << TLOG_ENDL;
1072 std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1076 auto r_i_end = std::remove_if(dispatcher_transfers_.begin(),
1077 dispatcher_transfers_.end(),
1078 [label](
const std::unique_ptr<TransferInterface>& transfer)
1080 return transfer->uniqueLabel() == label;
1083 auto nfound = dispatcher_transfers_.end() - r_i_end;
1085 TLOG_INFO(name_) <<
"Request from monitor with label \"" << label <<
"\" to unregister received" << TLOG_ENDL;
1089 dispatcher_transfers_.pop_back();
1092 else if (nfound == 0)
1094 std::stringstream errmsg;
1095 errmsg <<
"Warning in AggregatorCore::unregister_monitor: unable to find requested transfer plugin with "
1096 <<
"label \"" << label <<
"\"";
1097 TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1098 return errmsg.str();
1102 std::stringstream errmsg;
1103 errmsg <<
"Warning in AggregatorCore::unregister_monitor: found more than one (" << nfound <<
1104 ") transfer plugins with label \"" << label <<
"\", will unregister all of them";
1105 TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1106 dispatcher_transfers_.erase(r_i_end, dispatcher_transfers_.end());
1107 return errmsg.str();
1112 std::stringstream errmsg;
1113 errmsg <<
"Unable to unregister transfer plugin with label \"" << label <<
"\"";
1114 return errmsg.str();
/// Report the size of the most recently written output file in disk_writing_directory_.
///
/// Only directory entries whose file name contains both "RootOutput" and "root" are considered.
/// @return The size of the newest matching file when its last write time is less than 60 seconds
///         before now. NOTE(review): the log messages say "0" for the no-directory and too-old
///         cases, but those return statements are not visible in this listing -- confirm.
1121 size_t artdaq::AggregatorCore::getLatestFileSize_()
const
// No output directory is configured, so there is nothing to inspect.
1123 if (disk_writing_directory_.size() == 0)
1125 TLOG_DEBUG(name_) <<
"Latest file size = 0 (no directory)" << TLOG_ENDL;
1128 BFS::path outputDir(disk_writing_directory_);
// A default-constructed directory_iterator is used as the end marker of the scan below.
1129 BFS::directory_iterator endIter;
// Running "newest so far" write time and the size of the file it belongs to.
1131 std::time_t latestFileTime = 0;
1132 size_t latestFileSize = 0;
1133 if (BFS::exists(outputDir) && BFS::is_directory(outputDir))
1135 for (BFS::directory_iterator dirIter(outputDir); dirIter != endIter; ++dirIter)
1137 BFS::path pathObj = dirIter->path();
// Filter on the file name only (not the full path).
1138 if (pathObj.filename().string().find(
"RootOutput") != std::string::npos &&
1139 pathObj.filename().string().find(
"root") != std::string::npos)
// ">=" means that among files with equal write times, the one visited last is kept.
1141 if (BFS::last_write_time(pathObj) >= latestFileTime)
1143 latestFileTime = BFS::last_write_time(pathObj);
1144 latestFileSize = BFS::file_size(pathObj);
// Only trust the size if the newest file was written within the last 60 seconds.
// If no file matched, latestFileTime is still 0 and this test fails for any realistic clock value.
1149 time_t now = time(0);
1150 if ((now - latestFileTime) < 60)
1152 TLOG_DEBUG(name_) <<
"Latest file size = "
1153 << latestFileSize << TLOG_ENDL;
1154 return latestFileSize;
// Otherwise the newest matching file is considered stale.
1158 TLOG_DEBUG(name_) <<
"Latest file size = 0 (too old)" << TLOG_ENDL;
/// Send "daq.pause" and then "daq.resume" XML-RPC commands to every URL in xmlrpc_client_lists_.
///
/// Pause commands are sent to the client groups in ascending index order; resume commands are
/// sent to the groups in descending index order. Each command is attempted up to 5 times per URL.
/// system_pause_requested_ is cleared once the resume pass is done.
/// @return NOTE(review): the return statement is not visible in this listing -- confirm what the
///         bool result signifies.
1163 bool artdaq::AggregatorCore::sendPauseAndResume_()
// One client object is reused for all of the calls below.
1165 xmlrpc_c::clientSimple myClient;
1166 TLOG_INFO(name_) <<
"Starting automatic pause..." << TLOG_ENDL;
// Pause pass: groups in ascending order, then every URL within the group.
1167 for (
size_t igrp = 0; igrp < xmlrpc_client_lists_.size(); ++igrp)
1169 for (
size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
// Up to 5 attempts per URL.
1171 for (
size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1173 xmlrpc_c::value result;
1174 myClient.call((xmlrpc_client_lists_[igrp])[idx],
"daq.pause", &result);
// The reply is interpreted as a string value.
1175 std::string
const resultString = xmlrpc_c::value_string(result);
1176 TLOG_DEBUG(name_) <<
"Pause: "
1177 << (xmlrpc_client_lists_[igrp])[idx]
1178 <<
" " << resultString << TLOG_ENDL;
// Case-insensitive search for "success" in the reply.
// NOTE(review): the statement guarded by this test is not visible in this listing; presumably it
// leaves the retry loop -- confirm.
1179 if (std::string::npos !=
1180 boost::algorithm::to_lower_copy(resultString).find(
"success"))
1187 TLOG_WARNING(name_) <<
"Retrying pause command to "
1188 << (xmlrpc_client_lists_[igrp])[idx]
1189 <<
" (" << resultString <<
")" << TLOG_ENDL;
1194 TLOG_INFO(name_) <<
"Starting automatic resume..." << TLOG_ENDL;
// Resume pass: groups in descending order. igrp is a signed int so that the ">= 0" test can
// terminate the countdown; the size_t group count is converted to int here.
1195 for (
int igrp = (xmlrpc_client_lists_.size() - 1); igrp >= 0; --igrp)
1197 for (
size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
// Up to 5 attempts per URL, mirroring the pause pass.
1199 for (
size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1201 xmlrpc_c::value result;
1202 myClient.call((xmlrpc_client_lists_[igrp])[idx],
"daq.resume", &result);
1203 std::string
const resultString = xmlrpc_c::value_string(result);
1204 TLOG_DEBUG(name_) <<
"Resume: "
1205 << (xmlrpc_client_lists_[igrp])[idx]
1206 <<
" " << resultString << TLOG_ENDL;
// Same case-insensitive "success" test as in the pause pass.
1207 if (std::string::npos !=
1208 boost::algorithm::to_lower_copy(resultString).find(
"success"))
1215 TLOG_WARNING(name_) <<
"Retrying resume command to "
1216 << (xmlrpc_client_lists_[igrp])[idx]
1217 <<
" (" << resultString <<
")" << TLOG_ENDL;
1222 TLOG_INFO(name_) <<
"Done with automatic resume..." << TLOG_ENDL;
// Clear the flag that marks an automatic (system-initiated) pause as being in progress.
1223 system_pause_requested_.store(
false);
/// Log a message at a severity that depends on is_data_logger_.
///
/// @param text The message to log.
1227 void artdaq::AggregatorCore::logMessage_(std::string
const& text)
// The INFO-level statement follows the is_data_logger_ test.
1229 if (is_data_logger_)
1231 TLOG_INFO(name_) << text << TLOG_ENDL;
// NOTE(review): the "else" between these two statements is not visible in this listing;
// presumably this DEBUG-level statement is the branch taken when is_data_logger_ is false -- confirm.
1235 TLOG_DEBUG(name_) << text << TLOG_ENDL;
/// Build a human-readable summary of the recent statistics held in the StatisticsCollection.
///
/// The text is accumulated in a std::ostringstream from the monitored quantities registered under
/// INPUT_EVENTS_STAT_KEY, INPUT_WAIT_STAT_KEY, STORE_EVENT_WAIT_STAT_KEY, SHM_COPY_TIME_STAT_KEY and
/// FILE_CHECK_TIME_STAT_KEY; a quantity whose pointer is null is skipped.
/// @return NOTE(review): the return statement is not visible in this listing; presumably the
///         accumulated string -- confirm.
1239 std::string artdaq::AggregatorCore::buildStatisticsString_()
1241 std::ostringstream oss;
// Divisor for the per-event averages below; stays 1.0 if the input-events quantity is unavailable.
1242 double eventCount = 1.0;
1243 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1244 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1245 if (mqPtr.get() != 0)
1248 artdaq::MonitoredQuantityStats stats;
1249 mqPtr->getStats(stats);
// Values are scaled by sizeof(RawDataType) and divided by 1024 twice, matching the "MB" labels.
1250 oss <<
"Input statistics: "
1251 << stats.recentSampleCount <<
" events received at "
1252 << stats.recentSampleRate <<
" events/sec, data rate = "
1253 << (stats.recentValueRate *
sizeof(artdaq::RawDataType)
1254 / 1024.0 / 1024.0) <<
" MB/sec, monitor window = "
1255 << stats.recentDuration <<
" sec, min::max event size = "
// NOTE(review): the remainders of the min and max expressions (listing lines 1257-1258 and 1260)
// are not visible here.
1256 << (stats.recentValueMin *
sizeof(artdaq::RawDataType)
1259 << (stats.recentValueMax *
sizeof(artdaq::RawDataType)
1261 <<
" MB" << std::endl;
// Clamp to at least 1.0 so the divisions by eventCount below never divide by zero.
1262 eventCount = std::max(
double(stats.recentSampleCount), 1.0);
1263 oss <<
"Average times per event: ";
// The elapsed time per event is the reciprocal of the rate, so it is only written for a positive rate.
1264 if (stats.recentSampleRate > 0.0)
1266 oss <<
" elapsed time = "
1267 << (1.0 / stats.recentSampleRate) <<
" sec";
// Each remaining section divides a quantity's recent value sum by eventCount.
1277 mqPtr = artdaq::StatisticsCollection::getInstance().
1278 getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1279 if (mqPtr.get() != 0)
1281 oss <<
", input wait time = "
1282 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
1285 mqPtr = artdaq::StatisticsCollection::getInstance().
1286 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1287 if (mqPtr.get() != 0)
// This section reports both the per-event average and the recent maximum.
1289 artdaq::MonitoredQuantityStats stats;
1290 mqPtr->getStats(stats);
1291 oss <<
", avg::max event store wait time = "
1292 << (stats.recentValueSum / eventCount)
1293 <<
"::" << stats.recentValueMax
1297 mqPtr = artdaq::StatisticsCollection::getInstance().
1298 getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1299 if (mqPtr.get() != 0)
1301 oss <<
", shared memory copy time = "
1302 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
1305 mqPtr = artdaq::StatisticsCollection::getInstance().
1306 getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1307 if (mqPtr.get() != 0)
1309 oss <<
", file size test time = "
1310 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
/// Publish the recent statistics through metricMan_.sendMetric().
///
/// Metrics are derived from the monitored quantities registered under INPUT_EVENTS_STAT_KEY,
/// INPUT_WAIT_STAT_KEY, STORE_EVENT_WAIT_STAT_KEY, SHM_COPY_TIME_STAT_KEY and
/// FILE_CHECK_TIME_STAT_KEY; a quantity whose pointer is null is skipped.
/// NOTE(review): the meaning of the trailing integer argument of sendMetric (1-4 below) is not
/// established by this listing -- confirm against the metric manager's documentation.
1316 void artdaq::AggregatorCore::sendMetrics_()
// Divisor for the per-event averages below; stays 1.0 if the input-events quantity is unavailable.
1319 double eventCount = 1.0;
1320 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1321 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1322 if (mqPtr.get() != 0)
1324 artdaq::MonitoredQuantityStats stats;
1325 mqPtr->getStats(stats);
// Clamp to at least 1.0 so the divisions by eventCount below never divide by zero.
1326 eventCount = std::max(
double(stats.recentSampleCount), 1.0);
1327 metricMan_.sendMetric(
"Event Rate",
1328 stats.recentSampleRate,
"events/sec", 1);
// The recent average value is scaled by sizeof(RawDataType) and reported as bytes/event.
1329 metricMan_.sendMetric(
"Average Event Size",
1330 (stats.recentValueAverage *
sizeof(artdaq::RawDataType)
1331 ),
"bytes/event", 2);
// NOTE(review): the rest of this call (listing lines 1334 onward) is not visible here.
1332 metricMan_.sendMetric(
"Data Rate",
1333 (stats.recentValueRate *
sizeof(artdaq::RawDataType)
// Each remaining metric is a quantity's recent value sum divided by eventCount.
1343 mqPtr = artdaq::StatisticsCollection::getInstance().
1344 getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1345 if (mqPtr.get() != 0)
1347 metricMan_.sendMetric(
"Average Input Wait Time",
1348 (mqPtr->getRecentValueSum() / eventCount),
1349 "seconds/event", 3);
1352 mqPtr = artdaq::StatisticsCollection::getInstance().
1353 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1354 if (mqPtr.get() != 0)
1356 metricMan_.sendMetric(
"Avg art Queue Wait Time",
1357 (mqPtr->getRecentValueSum() / eventCount),
1358 "seconds/event", 3);
1361 mqPtr = artdaq::StatisticsCollection::getInstance().
1362 getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1363 if (mqPtr.get() != 0)
1365 metricMan_.sendMetric(
"Avg Shared Memory Copy Time",
1366 (mqPtr->getRecentValueSum() / eventCount),
1367 "seconds/event", 4);
1370 mqPtr = artdaq::StatisticsCollection::getInstance().
1371 getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1372 if (mqPtr.get() != 0)
1374 metricMan_.sendMetric(
"Average File Check Time",
1375 (mqPtr->getRecentValueSum() / eventCount),
1376 "seconds/event", 4);
static const std::string INPUT_EVENTS_STAT_KEY
Key for the Input Events MonitoredQuantity.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
static const std::string SHM_COPY_TIME_STAT_KEY
Key for the Shared Memory Copy Time MonitoredQuantity.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured; otherwise Fragments are sent Round-Robin to the destinations.
bool reinitialize(fhicl::ParameterSet const &pset)
Reinitializes the AggregatorCore. No-Op.
bool stop()
Stops the AggregatorCore.
bool shutdown()
Shuts Down the AggregatorCore.
Keep track of the count of Fragments received from a set of sources.
bool start(art::RunID id)
Start the AggregatorCore.
The Fragment was successfully inserted.
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::string unregister_monitor(std::string const &label)
Delete the TransferInterface having the given unique label.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
AggregatorCore(int rank, std::string name)
AggregatorCore Constructor.
static const std::string INPUT_WAIT_STAT_KEY
Key for the Input Wait MonitoredQuantity.
bool soft_initialize(fhicl::ParameterSet const &pset)
Soft-Initializes the AggregatorCore. No-Op.
bool resume()
Resumes the AggregatorCore.
std::string register_monitor(fhicl::ParameterSet const &pset)
Create a new TransferInterface instance using the given configuration.
static const std::string FILE_CHECK_TIME_STAT_KEY
Key for the File Check Time MonitoredQuantity.
void setSlot(size_t slot, size_t val)
Set the given slot to the given value.
void display_bits(void *memstart, size_t nbytes, std::string sourcename)
Write out memory, for debugging purposes.
The EventStore class collects Fragment objects, until it receives a complete event, at which point the event is handed over to the art thread.
This TransferInterface is a Sender.
void incSlot(size_t slot)
Increment the given slot by one.
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugins.
std::string report(std::string const &which) const
Send a report on a given run-time quantity.
The EventStore is full, but the Fragment was accepted as it is for an already-open event...
bool initialize(fhicl::ParameterSet const &pset)
Processes the initialize request.
bool pause()
Pauses the AggregatorCore.
int( ART_CFGSTRING_FCN)(const std::string &)
An art function that accepts a fhicl::ParameterSet as a string.
The Fragment was rejected, because the RawEventQueue is full.
static const std::string STORE_EVENT_WAIT_STAT_KEY
Key for the EventStore Event Wait MonitoredQuantity.
size_t process_fragments()
The main working loop of the AggregatorCore. Receives events from DataReceiverManager and processes them.