1 #pragma GCC diagnostic push
2 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3 #include <xmlrpc-c/client_simple.hpp>
4 #pragma GCC diagnostic pop
11 #include <boost/tokenizer.hpp>
12 #include <boost/filesystem.hpp>
13 #include <boost/algorithm/string.hpp>
14 #include "art/Framework/Art/artapp.h"
15 #include "cetlib/BasicPluginFactory.h"
17 #include "artdaq-core/Core/SimpleQueueReader.hh"
18 #include "artdaq-core/Utilities/ExceptionHandler.hh"
19 #include "artdaq-core/Data/RawEvent.hh"
21 #include "artdaq/Application/AggregatorCore.hh"
22 #include "artdaq/DAQrate/EventStore.hh"
23 #include "artdaq/DAQrate/detail/FragCounter.hh"
24 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
27 namespace BFS = boost::filesystem;
// Debugging helper: renders the nbytes bytes starting at memstart as
// space-separated 8-digit binary strings and emits the result as a single
// TLOG_DEBUG message tagged with sourcename.
//
// memstart   - start of the memory region to dump (read byte-wise as uint8_t).
// nbytes     - number of bytes to dump.
// sourcename - passed to TLOG_DEBUG as the message source/category.
//
// NOTE(review): this listing omits brace-only lines, so block boundaries are
// not visible here.
43 void display_bits(
void* memstart,
size_t nbytes, std::string sourcename)
45 std::stringstream bitstr;
// Message prefix: the byte count and the start address of the region.
46 bitstr <<
"The " << nbytes <<
"-byte chunk of memory beginning at " <<
static_cast<void*
>(memstart) <<
" is : ";
// NOTE(review): the index is unsigned int while nbytes is size_t; the
// comparison would never become false if nbytes exceeded UINT_MAX.
48 for (
unsigned int i = 0; i < nbytes; i++)
// Append byte i as an 8-bit pattern followed by a space.
55 bitstr << std::bitset<8>(*((
reinterpret_cast<uint8_t*
>(memstart)) + i)) <<
" ";
// Emit the accumulated text as one debug message.
58 TLOG_DEBUG(sourcename) << bitstr.str() << TLOG_ENDL;
65 , art_initialized_(false)
66 , event_queue_(artdaq::getGlobalQueue(10))
67 , stop_requested_(false)
68 , local_pause_requested_(false)
69 , processing_fragments_(false)
70 , system_pause_requested_(false)
71 , previous_run_duration_(-1.0)
74 TLOG_DEBUG(name_) <<
"Constructor" << TLOG_ENDL;
80 metricMan = &metricMan_;
86 TLOG_DEBUG(name_) <<
"Destructor" << TLOG_ENDL;
91 init_string_ = pset.to_string();
92 TLOG_DEBUG(name_) <<
"initialize method called with DAQ " <<
"ParameterSet = \"" << init_string_ <<
"\"." << TLOG_ENDL;
95 fhicl::ParameterSet daq_pset;
98 daq_pset = pset.get<fhicl::ParameterSet>(
"daq");
103 <<
"Unable to find the DAQ parameters in the initialization "
104 <<
"ParameterSet: \"" + pset.to_string() +
"\"." << TLOG_ENDL;
107 fhicl::ParameterSet agg_pset;
110 agg_pset = daq_pset.get<fhicl::ParameterSet>(
"aggregator");
111 data_pset_ = agg_pset;
116 <<
"Unable to find the aggregator parameters in the DAQ "
117 <<
"initialization ParameterSet: \"" + daq_pset.to_string() +
"\"." << TLOG_ENDL;
122 expected_events_per_bunch_ =
123 agg_pset.get<
size_t>(
"expected_events_per_bunch");
128 <<
"The expected_events_per_bunch parameter was not specified "
129 <<
"in the aggregator initialization PSet: \"" << pset.to_string()
130 <<
"\"." << TLOG_ENDL;
134 enq_timeout_ =
static_cast<detail::seconds
>(agg_pset.get<
size_t>(
"enq_timeout", 5.0));
140 is_data_logger_ =
false;
141 is_online_monitor_ =
false;
142 is_dispatcher_ =
false;
143 std::string metricsReportingInstanceName =
"Data Logger";
144 bool agtype_was_specified =
false;
145 if (!agtype_was_specified)
149 is_data_logger_ = agg_pset.get<
bool>(
"is_data_logger");
150 agtype_was_specified =
true;
154 if (!agtype_was_specified)
158 is_online_monitor_ = agg_pset.get<
bool>(
"is_online_monitor");
159 metricsReportingInstanceName =
"Online Monitor";
160 agtype_was_specified =
true;
164 if (!agtype_was_specified)
168 is_dispatcher_ = agg_pset.get<
bool>(
"is_dispatcher");
169 metricsReportingInstanceName =
"Dispatcher";
170 agtype_was_specified =
true;
175 if (!agtype_was_specified)
177 throw cet::exception(
"ConfigurationException",
"You must specify one of is_data_logger, is_online_monitor or is_dispatcher");
180 TLOG_DEBUG(name_) <<
"Rank " << my_rank
181 <<
", is_data_logger = " << is_data_logger_
182 <<
", is_online_monitor = " << is_online_monitor_
183 <<
", is_dispatcher = " << is_dispatcher_ << TLOG_ENDL;
185 disk_writing_directory_ =
"";
188 fhicl::ParameterSet output_pset =
189 pset.get<fhicl::ParameterSet>(
"outputs");
190 fhicl::ParameterSet normalout_pset =
191 output_pset.get<fhicl::ParameterSet>(
"normalOutput");
193 if (!normalout_pset.is_empty())
195 std::string filename = normalout_pset.get<std::string>(
"fileName",
"");
196 if (filename.size() > 0)
198 size_t pos = filename.rfind(
"/");
199 if (pos != std::string::npos)
201 disk_writing_directory_ = filename.substr(0, pos);
206 TLOG_WARNING(name_) <<
"Problem finding \"fileName\" parameter in \"normalOutput\" RootOutput module FHiCL code" << TLOG_ENDL;
212 std::string xmlrpcClientString =
213 agg_pset.get<std::string>(
"xmlrpc_client_list",
"");
214 if (xmlrpcClientString.size() > 0)
216 xmlrpc_client_lists_.clear();
217 boost::char_separator<char> sep1(
";");
218 boost::tokenizer<boost::char_separator<char>>
219 primaryTokens(xmlrpcClientString, sep1);
220 boost::tokenizer<boost::char_separator<char>>::iterator iter1;
221 boost::tokenizer<boost::char_separator<char>>::iterator
222 endIter1 = primaryTokens.end();
223 for (iter1 = primaryTokens.begin(); iter1 != endIter1; ++iter1)
225 boost::char_separator<char> sep2(
",");
226 boost::tokenizer<boost::char_separator<char>>
227 secondaryTokens(*iter1, sep2);
228 boost::tokenizer<boost::char_separator<char>>::iterator iter2;
229 boost::tokenizer<boost::char_separator<char>>::iterator
230 endIter2 = secondaryTokens.end();
231 int clientGroup = -1;
232 std::string url =
"";
234 for (iter2 = secondaryTokens.begin(); iter2 != endIter2; ++iter2)
244 clientGroup = boost::lexical_cast<
int>(*iter2);
250 <<
"Unexpected XMLRPC client list element, index = "
251 << loopCount <<
", value = \"" << *iter2 <<
"\"" << TLOG_ENDL;
255 if (clientGroup >= 0 && url.size() > 0)
257 int elementsNeeded = clientGroup + 1 - ((int)xmlrpc_client_lists_.size());
258 for (
int idx = 0; idx < elementsNeeded; ++idx)
260 std::vector<std::string> tmpVec;
261 xmlrpc_client_lists_.push_back(tmpVec);
263 xmlrpc_client_lists_[clientGroup].push_back(url);
267 double fileSizeMB = agg_pset.get<
double>(
"subrun_size_MB", 0);
268 file_close_threshold_bytes_ = ((size_t)fileSizeMB * 1024.0 * 1024.0);
269 file_close_timeout_secs_ = agg_pset.get<time_t>(
"subrun_duration", 0);
270 file_close_event_count_ = agg_pset.get<
size_t>(
"subrun_event_count", 0);
272 inrun_recv_timeout_usec_ = agg_pset.get<
size_t>(
"inrun_recv_timeout_usec", 100000);
273 endrun_recv_timeout_usec_ = agg_pset.get<
size_t>(
"endrun_recv_timeout_usec", 20000000);
274 pause_recv_timeout_usec_ = agg_pset.get<
size_t>(
"pause_recv_timeout_usec", 3000000);
276 onmon_event_prescale_ = agg_pset.get<
size_t>(
"onmon_event_prescale", 1);
278 filesize_check_interval_seconds_ = agg_pset.get<int32_t>(
"filesize_check_interval_seconds", 20);
279 filesize_check_interval_events_ = agg_pset.get<int32_t>(
"filesize_check_interval_events", 20);
282 stats_helper_.createCollectors(agg_pset, 50, 20.0, 60.0, INPUT_EVENTS_STAT_KEY);
285 fhicl::ParameterSet metric_pset;
289 metric_pset = daq_pset.get<fhicl::ParameterSet>(
"metrics");
293 if (metric_pset.is_empty())
295 TLOG_INFO(name_) <<
"No metric plugins appear to be defined" << TLOG_ENDL;
299 metricMan_.initialize(metric_pset, metricsReportingInstanceName);
303 ExceptionHandler(ExceptionHandlerRethrow::no,
304 "Error loading metrics in AggregatorCore::initialize()");
307 if (event_store_ptr_ ==
nullptr)
310 size_t desired_events_per_bunch = expected_events_per_bunch_;
311 if (is_online_monitor_ || is_dispatcher_)
313 desired_events_per_bunch = 1;
315 TRACE(36,
"Creating EventStore and Starting art thread");
317 init_string_, reader));
318 TRACE(36,
"Done Creating EventStore");
319 event_store_ptr_->setSeqIDModulus(desired_events_per_bunch);
320 fhicl::ParameterSet tmp = pset;
322 previous_pset_ = tmp;
326 fhicl::ParameterSet tmp = pset;
328 if (tmp != previous_pset_)
331 <<
"The art configuration can not be altered after art "
332 <<
"has been configured." << TLOG_ENDL;
342 event_count_in_run_ = 0;
343 event_count_in_subrun_ = 0;
344 subrun_start_time_ = time(0);
345 stats_helper_.resetStatistics();
346 previous_run_duration_ = -1.0;
348 stop_requested_.store(
false);
349 local_pause_requested_.store(
false);
351 metricMan_.do_start();
352 event_store_ptr_->startRun(run_id_.run());
354 logMessage_(
"Started run " + boost::lexical_cast<std::string>(run_id_.run()));
360 logMessage_(
"Stopping run " + boost::lexical_cast<std::string>(run_id_.run()) +
361 ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
362 " events received so far.");
367 stop_requested_.store(
true);
373 logMessage_(
"Pausing run " + boost::lexical_cast<std::string>(run_id_.run()) +
374 ", " + boost::lexical_cast<std::string>(event_count_in_run_) +
375 " events received so far.");
380 local_pause_requested_.store(
true);
386 event_count_in_subrun_ = 0;
387 subrun_start_time_ = time(0);
388 local_pause_requested_.store(
false);
390 logMessage_(
"Resuming run " + boost::lexical_cast<std::string>(run_id_.run()));
391 metricMan_.do_start();
392 event_store_ptr_->startSubrun();
398 int readerReturnValue;
399 bool endSucceeded =
false;
400 int attemptsToEnd = 1;
401 endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
402 while (!endSucceeded && attemptsToEnd < 3)
405 TLOG_DEBUG(name_) <<
"Retrying EventStore::endOfData()" << TLOG_ENDL;
406 endSucceeded = event_store_ptr_->endOfData(readerReturnValue);
408 metricMan_.shutdown();
415 TLOG_DEBUG(name_) <<
"soft_initialize method called with DAQ "
416 <<
"ParameterSet = \"" << pset.to_string()
417 <<
"\"." << TLOG_ENDL;
423 TLOG_DEBUG(name_) <<
"reinitialize method called with DAQ "
424 <<
"ParameterSet = \"" << pset.to_string()
425 <<
"\"." << TLOG_ENDL;
431 processing_fragments_.store(
true);
433 size_t eodFragmentsReceived = 0;
434 bool process_fragments =
true;
438 artdaq::FragmentPtr endSubRunMsg(
nullptr);
439 time_t last_filesize_check_time = subrun_start_time_;
445 receiver_ptr_->start_threads();
448 if (is_data_logger_ && data_pset_.has_key(
"destinations"))
452 if (sender_ptr_->destinationCount() == 0)
454 sender_ptr_.reset(
nullptr);
458 TLOG_DEBUG(name_) <<
"Waiting for first fragment." << TLOG_ENDL;
460 artdaq::MonitoredQuantityStats::TIME_POINT_T startTime;
461 while (process_fragments)
463 artdaq::FragmentPtr fragmentPtr(
new artdaq::Fragment);
465 size_t recvTimeout = inrun_recv_timeout_usec_;
466 if (stop_requested_.load()) { recvTimeout = endrun_recv_timeout_usec_; }
467 else if (local_pause_requested_.load()) { recvTimeout = pause_recv_timeout_usec_; }
469 startTime = artdaq::MonitoredQuantity::getCurrentTime();
472 fragmentPtr = receiver_ptr_->recvFragment(senderSlot, recvTimeout);
474 stats_helper_.addSample(INPUT_WAIT_STAT_KEY,
475 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
505 if (stop_requested_.load() &&
506 recvTimeout == endrun_recv_timeout_usec_)
508 if (endSubRunMsg !=
nullptr)
511 <<
"Timeout occurred in attempt to receive data, but as a stop has been requested, will forcibly end the run." << TLOG_ENDL;
512 event_store_ptr_->flushData();
513 artdaq::RawEvent_ptr subRunEvent(
new artdaq::RawEvent(run_id_.run(), 1, 0));
514 subRunEvent->insertFragment(std::move(endSubRunMsg));
516 bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
519 TLOG_ERROR(name_) <<
"Attempt to send EndOfSubRun fragment to art timed out after " <<
520 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
525 if (event_count_in_subrun_ > 0)
528 <<
"Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
532 std::string msg(
"Timeout receiving data after stop request, and the EndOfSubRun fragment isn't available to send to art;");
533 msg.append(
"DAQ may need to be returned to the \"Stopped\" state before further datataking");
537 process_fragments =
false;
539 else if (local_pause_requested_.load() &&
540 recvTimeout == pause_recv_timeout_usec_)
542 if (endSubRunMsg !=
nullptr)
545 <<
"Timeout occurred in attempt to receive data, but as a pause has been requested, will forcibly pause the run." << TLOG_ENDL;
546 event_store_ptr_->flushData();
547 artdaq::RawEvent_ptr subRunEvent(
new artdaq::RawEvent(run_id_.run(), 1, 0));
548 subRunEvent->insertFragment(std::move(endSubRunMsg));
550 bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
553 TLOG_ERROR(name_) <<
"Attempt to send EndOfSubRun fragment to art timed out after " <<
554 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
560 "Timeout receiving data after pause request, and the EndOfSubRun fragment isn't available to send to art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
562 process_fragments =
false;
567 else if (!fragmentPtr)
569 TLOG_ERROR(name_) <<
"Received invalid fragment from " << senderSlot <<
". This is usually the case when a timeout has occurred, but sender was not set to RECV_TIMEOUT as expected." << TLOG_ENDL;
572 if (!receiver_ptr_->enabled_sources().count(senderSlot))
575 <<
"Invalid senderSlot received from recvFragment: "
576 << senderSlot << TLOG_ENDL;
579 fragments_received.
incSlot(senderSlot);
580 if (artdaq::Fragment::isSystemFragmentType(fragmentPtr->type()) &&
581 fragmentPtr->type() != artdaq::Fragment::DataFragmentType)
584 <<
"Sender slot = " << senderSlot
585 <<
", fragment type = " <<
static_cast<int>(fragmentPtr->type())
586 <<
", sequence ID = " << fragmentPtr->sequenceID() << TLOG_ENDL;
590 if (fragmentPtr->type() == artdaq::Fragment::InvalidFragmentType)
592 size_t fragSize = fragmentPtr->size() *
sizeof(artdaq::RawDataType);
593 TLOG_ERROR(name_) <<
"Fragment received with type of "
594 <<
"INVALID. Size = " << fragSize
595 <<
", sequence ID = " << fragmentPtr->sequenceID()
596 <<
", fragment ID = " << fragmentPtr->fragmentID()
597 <<
", and type = " <<
static_cast<int>(fragmentPtr->type()) << TLOG_ENDL;
601 if (artdaq::Fragment::isUserFragmentType(fragmentPtr->type()) ||
602 fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
604 ++event_count_in_run_;
605 ++event_count_in_subrun_;
606 if (event_count_in_run_ == 1)
608 logMessage_(
"Received event " +
609 boost::lexical_cast<std::string>(event_count_in_run_) +
610 " with sequence id " +
611 boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
614 stats_helper_.addSample(INPUT_EVENTS_STAT_KEY, fragmentPtr->size());
615 if (stats_helper_.readyToReport(event_count_in_run_))
617 std::string statString = buildStatisticsString_();
618 logMessage_(statString);
619 logMessage_(
"Received event " +
620 boost::lexical_cast<std::string>(event_count_in_run_) +
621 " with sequence id " +
622 boost::lexical_cast<std::string>(fragmentPtr->sequenceID()) +
624 boost::lexical_cast<std::string>(run_id_.run()) +
626 boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
630 if (stats_helper_.statsRollingWindowHasMoved()) { sendMetrics_(); }
632 startTime = artdaq::MonitoredQuantity::getCurrentTime();
634 if (is_data_logger_ && fragmentPtr->type() == artdaq::Fragment::DataFragmentType
635 && (event_count_in_run_ % onmon_event_prescale_) == 0 && sender_ptr_)
639 auto fragCopy = *fragmentPtr;
640 sender_ptr_->sendFragment(std::move(fragCopy));
644 ExceptionHandler(ExceptionHandlerRethrow::no,
645 "Exception thrown during data logger copy of event to dispatcher");
648 else if (is_dispatcher_)
650 if (fragmentPtr->type() != artdaq::Fragment::EndOfDataFragmentType)
652 if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
654 init_fragment_ptr_ = std::make_unique<artdaq::Fragment>(*fragmentPtr);
657 std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
659 if (new_transfers_ == 0)
662 if (dispatcher_transfers_.size() > 0 && fragmentPtr->sequenceID() % 100 == 0)
664 TLOG_DEBUG(name_) <<
"Dispatcher: broadcasting seqID = " << fragmentPtr->sequenceID() <<
", type = " <<
665 static_cast<size_t>(fragmentPtr->type()) <<
" to " << dispatcher_transfers_.size()
666 <<
" registered monitors" << TLOG_ENDL;
668 for (
auto& transfer : dispatcher_transfers_)
670 transfer->copyFragment(*fragmentPtr, 0);
675 for (
size_t i_q = dispatcher_transfers_.size() - new_transfers_; i_q < dispatcher_transfers_.size(); ++i_q)
677 TLOG_INFO(name_) <<
"Copying out init fragment, type " <<
static_cast<int>(init_fragment_ptr_->type()) <<
678 ", size " << init_fragment_ptr_->sizeBytes() << TLOG_ENDL;
679 dispatcher_transfers_[i_q]->copyFragment(*init_fragment_ptr_, 500000);
686 stats_helper_.addSample(SHM_COPY_TIME_STAT_KEY,
687 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
691 artdaq::Fragment::sequence_id_t seq = fragmentPtr->sequenceID();
692 TRACE(21,
"%s::process_fragments seq=%lu isLogger=%d type=%d"
693 , name_.c_str(), seq, is_data_logger_, fragmentPtr->type());
694 startTime = artdaq::MonitoredQuantity::getCurrentTime();
695 if (!art_initialized_)
699 if (fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
701 TLOG_DEBUG(name_) <<
"Init" << TLOG_ENDL;
703 if (is_data_logger_ && sender_ptr_)
705 auto fragCopy = *fragmentPtr;
706 sender_ptr_->sendFragment(std::move(fragCopy));
710 artdaq::RawEvent_ptr initEvent(
new artdaq::RawEvent(run_id_.run(), 1, fragmentPtr->sequenceID()));
711 initEvent->insertFragment(std::move(fragmentPtr));
713 bool enqStatus = event_queue_.enqTimedWait(initEvent, enq_timeout_);
717 TLOG_ERROR(name_) <<
"Attempt to send Init event to art timed out after " <<
718 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
720 art_initialized_ =
true;
724 TLOG_ERROR(name_) <<
"Didn't receive an Init event with which to initialize art; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
731 if (fragmentPtr->type() == artdaq::Fragment::DataFragmentType)
735 artdaq::FragmentPtr rejectedFragment;
736 auto seqId = fragmentPtr->sequenceID();
737 bool try_again =
true;
740 auto ret = event_store_ptr_->insert(std::move(fragmentPtr), rejectedFragment);
743 receiver_ptr_->unsuppressAll();
750 else if (stop_requested_.load())
753 process_fragments =
false;
754 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
756 <<
"Unable to process event " << seqId
757 <<
" because of back-pressure - forcibly ending the run." << TLOG_ENDL;
759 else if (local_pause_requested_.load())
762 process_fragments =
false;
763 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
765 <<
"Unable to process event " << seqId
766 <<
" because of back-pressure - forcibly pausing the run." << TLOG_ENDL;
770 fragmentPtr = std::move(rejectedFragment);
772 <<
"Unable to process event " << seqId
773 <<
" because of back-pressure from art - retrying..." << TLOG_ENDL;
778 receiver_ptr_->reject_fragment(senderSlot, std::move(rejectedFragment));
780 <<
"Unable to process event " << seqId
781 <<
" because the EventStore has reached the maximum number of incomplete bunches." << std::endl
782 <<
" Will retry when the EventStore is ready for new events." << TLOG_ENDL;
788 event_store_ptr_->insert(std::move(fragmentPtr),
false);
791 else if (fragmentPtr->type() == artdaq::Fragment::EndOfSubrunFragmentType)
793 if (is_data_logger_ && sender_ptr_)
795 auto fragCopy = *fragmentPtr;
796 sender_ptr_->sendFragment(std::move(fragCopy));
798 else if (is_dispatcher_)
800 for (
auto& transfer : dispatcher_transfers_)
802 transfer->copyFragment(*fragmentPtr, 0);
810 endSubRunMsg = std::move(fragmentPtr);
812 else if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
814 eodFragmentsReceived++;
818 fragments_sent.
setSlot(senderSlot, *fragmentPtr->dataBegin() + 1);
821 float delta = artdaq::MonitoredQuantity::getCurrentTime() - startTime;
822 stats_helper_.addSample(STORE_EVENT_WAIT_STAT_KEY, delta);
823 TRACE((delta > 3.0) ? 0 : 22,
"%s::process_fragments seq=%lu isLogger=%d delta=%f start=%f"
824 , name_.c_str(), seq, is_data_logger_, delta, startTime);
827 startTime = artdaq::MonitoredQuantity::getCurrentTime();
828 if (is_data_logger_ && disk_writing_directory_.size() > 0 &&
829 !stop_requested_.load() && !system_pause_requested_.load())
831 bool threshold_reached =
false;
832 if (file_close_event_count_ > 0 &&
833 event_count_in_subrun_ >= file_close_event_count_)
835 threshold_reached =
true;
839 time_t now = time(0);
840 if (file_close_timeout_secs_ > 0 &&
841 (now - subrun_start_time_) >= file_close_timeout_secs_)
843 threshold_reached =
true;
847 if (filesize_check_interval_seconds_ > 0 &&
848 filesize_check_interval_events_ > 0 &&
849 (now - last_filesize_check_time) >= filesize_check_interval_seconds_ &&
850 (event_count_in_run_ % filesize_check_interval_events_) == 0)
852 if (file_close_threshold_bytes_ > 0 &&
853 getLatestFileSize_() >= file_close_threshold_bytes_)
855 threshold_reached =
true;
857 last_filesize_check_time = now;
861 if (threshold_reached)
863 system_pause_requested_.store(
true);
864 if (pause_thread_.get() != 0)
866 pause_thread_->join();
868 TLOG_DEBUG(name_) <<
"Starting sendPauseAndResume thread "
869 <<
", event count in subrun = "
870 << event_count_in_subrun_ << TLOG_ENDL;
871 pause_thread_.reset(
new std::thread(&AggregatorCore::sendPauseAndResume_,
this));
874 stats_helper_.addSample(FILE_CHECK_TIME_STAT_KEY,
875 (artdaq::MonitoredQuantity::getCurrentTime() - startTime));
882 size_t source_count = 0;
883 if (is_data_logger_) source_count = receiver_ptr_->enabled_sources().size();
884 else source_count = 1;
886 if (eodFragmentsReceived >= source_count && endSubRunMsg !=
nullptr)
888 bool fragmentsOutstanding =
false;
891 for (
auto& i : receiver_ptr_->enabled_sources())
893 if (fragments_received[i] != fragments_sent[i])
895 fragmentsOutstanding =
true;
901 if (!fragmentsOutstanding)
903 event_store_ptr_->flushData();
904 artdaq::RawEvent_ptr subRunEvent(
new artdaq::RawEvent(run_id_.run(), 1, 0));
905 subRunEvent->insertFragment(std::move(endSubRunMsg));
907 bool enqStatus = event_queue_.enqTimedWait(subRunEvent, enq_timeout_);
911 TLOG_ERROR(name_) <<
"All data appears to have been received but attempt to send EndOfSubRun fragment to art timed out after " <<
912 enq_timeout_.count() <<
" seconds; DAQ may need to be returned to the \"Stopped\" state before further datataking" << TLOG_ENDL;
914 process_fragments =
false;
918 TLOG_WARNING(name_) <<
"EndOfSubRun fragment and all EndOfData fragments received but more data expected" << TLOG_ENDL;
923 logMessage_(
"Subrun " +
924 boost::lexical_cast<std::string>(event_store_ptr_->subrunID()) +
925 " in run " + boost::lexical_cast<std::string>(run_id_.run()) +
926 " has ended. There were " +
927 boost::lexical_cast<std::string>(event_count_in_subrun_) +
928 " events in this subrun, and there have been " +
929 boost::lexical_cast<std::string>(event_count_in_run_) +
930 " events so far in this run.");
932 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
933 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
934 if (mqPtr.get() != 0)
936 artdaq::MonitoredQuantityStats stats;
937 mqPtr->getStats(stats);
938 std::ostringstream oss;
939 oss <<
"Run " << run_id_.run() <<
" has an overall event rate of ";
940 oss << std::fixed << std::setprecision(1) << stats.fullSampleRate;
941 oss <<
" events/sec.";
942 logMessage_(oss.str());
943 previous_run_duration_ = stats.fullDuration;
949 metricMan_.do_stop();
951 receiver_ptr_.reset(
nullptr);
952 sender_ptr_.reset(
nullptr);
954 processing_fragments_.store(
false);
960 if (which ==
"event_count")
962 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
963 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
964 if (mqPtr.get() != 0)
966 return boost::lexical_cast<std::string>(mqPtr->getFullSampleCount());
974 if (which ==
"run_duration")
978 double duration = previous_run_duration_;
979 if (processing_fragments_.load())
981 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
982 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
983 if (mqPtr.get() != 0)
985 duration = mqPtr->getFullDuration();
988 std::ostringstream oss;
989 oss << std::fixed << std::setprecision(1) << duration;
993 if (which ==
"file_size")
995 size_t latestFileSize = getLatestFileSize_();
996 return boost::lexical_cast<std::string>(latestFileSize);
999 if (which ==
"subrun_number")
1001 if (event_store_ptr_.get() !=
nullptr)
1003 return boost::lexical_cast<std::string>(event_store_ptr_->subrunID());
1011 if (which ==
"incomplete_event_count")
1013 if (event_store_ptr_ !=
nullptr)
1015 return boost::lexical_cast<std::string>(event_store_ptr_->incompleteEventCount());
1028 std::string tmpString = name_ +
" run number = ";
1029 tmpString.append(boost::lexical_cast<std::string>(run_id_.run()));
1030 tmpString.append(
". Command=\"" + which +
"\" is not currently supported.");
1036 TLOG_DEBUG(name_) <<
"AggregatorCore::register_monitor called with argument \"" << pset.to_string() <<
"\"" << TLOG_ENDL;
1037 std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1043 for (
auto& existing_transfer_ : dispatcher_transfers_)
1045 if (existing_transfer_->uniqueLabel() == transfer->uniqueLabel())
1047 std::stringstream errmsg;
1048 errmsg <<
"Attempt to register newly-created monitor with label \"" <<
1049 transfer->uniqueLabel() <<
"\" failed; a monitor with that label already exists";
1050 return errmsg.str();
1054 dispatcher_transfers_.emplace_back(std::move(transfer));
1056 TLOG_INFO(name_) <<
"Successfully registered monitor with label \"" << dispatcher_transfers_.back()->uniqueLabel() <<
"\"" << TLOG_ENDL;
1062 std::stringstream errmsg;
1063 errmsg <<
"Unable to create a Transfer plugin with the FHiCL code \"" << pset.to_string() <<
"\", a new monitor has not been registered";
1064 return errmsg.str();
1072 TLOG_DEBUG(name_) <<
"AggregatorCore::unregister_monitor called with argument \"" << label <<
"\"" << TLOG_ENDL;
1073 std::lock_guard<std::mutex> lock(dispatcher_transfers_mutex_);
1077 auto r_i_end = std::remove_if(dispatcher_transfers_.begin(),
1078 dispatcher_transfers_.end(),
1079 [label](
const std::unique_ptr<TransferInterface>& transfer)
1081 return transfer->uniqueLabel() == label;
1084 auto nfound = dispatcher_transfers_.end() - r_i_end;
1086 TLOG_INFO(name_) <<
"Request from monitor with label \"" << label <<
"\" to unregister received" << TLOG_ENDL;
1090 dispatcher_transfers_.pop_back();
1093 else if (nfound == 0)
1095 std::stringstream errmsg;
1096 errmsg <<
"Warning in AggregatorCore::unregister_monitor: unable to find requested transfer plugin with "
1097 <<
"label \"" << label <<
"\"";
1098 TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1099 return errmsg.str();
1103 std::stringstream errmsg;
1104 errmsg <<
"Warning in AggregatorCore::unregister_monitor: found more than one (" << nfound <<
1105 ") transfer plugins with label \"" << label <<
"\", will unregister all of them";
1106 TLOG_WARNING(name_) << errmsg.str() << TLOG_ENDL;
1107 dispatcher_transfers_.erase(r_i_end, dispatcher_transfers_.end());
1108 return errmsg.str();
1113 std::stringstream errmsg;
1114 errmsg <<
"Unable to unregister transfer plugin with label \"" << label <<
"\"";
1115 return errmsg.str();
// Returns the size in bytes of the most recently written output file found in
// disk_writing_directory_, provided that file was written less than 60
// seconds ago.
//
// A directory entry is considered when its filename contains both
// "RootOutput" and "root". Among those, the entry with the latest
// last_write_time wins (ties go to the entry iterated later, because the
// comparison is >=).
//
// NOTE(review): this listing omits brace-only, `else`, and some `return`
// lines. The "no directory" and "too old" branches log a size of 0 and
// presumably return 0 - confirm against the full source.
1122 size_t artdaq::AggregatorCore::getLatestFileSize_()
const
// No output directory configured: nothing to inspect.
1124 if (disk_writing_directory_.size() == 0)
1126 TLOG_DEBUG(name_) <<
"Latest file size = 0 (no directory)" << TLOG_ENDL;
1129 BFS::path outputDir(disk_writing_directory_);
// A default-constructed directory_iterator serves as the end sentinel.
1130 BFS::directory_iterator endIter;
// latestFileTime starts at 0, so if no entry matches, the age check below
// falls through to the "too old" branch.
1132 std::time_t latestFileTime = 0;
1133 size_t latestFileSize = 0;
1134 if (BFS::exists(outputDir) && BFS::is_directory(outputDir))
1136 for (BFS::directory_iterator dirIter(outputDir); dirIter != endIter; ++dirIter)
1138 BFS::path pathObj = dirIter->path();
// Substring match on the filename only (not the full path).
1139 if (pathObj.filename().string().find(
"RootOutput") != std::string::npos &&
1140 pathObj.filename().string().find(
"root") != std::string::npos)
// NOTE(review): last_write_time() is queried twice for the same path and the
// file may change between the calls; caching the value once would be safer.
1142 if (BFS::last_write_time(pathObj) >= latestFileTime)
1144 latestFileTime = BFS::last_write_time(pathObj);
1145 latestFileSize = BFS::file_size(pathObj);
// Only report the size when the newest matching file was modified within the
// last 60 seconds.
1150 time_t now = time(0);
1151 if ((now - latestFileTime) < 60)
1153 TLOG_DEBUG(name_) <<
"Latest file size = "
1154 << latestFileSize << TLOG_ENDL;
1155 return latestFileSize;
// Newest matching file is 60 or more seconds old (or none was found).
1159 TLOG_DEBUG(name_) <<
"Latest file size = 0 (too old)" << TLOG_ENDL;
// Issues an XML-RPC "daq.pause" to every URL in xmlrpc_client_lists_ and then
// a "daq.resume" to every URL, finally clearing system_pause_requested_.
//
// Pause commands are sent group by group in ascending group index; resume
// commands are sent in descending group index. Each URL is tried up to 5
// times, and the lower-cased reply is searched for the substring "success".
//
// NOTE(review): this listing omits brace-only lines, the bodies of the
// "success" branches (presumably a break out of the retry loop), and the
// function's return statement - confirm against the full source.
// NOTE(review): no exception handling is visible in this listing around
// myClient.call(); confirm how call failures are handled.
1164 bool artdaq::AggregatorCore::sendPauseAndResume_()
1166 xmlrpc_c::clientSimple myClient;
1167 TLOG_INFO(name_) <<
"Starting automatic pause..." << TLOG_ENDL;
// Pause phase: groups in ascending order.
1168 for (
size_t igrp = 0; igrp < xmlrpc_client_lists_.size(); ++igrp)
1170 for (
size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
// Up to 5 attempts per client URL.
1172 for (
size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1174 xmlrpc_c::value result;
1175 myClient.call((xmlrpc_client_lists_[igrp])[idx],
"daq.pause", &result);
1176 std::string
const resultString = xmlrpc_c::value_string(result);
1177 TLOG_DEBUG(name_) <<
"Pause: "
1178 << (xmlrpc_client_lists_[igrp])[idx]
1179 <<
" " << resultString << TLOG_ENDL;
// Case-insensitive check for "success" anywhere in the reply.
1180 if (std::string::npos !=
1181 boost::algorithm::to_lower_copy(resultString).find(
"success"))
1188 TLOG_WARNING(name_) <<
"Retrying pause command to "
1189 << (xmlrpc_client_lists_[igrp])[idx]
1190 <<
" (" << resultString <<
")" << TLOG_ENDL;
1195 TLOG_INFO(name_) <<
"Starting automatic resume..." << TLOG_ENDL;
// Resume phase: groups in descending order. The index is a signed int so the
// loop is skipped (start value -1) when the list is empty.
1196 for (
int igrp = (xmlrpc_client_lists_.size() - 1); igrp >= 0; --igrp)
1198 for (
size_t idx = 0; idx < xmlrpc_client_lists_[igrp].size(); ++idx)
// Up to 5 attempts per client URL.
1200 for (
size_t iAttempt = 0; iAttempt < 5; ++iAttempt)
1202 xmlrpc_c::value result;
1203 myClient.call((xmlrpc_client_lists_[igrp])[idx],
"daq.resume", &result);
1204 std::string
const resultString = xmlrpc_c::value_string(result);
1205 TLOG_DEBUG(name_) <<
"Resume: "
1206 << (xmlrpc_client_lists_[igrp])[idx]
1207 <<
" " << resultString << TLOG_ENDL;
// Case-insensitive check for "success" anywhere in the reply.
1208 if (std::string::npos !=
1209 boost::algorithm::to_lower_copy(resultString).find(
"success"))
1216 TLOG_WARNING(name_) <<
"Retrying resume command to "
1217 << (xmlrpc_client_lists_[igrp])[idx]
1218 <<
" (" << resultString <<
")" << TLOG_ENDL;
1223 TLOG_INFO(name_) <<
"Done with automatic resume..." << TLOG_ENDL;
// Clear the flag that was set when the pause/resume cycle was requested.
1224 system_pause_requested_.store(
false);
// Logs text under name_: at INFO level when this instance is a data logger
// (is_data_logger_), and at DEBUG level in the other visible branch.
//
// NOTE(review): this listing omits brace-only lines and the `else` line that
// presumably separates the two log statements - confirm against the full
// source.
1228 void artdaq::AggregatorCore::logMessage_(std::string
const& text)
1230 if (is_data_logger_)
1232 TLOG_INFO(name_) << text << TLOG_ENDL;
1236 TLOG_DEBUG(name_) << text << TLOG_ENDL;
// Builds a human-readable statistics summary from the MonitoredQuantity
// instances registered in the StatisticsCollection singleton.
//
// The first line reports input statistics (recent sample count and rate, data
// rate, monitor window, min/max event size); the second line reports average
// per-event times (elapsed, input wait, event store wait, shared memory copy,
// file size test). Each section is added only when the corresponding
// MonitoredQuantity pointer is non-null.
//
// NOTE(review): this listing omits brace-only lines, several stream
// continuation lines (listing lines 1258-1259, 1261, 1295), and the return
// statement; presumably oss.str() is returned - confirm against the full
// source.
1240 std::string artdaq::AggregatorCore::buildStatisticsString_()
1242 std::ostringstream oss;
// Divisor for the per-event averages below; defaults to 1.0 so the divisions
// are well defined even if no input-event statistics exist.
1243 double eventCount = 1.0;
1244 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1245 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1246 if (mqPtr.get() != 0)
1249 artdaq::MonitoredQuantityStats stats;
1250 mqPtr->getStats(stats);
// Sample values are scaled by sizeof(RawDataType) to convert to bytes and
// then divided by 1024*1024 for MB (presumably samples are fragment sizes in
// RawDataType words - confirm where INPUT_EVENTS_STAT_KEY samples are added).
1251 oss <<
"Input statistics: "
1252 << stats.recentSampleCount <<
" events received at "
1253 << stats.recentSampleRate <<
" events/sec, data rate = "
1254 << (stats.recentValueRate *
sizeof(artdaq::RawDataType)
1255 / 1024.0 / 1024.0) <<
" MB/sec, monitor window = "
1256 << stats.recentDuration <<
" sec, min::max event size = "
1257 << (stats.recentValueMin *
sizeof(artdaq::RawDataType)
1260 << (stats.recentValueMax *
sizeof(artdaq::RawDataType)
1262 <<
" MB" << std::endl;
// Clamp to at least 1 so later divisions never divide by zero.
1263 eventCount = std::max(
double(stats.recentSampleCount), 1.0);
1264 oss <<
"Average times per event: ";
// Elapsed time per event is the reciprocal of the rate; skipped at zero rate.
1265 if (stats.recentSampleRate > 0.0)
1267 oss <<
" elapsed time = "
1268 << (1.0 / stats.recentSampleRate) <<
" sec";
// Average time spent waiting for input per event.
1278 mqPtr = artdaq::StatisticsCollection::getInstance().
1279 getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1280 if (mqPtr.get() != 0)
1282 oss <<
", input wait time = "
1283 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
// Average and maximum event store wait time.
1286 mqPtr = artdaq::StatisticsCollection::getInstance().
1287 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1288 if (mqPtr.get() != 0)
1290 artdaq::MonitoredQuantityStats stats;
1291 mqPtr->getStats(stats);
1292 oss <<
", avg::max event store wait time = "
1293 << (stats.recentValueSum / eventCount)
1294 <<
"::" << stats.recentValueMax
// Average shared memory copy time per event.
1298 mqPtr = artdaq::StatisticsCollection::getInstance().
1299 getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1300 if (mqPtr.get() != 0)
1302 oss <<
", shared memory copy time = "
1303 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
// Average file size check time per event.
1306 mqPtr = artdaq::StatisticsCollection::getInstance().
1307 getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1308 if (mqPtr.get() != 0)
1310 oss <<
", file size test time = "
1311 << (mqPtr->getRecentValueSum() / eventCount) <<
" sec";
// Publishes the current statistics through metricMan_.sendMetric().
//
// From the input-events MonitoredQuantity it sends "Event Rate",
// "Average Event Size" and "Data Rate"; from the remaining MonitoredQuantity
// instances it sends per-event averages (recent value sum divided by the
// recent event count). Each metric is sent only when the corresponding
// MonitoredQuantity pointer is non-null.
//
// NOTE(review): the integer argument (1-4) passed to sendMetric is presumably
// a metric level/verbosity - confirm against the MetricManager interface.
// NOTE(review): this listing omits brace-only lines.
1317 void artdaq::AggregatorCore::sendMetrics_()
// Divisor for the per-event averages; defaults to 1.0 if no input-event
// statistics are available.
1320 double eventCount = 1.0;
1321 artdaq::MonitoredQuantityPtr mqPtr = artdaq::StatisticsCollection::getInstance().
1322 getMonitoredQuantity(INPUT_EVENTS_STAT_KEY);
1323 if (mqPtr.get() != 0)
1325 artdaq::MonitoredQuantityStats stats;
1326 mqPtr->getStats(stats);
// Clamp to at least 1 so the divisions below never divide by zero.
1327 eventCount = std::max(
double(stats.recentSampleCount), 1.0);
1328 metricMan_.sendMetric(
"Event Rate", stats.recentSampleRate,
"events/sec", 1, MetricMode::Average);
// Size and rate values are scaled by sizeof(RawDataType) to report bytes.
1329 metricMan_.sendMetric(
"Average Event Size", (stats.recentValueAverage *
sizeof(artdaq::RawDataType)),
"bytes/event", 2, MetricMode::Average);
1330 metricMan_.sendMetric(
"Data Rate", (stats.recentValueRate *
sizeof(artdaq::RawDataType)),
"bytes/sec", 2, MetricMode::Average);
// Average time spent waiting for input per event.
1339 mqPtr = artdaq::StatisticsCollection::getInstance().
1340 getMonitoredQuantity(INPUT_WAIT_STAT_KEY);
1341 if (mqPtr.get() != 0)
1343 metricMan_.sendMetric(
"Average Input Wait Time", (mqPtr->getRecentValueSum() / eventCount),
"seconds/event", 3, MetricMode::Average);
// Average event store wait time per event (reported as art queue wait time).
1346 mqPtr = artdaq::StatisticsCollection::getInstance().
1347 getMonitoredQuantity(STORE_EVENT_WAIT_STAT_KEY);
1348 if (mqPtr.get() != 0)
1350 metricMan_.sendMetric(
"Avg art Queue Wait Time", (mqPtr->getRecentValueSum() / eventCount),
"seconds/event", 3, MetricMode::Average);
// Average shared memory copy time per event.
1353 mqPtr = artdaq::StatisticsCollection::getInstance().
1354 getMonitoredQuantity(SHM_COPY_TIME_STAT_KEY);
1355 if (mqPtr.get() != 0)
1357 metricMan_.sendMetric(
"Avg Shared Memory Copy Time", (mqPtr->getRecentValueSum() / eventCount),
"seconds/event", 4, MetricMode::Average);
// Average file size check time per event.
1360 mqPtr = artdaq::StatisticsCollection::getInstance().
1361 getMonitoredQuantity(FILE_CHECK_TIME_STAT_KEY);
1362 if (mqPtr.get() != 0)
1364 metricMan_.sendMetric(
"Average File Check Time", (mqPtr->getRecentValueSum() / eventCount),
"seconds/event", 4, MetricMode::Average);
static const std::string INPUT_EVENTS_STAT_KEY
Key for the Input Events MonitoredQuantity.
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
static const std::string SHM_COPY_TIME_STAT_KEY
Key for the Shared Memory Copy Time MonitoredQuantity.
Sends Fragment objects using TransferInterface plugins. Uses Routing Tables if configured, otherwise will Round-Robin Fragments to the destinations.
bool reinitialize(fhicl::ParameterSet const &pset)
Reinitializes the AggregatorCore. No-Op.
bool stop()
Stops the AggregatorCore.
bool shutdown()
Shuts Down the AggregatorCore.
Keep track of the count of Fragments received from a set of sources.
bool start(art::RunID id)
Start the AggregatorCore.
The Fragment was successfully inserted.
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::string unregister_monitor(std::string const &label)
Delete the TransferInterface having the given unique label.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
AggregatorCore(int rank, std::string name)
AggregatorCore Constructor.
static const std::string INPUT_WAIT_STAT_KEY
Key for the Input Wait MonitoredQuantity.
bool soft_initialize(fhicl::ParameterSet const &pset)
Soft-Initializes the AggregatorCore. No-Op.
bool resume()
Resumes the AggregatorCore.
std::string register_monitor(fhicl::ParameterSet const &pset)
Create a new TransferInterface instance using the given configuration.
static const std::string FILE_CHECK_TIME_STAT_KEY
Key for the File Check Time MonitoredQuantity.
void setSlot(size_t slot, size_t val)
Set the given slot to the given value.
void display_bits(void *memstart, size_t nbytes, std::string sourcename)
Write out memory, for debugging purposes.
The EventStore class collects Fragment objects, until it receives a complete event, at which point the event is handed over to the art thread.
This TransferInterface is a Sender.
void incSlot(size_t slot)
Increment the given slot by one.
Receives Fragment objects from one or more DataSenderManager instances using TransferInterface plugin...
std::string report(std::string const &which) const
Send a report on a given run-time quantity.
The EventStore is full, but the Fragment was accepted as it is for an already-open event...
bool initialize(fhicl::ParameterSet const &pset)
Processes the initialize request.
bool pause()
Pauses the AggregatorCore.
int( ART_CFGSTRING_FCN)(const std::string &)
An art function that accepts a fhicl::ParameterSet as a string.
The Fragment was rejected, because the RawEventQueue is full.
static const std::string STORE_EVENT_WAIT_STAT_KEY
Key for the EventStore Event Wait MonitoredQuantity.
size_t process_fragments()
The main working loop of the AggregatorCore. Receives events from DataReceiverManager and processes t...