2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
9 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
12 : SharedMemoryManager(pset.get<uint32_t>(
"shared_memory_key", 0xBEE70000 + getpid()),
13 pset.get<size_t>(
"buffer_count"),
14 pset.has_key(
"max_event_size_bytes") ? pset.get<size_t>(
"max_event_size_bytes") : pset.get<size_t>(
"expected_fragments_per_event") * pset.get<size_t>(
"max_fragment_size_bytes"),
15 pset.get<size_t>(
"stale_buffer_timeout_usec", pset.get<size_t>(
"event_queue_wait_time", 5) * 1000000),
16 !pset.get<bool>(
"broadcast_mode", false))
17 , num_art_processes_(pset.get<size_t>(
"art_analyzer_count", 1))
18 , num_fragments_per_event_(pset.get<size_t>(
"expected_fragments_per_event"))
19 , queue_size_(pset.get<size_t>(
"buffer_count"))
22 , subrun_rollover_event_(Fragment::InvalidSequenceID)
23 , last_released_event_(0)
24 , update_run_ids_(pset.get<bool>(
"update_run_ids_on_new_fragment", true))
25 , use_sequence_id_for_event_number_(pset.get<bool>(
"use_sequence_id_for_event_number", true))
26 , overwrite_mode_(!pset.get<bool>(
"use_art", true) || pset.get<bool>(
"overwrite_mode", false) || pset.get<bool>(
"broadcast_mode", false))
27 , send_init_fragments_(pset.get<bool>(
"send_init_fragments", true))
29 , buffer_writes_pending_()
30 , incomplete_event_report_interval_ms_(pset.get<int>(
"incomplete_event_report_interval_ms", -1))
31 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
32 , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
33 , broadcast_timeout_ms_(pset.get<int>(
"fragment_broadcast_timeout_ms", 3000))
35 , run_incomplete_event_count_(0)
36 , subrun_event_count_(0)
37 , subrun_incomplete_event_count_(0)
38 , oversize_fragment_count_(0)
39 , maximum_oversize_fragment_count_(pset.get<int>(
"maximum_oversize_fragment_count", 1))
42 , always_restart_art_(pset.get<bool>(
"restart_crashed_art_processes", true))
43 , manual_art_(pset.get<bool>(
"manual_art", false))
44 , current_art_pset_(art_pset)
45 , minimum_art_lifetime_s_(pset.get<double>(
"minimum_art_lifetime_s", 2.0))
46 , art_event_processing_time_us_(pset.get<size_t>(
"expected_art_event_processing_time_us", 100000))
49 , broadcasts_(pset.get<uint32_t>(
"broadcast_shared_memory_key", 0xCEE70000 + getpid()),
50 pset.get<size_t>(
"broadcast_buffer_count", 10),
51 pset.get<size_t>(
"broadcast_buffer_size", 0x100000),
52 pset.get<int>(
"expected_art_event_processing_time_us", 100000) * pset.get<size_t>(
"buffer_count"), false)
54 SetMinWriteSize(
sizeof(detail::RawEventHeader) +
sizeof(detail::RawFragmentHeader));
55 broadcasts_.SetMinWriteSize(
sizeof(detail::RawEventHeader) +
sizeof(detail::RawFragmentHeader));
57 if (pset.get<
bool>(
"use_art",
true) ==
false)
59 TLOG(TLVL_INFO) <<
"BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
60 num_art_processes_ = 0;
64 TLOG(TLVL_INFO) <<
"BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
65 TLOG(TLVL_TRACE) <<
"art_pset is " << art_pset.to_string();
67 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
69 if (overwrite_mode_ && num_art_processes_ > 0)
71 TLOG(TLVL_WARNING) <<
"Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
73 else if (overwrite_mode_)
75 TLOG(TLVL_INFO) <<
"Overwrite Mode enabled, no configured art processes at startup";
78 for (
size_t ii = 0; ii < size(); ++ii)
80 buffer_writes_pending_[ii] = 0;
83 if (!IsValid())
throw cet::exception(app_name +
"_SharedMemoryEventManager") <<
"Unable to attach to Shared Memory!";
85 TLOG(TLVL_TRACE) <<
"Setting Writer rank to " << my_rank;
87 TLOG(TLVL_DEBUG) <<
"Writer Rank is " << GetRank();
90 TLOG(TLVL_TRACE) <<
"END CONSTRUCTOR";
95 TLOG(TLVL_TRACE) <<
"DESTRUCTOR";
96 if (running_) endOfData();
97 TLOG(TLVL_TRACE) <<
"Destructor END";
// Adds one complete Fragment to its event by copying it into the shared-memory
// buffer associated with the Fragment's sequence ID.
//
// frag    - header of the Fragment; word_count, sequence_id and timestamp are read.
// dataPtr - start of the bytes to copy; frag.word_count * sizeof(RawDataType) bytes
//           are written. The FragmentPtr overload passes the Fragment's header
//           address here, so the copy covers header and payload together.
//
// Returns false when getBufferForSequenceID_ reports -1.
// NOTE(review): this listing omits the braces and several statements of the
// function (other return statements, the guard around the ERROR log, the guard
// around complete_buffer_), so the remaining return paths cannot be stated here.
100 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag,
void* dataPtr)
102 TLOG(TLVL_TRACE) <<
"AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
103 <<
", sequence_id=" << frag.sequence_id;
// Look up the buffer for this sequence ID; create_new=true allows a new buffer to be claimed.
104 auto buffer = getBufferForSequenceID_(frag.sequence_id,
true, frag.timestamp);
105 TLOG(TLVL_TRACE) <<
"Using buffer " << buffer;
// -1: no buffer was obtained; report failure to the caller.
106 if (buffer == -1)
return false;
// NOTE(review): the condition guarding this message is not visible; presumably it is
// the -2 result that DoneWritingFragment also checks for — confirm.
109 TLOG(TLVL_ERROR) <<
"Dropping event because data taking has already passed this event number: " << frag.sequence_id;
113 auto hdr = getEventHeader_(buffer);
// Stamp the event header with the manager's current run/subrun.
// NOTE(review): any condition on these assignments is not visible in this listing.
116 hdr->run_id = run_id_;
117 hdr->subrun_id = subrun_id_;
120 TLOG(TLVL_TRACE) <<
"AddFragment before Write calls";
// word_count is in units of RawDataType, so convert to bytes for Write.
121 Write(buffer, dataPtr, frag.word_count *
sizeof(RawDataType));
123 TLOG(TLVL_TRACE) <<
"Checking for complete event";
124 auto fragmentCount = GetFragmentCount(frag.sequence_id);
// Complete only when the expected number of Fragments is present AND no
// WriteFragmentHeader/DoneWritingFragment pair is still outstanding on this buffer.
125 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
126 TLOG(TLVL_TRACE) <<
"hdr->is_complete=" << std::boolalpha << hdr->is_complete
127 <<
", fragmentCount=" << fragmentCount
128 <<
", num_fragments_per_event=" << num_fragments_per_event_
129 <<
", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
// complete_buffer_ itself re-checks hdr->is_complete before queueing the buffer.
131 complete_buffer_(buffer);
132 if (requests_) requests_->SendRequest(
true);
134 TLOG(TLVL_TRACE) <<
"AddFragment END";
// Convenience overload: adds an owned Fragment, retrying until it is accepted or
// timeout_usec microseconds have elapsed.
//
// frag         - Fragment to add (taken by value).
// timeout_usec - upper bound on the retry loop, in microseconds.
// outfrag      - receives `frag` via std::move.
//                NOTE(review): the condition under which it is handed back is not
//                visible in this listing (presumably only when the add failed) — confirm.
//
// NOTE(review): the declaration of `sts` and the return statement are missing from
// this listing; the final trace message logs `sts` as the value being returned.
138 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag,
size_t timeout_usec, FragmentPtr& outfrag)
140 TLOG(TLVL_TRACE) <<
"AddFragment(FragmentPtr) BEGIN";
// Copy the header out of the Fragment so it can be passed by value below.
141 auto hdr = *
reinterpret_cast<detail::RawFragmentHeader*
>(frag->headerAddress());
// The data pointer is the header address, so header and payload are copied together.
142 auto data = frag->headerAddress();
143 auto start = std::chrono::steady_clock::now();
// Retry the header/pointer overload, sleeping 1000 us after each failed attempt.
145 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
147 sts = AddFragment(hdr, data);
148 if (!sts) usleep(1000);
152 outfrag = std::move(frag);
154 TLOG(TLVL_TRACE) <<
"AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
160 TLOG(14) <<
"WriteFragmentHeader BEGIN";
161 auto buffer = getBufferForSequenceID_(frag.sequence_id,
true, frag.timestamp);
165 if (buffer == -1 && !dropIfNoBuffersAvailable)
return nullptr;
168 TLOG(TLVL_ERROR) <<
"Dropping fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" because data taking has already passed this event.";
172 TLOG(TLVL_ERROR) <<
"Dropping fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" because there is no room in the queue and reliable mode is off.";
174 dropped_data_.reset(
new Fragment(frag.word_count - frag.num_words()));
175 return dropped_data_->dataBegin();
180 metricMan->sendMetric(
"Input Fragment Rate", 1,
"Fragments/s", 1, MetricMode::Rate);
183 buffer_writes_pending_[buffer]++;
184 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
186 auto hdrpos =
reinterpret_cast<RawDataType*
>(GetWritePos(buffer));
187 Write(buffer, &frag, frag.num_words() *
sizeof(RawDataType));
189 auto pos =
reinterpret_cast<RawDataType*
>(GetWritePos(buffer));
190 if (frag.word_count - frag.num_words() > 0)
192 auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) *
sizeof(RawDataType));
196 reinterpret_cast<detail::RawFragmentHeader*
>(hdrpos)->word_count = frag.num_words();
197 reinterpret_cast<detail::RawFragmentHeader*
>(hdrpos)->type = Fragment::InvalidFragmentType;
198 TLOG(TLVL_ERROR) <<
"Dropping fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" because there is no room in the current buffer for this Fragment! (Keeping header)";
199 dropped_data_.reset(
new Fragment(frag.word_count - frag.num_words()));
201 oversize_fragment_count_++;
203 if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
205 throw cet::exception(
"Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
208 return dropped_data_->dataBegin();
211 TLOG(14) <<
"WriteFragmentHeader END";
218 TLOG(TLVL_TRACE) <<
"DoneWritingFragment BEGIN";
219 auto buffer = getBufferForSequenceID_(frag.sequence_id,
false, frag.timestamp);
220 if (buffer == -1) Detach(
true,
"SharedMemoryEventManager",
"getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
221 if (buffer == -2)
return;
222 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
225 TLOG(TLVL_DEBUG) <<
"DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" (type " << (int)frag.type <<
")";
226 auto hdr = getEventHeader_(buffer);
229 hdr->run_id = run_id_;
230 hdr->subrun_id = subrun_id_;
233 buffer_writes_pending_[buffer]--;
234 if (buffer_writes_pending_[buffer] != 0)
236 TLOG(TLVL_TRACE) <<
"Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
239 auto frag_count = GetFragmentCount(frag.sequence_id);
240 hdr->is_complete = frag_count == num_fragments_per_event_;
241 #if ART_SUPPORTS_DUPLICATE_EVENTS
242 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
244 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
248 complete_buffer_(buffer);
249 if (requests_) requests_->SendRequest(
true);
250 TLOG(TLVL_TRACE) <<
"DoneWritingFragment END";
255 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID,
false), type);
260 if (buffer == -1)
return 0;
261 ResetReadPos(buffer);
262 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
266 while (MoreDataInBuffer(buffer))
268 auto fragHdr =
reinterpret_cast<artdaq::detail::RawFragmentHeader*
>(GetReadPos(buffer));
269 IncrementReadPos(buffer, fragHdr->word_count *
sizeof(RawDataType));
270 if (type != Fragment::InvalidFragmentType && fragHdr->type != type)
continue;
271 TLOG(TLVL_TRACE) <<
"Adding Fragment with size=" << fragHdr->word_count <<
" to Fragment count";
282 auto start_time = std::chrono::steady_clock::now();
284 TLOG(TLVL_INFO) <<
"Starting art process with config file " << config_file->getFileName();
286 char* filename =
new char[config_file->getFileName().length() + 1];
287 strcpy(filename, config_file->getFileName().c_str());
289 std::vector<char*> args{ (
char*)
"art", (
char*)
"-c", filename, NULL };
301 std::string envVarKey =
"ARTDAQ_PARTITION_NUMBER";
303 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
305 TLOG(TLVL_ERROR) <<
"Error setting environment variable \"" << envVarKey
306 <<
"\" in the environment of a child art process. "
307 <<
"This may result in incorrect TCP port number "
308 <<
"assignments or other issues, and data may "
309 <<
"not flow through the system correctly.";
312 execvp(
"art", &args[0]);
321 std::cout <<
"Please run the following command in a separate terminal:" << std::endl
322 <<
"art -c " << config_file->getFileName() << std::endl
323 <<
"Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() <<
"\" and note the PID of the art process." << std::endl
324 <<
"Finally, return to this window and enter the pid: " << std::endl;
329 TLOG(TLVL_INFO) <<
"PID of new art process is " << pid;
330 art_processes_.insert(pid);
332 auto sts = waitid(P_PID, pid, &status, WEXITED);
333 TLOG(TLVL_INFO) <<
"Removing PID " << pid <<
" from process list";
334 art_processes_.erase(pid);
337 TLOG(TLVL_WARNING) <<
"Error occurred in waitid for art process " << pid <<
": " << errno <<
" (" << strerror(errno) <<
").";
339 else if (status.si_code == CLD_EXITED && status.si_status == 0)
341 TLOG(TLVL_INFO) <<
"art process " << pid <<
" exited normally, " << (restart_art_ ?
"restarting" :
"not restarting");
345 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
346 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ =
false;
348 auto exit_type =
"exited with status code";
349 switch (status.si_code)
353 exit_type =
"was killed with signal";
360 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
361 <<
"art process " << pid <<
" " << exit_type <<
" " << status.si_status
362 << (status.si_code == CLD_DUMPED ?
" (core dumped)" :
"")
363 <<
" after " << std::setprecision(2) << art_lifetime <<
" seconds, "
364 << (restart_art_ ?
"restarting" :
"not restarting");
366 }
while (restart_art_);
371 restart_art_ = always_restart_art_;
372 if (num_art_processes_ == 0)
return;
373 for (
size_t ii = 0; ii < num_art_processes_; ++ii)
375 StartArtProcess(current_art_pset_);
381 static std::mutex start_art_mutex;
382 std::unique_lock<std::mutex> lk(start_art_mutex);
384 restart_art_ = always_restart_art_;
385 auto initialCount = GetAttachedCount();
386 auto startTime = std::chrono::steady_clock::now();
388 if (pset != current_art_pset_ || !current_art_config_file_)
390 current_art_pset_ = pset;
391 current_art_config_file_ = std::make_shared<art_config_file>(pset);
394 boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
397 auto currentCount = GetAttachedCount() - initialCount;
398 while ((currentCount < 1 || pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
401 currentCount = GetAttachedCount() - initialCount;
403 if ((currentCount < 1 || pid <= 0) && manual_art_)
405 TLOG(TLVL_WARNING) <<
"Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount <<
", PID:" << pid;
408 else if (currentCount < 1 || pid <= 0)
410 TLOG(TLVL_WARNING) <<
"art process has not started after 5s. Check art configuration!"
411 <<
" (pid=" << pid <<
", attachedCount=" << currentCount <<
")";
416 TLOG(TLVL_INFO) << std::setw(4) << std::fixed <<
"art initialization took "
417 << TimeUtils::GetElapsedTime(startTime) <<
" seconds.";
426 restart_art_ =
false;
430 auto check_pids = [&](
bool print) {
432 for (
auto pid = pids.begin(); pid != pids.end();)
437 TLOG(TLVL_WARNING) <<
"Removing an invalid PID (" << *pid
438 <<
") from the shutdown list.";
439 pid = pids.erase(pid);
441 else if (kill(*pid, 0) < 0)
443 pid = pids.erase(pid);
447 if (print) std::cout << *pid <<
" ";
453 if (pids.size() == 0)
455 TLOG(14) <<
"All art processes already exited, nothing to do.";
462 TLOG(TLVL_TRACE) <<
"Gently informing art processes that it is time to shut down";
463 for (
auto pid : pids)
465 TLOG(TLVL_TRACE) <<
"Sending SIGQUIT to pid " << pid;
469 int graceful_wait_ms = 5000;
470 int int_wait_ms = 1000;
472 TLOG(TLVL_TRACE) <<
"Waiting up to " << graceful_wait_ms <<
" ms for all art processes to exit gracefully";
473 for (
int ii = 0; ii < graceful_wait_ms; ++ii)
478 if (pids.size() == 0)
480 TLOG(TLVL_TRACE) <<
"All art processes exited after " << ii <<
" ms.";
485 TLOG(TLVL_TRACE) <<
"Insisting that the art processes shut down";
486 for (
auto pid : pids)
491 TLOG(TLVL_TRACE) <<
"Waiting up to " << int_wait_ms <<
" ms for all art processes to exit";
492 for (
int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
498 if (pids.size() == 0)
500 TLOG(TLVL_TRACE) <<
"All art processes exited after " << ii <<
" ms.";
505 TLOG(TLVL_TRACE) <<
"Killing remaning art processes with extreme prejudice";
506 while (pids.size() > 0)
508 kill(*pids.begin(), SIGKILL);
516 std::cout <<
"Please shut down all art processes, then hit return/enter" << std::endl;
517 while (pids.size() > 0)
519 std::cout <<
"The following PIDs are running: ";
521 std::cout << std::endl;
530 TLOG(TLVL_DEBUG) <<
"ReconfigureArt BEGIN";
531 if (restart_art_ || !always_restart_art_)
535 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
537 broadcasts_.MarkBufferEmpty(ii,
true);
539 if (newRun == 0) newRun = run_id_ + 1;
541 if (art_pset != current_art_pset_ || !current_art_config_file_)
543 current_art_pset_ = art_pset;
544 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
547 if (n_art_processes != -1)
549 TLOG(TLVL_INFO) <<
"Setting number of art processes to " << n_art_processes;
550 num_art_processes_ = n_art_processes;
553 TLOG(TLVL_DEBUG) <<
"ReconfigureArt END";
558 init_fragment_.reset(
nullptr);
559 TLOG(TLVL_TRACE) <<
"SharedMemoryEventManager::endOfData";
560 restart_art_ =
false;
562 size_t initialStoreSize = GetIncompleteEventCount();
563 TLOG(TLVL_TRACE) <<
"endOfData: Flushing " << initialStoreSize
564 <<
" stale events from the SharedMemoryEventManager.";
565 int counter = initialStoreSize;
566 while (active_buffers_.size() > 0 && counter > 0)
568 complete_buffer_(*active_buffers_.begin());
571 TLOG(TLVL_TRACE) <<
"endOfData: Done flushing, there are now " << GetIncompleteEventCount()
572 <<
" stale events in the SharedMemoryEventManager.";
575 TLOG(TLVL_TRACE) <<
"Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) <<
" outstanding buffers...";
576 auto start = std::chrono::steady_clock::now();
577 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
578 auto end_of_data_wait_us = art_event_processing_time_us_ * size();
581 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && art_processes_.size() > 0)
583 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
584 if (temp != lastReadCount)
586 TLOG(TLVL_TRACE) <<
"Waiting for " << temp <<
" outstanding buffers...";
587 lastReadCount = temp;
588 start = std::chrono::steady_clock::now();
590 if (lastReadCount > 0) usleep(art_event_processing_time_us_);
592 TLOG(TLVL_TRACE) <<
"endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount <<
", time waited: " << TimeUtils::GetElapsedTime(start) <<
" s / " << (end_of_data_wait_us / 1000000.0) <<
" s, art process count: " << art_processes_.size();
594 TLOG(TLVL_TRACE) <<
"endOfData: Broadcasting EndOfData Fragment";
595 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
596 bool success = broadcastFragment_(std::move(outFrag), outFrag);
599 TLOG(TLVL_TRACE) <<
"endOfData: Clearing buffers to make room for EndOfData Fragment";
600 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
602 broadcasts_.MarkBufferEmpty(ii,
true);
604 broadcastFragment_(std::move(outFrag), outFrag);
606 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
608 if (art_processes_.size() > 0)
610 TLOG(TLVL_DEBUG) <<
"Allowing " << art_processes_.size() <<
" art processes the chance to end gracefully";
611 if (end_of_data_wait_us == 0)
613 TLOG(TLVL_DEBUG) <<
"Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
614 end_of_data_wait_us = 100 * 1000000;
617 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
618 for (
size_t ii = 0; ii < sleep_count; ++ii)
621 if (art_processes_.size() == 0)
break;
625 while (art_processes_.size() > 0)
627 TLOG(TLVL_DEBUG) <<
"There are " << art_processes_.size() <<
" art processes remaining. Proceeding to shutdown.";
628 ShutdownArtProcesses(art_processes_);
630 TLOG(TLVL_INFO) <<
"It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) <<
" s for all art processes to close after sending EndOfData Fragment";
632 ResetAttachedCount();
634 TLOG(TLVL_TRACE) <<
"endOfData: Clearing buffers";
635 for (
size_t ii = 0; ii < size(); ++ii)
637 MarkBufferEmpty(ii,
true);
645 released_incomplete_events_.clear();
647 TLOG(TLVL_TRACE) <<
"endOfData: Shutting down RequestReceiver";
648 requests_.reset(
nullptr);
650 TLOG(TLVL_TRACE) <<
"endOfData END";
651 TLOG(TLVL_INFO) <<
"EndOfData Complete. There were " << GetLastSeenBufferID() <<
" buffers processed.";
659 init_fragment_.reset(
nullptr);
660 TLOG(TLVL_TRACE) <<
"startRun: Clearing broadcast buffers";
661 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
663 broadcasts_.MarkBufferEmpty(ii,
true);
668 subrun_rollover_event_ = Fragment::InvalidSequenceID;
669 last_released_event_ = 0;
671 if (requests_) requests_->SendRoutingToken(queue_size_);
672 TLOG(TLVL_DEBUG) <<
"Starting run " << run_id_
673 <<
", max queue size = "
676 << GetLockedBufferCount();
679 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
680 metricMan->sendMetric(
"Run Number", runSubrun,
"Run:Subrun", 1, MetricMode::LastPoint);
687 subrun_rollover_event_ = Fragment::InvalidSequenceID;
690 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
691 metricMan->sendMetric(
"Run Number", runSubrun,
"Run:Subrun", 1, MetricMode::LastPoint);
697 TLOG(TLVL_INFO) <<
"Ending run " << run_id_;
698 FragmentPtr endOfRunFrag(
new
699 Fragment(static_cast<size_t>
700 (ceil(
sizeof(my_rank) /
701 static_cast<double>(
sizeof(Fragment::value_type))))));
703 TLOG(TLVL_DEBUG) <<
"Broadcasting EndOfRun Fragment";
704 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
705 *endOfRunFrag->dataBegin() = my_rank;
706 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
708 TLOG(TLVL_INFO) <<
"Run " << run_id_ <<
" has ended. There were " << run_event_count_ <<
" events in this run.";
709 run_event_count_ = 0;
710 run_incomplete_event_count_ = 0;
711 oversize_fragment_count_ = 0;
717 TLOG(TLVL_INFO) <<
"Ending subrun " << subrun_id_;
718 std::unique_ptr<artdaq::Fragment>
720 Fragment(static_cast<size_t>
721 (ceil(
sizeof(my_rank) /
722 static_cast<double>(
sizeof(Fragment::value_type))))));
724 TLOG(TLVL_DEBUG) <<
"Broadcasting EndOfSubrun Fragment";
725 endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
726 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
727 *endOfSubrunFrag->dataBegin() = my_rank;
729 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
731 TLOG(TLVL_INFO) <<
"Subrun " << subrun_id_ <<
" in run " << run_id_ <<
" has ended. There were " << subrun_event_count_ <<
" events in this subrun.";
732 subrun_event_count_ = 0;
733 subrun_incomplete_event_count_ = 0;
741 if (boundary == 0 || boundary == Fragment::InvalidSequenceID)
return;
743 if (boundary < last_released_event_)
745 auto logLevel = TLVL_ERROR;
746 bool processAnyway =
false;
747 if (last_released_event_ - boundary < 100)
749 logLevel = TLVL_WARNING;
750 processAnyway =
true;
752 TLOG(logLevel) <<
"Subrun rollover requested for event that is in the past. (delta = " << (last_released_event_ - boundary) <<
").";
753 if (!processAnyway)
return;
755 TLOG(TLVL_INFO) <<
"Will roll over when I reach Sequence ID " << boundary;
764 if (boundary == last_released_event_ + 1) {
765 TLOG(TLVL_INFO) <<
"rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
766 ", boundary is sequence id " << boundary <<
", so will start a new subrun here";
769 subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
772 subrun_rollover_event_ = boundary;
780 metricMan->sendMetric(
"Incomplete Event Count", GetIncompleteEventCount(),
"events", 1, MetricMode::LastPoint);
781 metricMan->sendMetric(
"Pending Event Count", GetPendingEventCount(),
"events", 1, MetricMode::LastPoint);
784 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
786 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
789 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
790 std::ostringstream oss;
791 oss <<
"Incomplete Events (" << num_fragments_per_event_ <<
"): ";
792 for (
auto& ev : active_buffers_)
794 auto hdr = getEventHeader_(ev);
795 oss << hdr->sequence_id <<
" (" << GetFragmentCount(hdr->sequence_id) <<
"), ";
797 TLOG(TLVL_DEBUG) << oss.str();
// Writes a single Fragment, wrapped in its own RawEventHeader, into a buffer of the
// separate `broadcasts_` shared-memory segment and marks that buffer full.
//
// frag    - Fragment to broadcast (taken by value).
// outFrag - NOTE(review): not referenced by any statement visible in this listing;
//           callers pass the same pointer they moved from, so it presumably receives
//           the Fragment back on failure — confirm against the full source.
//
// Waits up to broadcast_timeout_ms_ milliseconds for a free broadcast buffer.
// NOTE(review): the return statements and the guard around the timeout ERROR log are
// missing from this listing.
801 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
803 TLOG(TLVL_DEBUG) <<
"Broadcasting Fragment with seqID=" << frag->sequenceID() <<
", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) <<
", size=" << frag->sizeBytes() <<
"B.";
// First attempt; the `false` argument is the same one used for the non-overwrite
// request in getBufferForSequenceID_.
804 auto buffer = broadcasts_.GetBufferForWriting(
false);
805 TLOG(TLVL_DEBUG) <<
"broadcastFragment_: after getting buffer 1st buffer=" << buffer;
806 auto start_time = std::chrono::steady_clock::now();
// Keep asking for a buffer until one is returned or the broadcast timeout expires.
807 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
810 buffer = broadcasts_.GetBufferForWriting(
false);
812 TLOG(TLVL_DEBUG) <<
"broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer <<
", elapsed time=" << TimeUtils::GetElapsedTime(start_time) <<
" s.";
// NOTE(review): guard not visible; per the message this is the buffer == -1 path.
815 TLOG(TLVL_ERROR) <<
"Broadcast of fragment type " << frag->typeString() <<
" failed due to timeout waiting for buffer!";
820 TLOG(TLVL_DEBUG) <<
"broadcastFragment_: Filling in RawEventHeader";
// The event header lives at the very start of the buffer.
821 auto hdr =
reinterpret_cast<detail::RawEventHeader*
>(broadcasts_.GetBufferStart(buffer));
822 hdr->run_id = run_id_;
823 hdr->subrun_id = subrun_id_;
824 hdr->sequence_id = frag->sequenceID();
// A broadcast event is marked complete as soon as it is written.
825 hdr->is_complete =
true;
// Skip the write position past the header that was just filled in place.
826 broadcasts_.IncrementWritePos(buffer,
sizeof(detail::RawEventHeader));
828 TLOG(TLVL_DEBUG) <<
"broadcastFragment_ before Write calls";
// frag->size() is multiplied by sizeof(RawDataType) to obtain a byte count.
829 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() *
sizeof(RawDataType));
831 TLOG(TLVL_DEBUG) <<
"broadcastFragment_ Marking buffer full";
// NOTE(review): meaning of the -1 argument is defined by SharedMemoryManager, not shown here.
832 broadcasts_.MarkBufferFull(buffer, -1);
834 TLOG(TLVL_DEBUG) <<
"broadcastFragment_ Complete";
// Returns the start address of the given buffer, reinterpreted as a pointer to
// the RawEventHeader stored there.
//
// buffer - buffer index; it is passed to GetBufferStart unchecked.
838 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(
int buffer)
840 return reinterpret_cast<detail::RawEventHeader*
>(GetBufferStart(buffer));
// Finds the buffer currently holding the event with the given sequence ID, or
// (when create_new is true) claims and initialises a fresh buffer for it.
//
// seqID      - sequence ID of the event being looked up.
// create_new - when false, a missing event yields -1 instead of a new buffer.
// timestamp  - used as the event number when use_sequence_id_for_event_number_ is
//              false, and forwarded to requests_->AddRequest when it is valid.
//
// Returns -1 when no buffer exists and none may be / could be claimed.
// NOTE(review): the return statements for the "found" case, the "already released"
// case and the final success case are missing from this listing, as is the #endif
// matching the #if below. Callers in this file compare the result against -1 and -2.
843 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID,
bool create_new, Fragment::timestamp_t timestamp)
// The static sequence_id_mutex_ is held for the rest of the function.
845 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
846 TLOG(14) <<
"getBufferForSequenceID " << seqID <<
" BEGIN";
// Linear search of the buffers this manager owns for a matching sequence ID.
847 auto buffers = GetBuffersOwnedByManager();
848 for (
auto& buf : buffers)
850 auto hdr = getEventHeader_(buf);
851 if (hdr->sequence_id == seqID)
853 TLOG(14) <<
"getBufferForSequenceID " << seqID <<
" returning " << buf;
858 #if !ART_SUPPORTS_DUPLICATE_EVENTS
// Without duplicate-event support, an event that was already released incomplete
// is reported as an error rather than being given a second buffer.
859 if (released_incomplete_events_.count(seqID))
861 TLOG(TLVL_ERROR) <<
"Event " << seqID <<
" has already been marked \"Incomplete\" and sent to art!";
866 if (!create_new)
return -1;
// Give queued buffers a chance to be released first; the held lock is passed along.
868 check_pending_buffers_(lk);
// Try for a free buffer without overwriting first ...
869 int new_buffer = GetBufferForWriting(
false);
// ... and only then fall back to the configured overwrite_mode_.
871 if (new_buffer == -1)
873 new_buffer = GetBufferForWriting(overwrite_mode_);
876 if (new_buffer == -1)
return -1;
877 std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
// Initialise the event header in place at the start of the new buffer.
879 auto hdr = getEventHeader_(new_buffer);
880 hdr->is_complete =
false;
881 hdr->run_id = run_id_;
882 hdr->subrun_id = subrun_id_;
// Both sources are narrowed to 32 bits for the event number.
883 hdr->event_id = use_sequence_id_for_event_number_ ?
static_cast<uint32_t
>(seqID) : static_cast<uint32_t>(timestamp);
884 hdr->sequence_id = seqID;
885 buffer_writes_pending_[new_buffer] = 0;
// Fragment data starts immediately after the event header.
886 IncrementWritePos(new_buffer,
sizeof(detail::RawEventHeader));
887 SetMFIteration(
"Sequence ID " + std::to_string(seqID));
889 active_buffers_.insert(new_buffer);
// NOTE(review): any null check on requests_ around the next calls is not visible here.
893 if (timestamp != Fragment::InvalidTimestamp)
895 requests_->AddRequest(seqID, timestamp);
897 requests_->SendRequest();
899 TLOG(14) <<
"getBufferForSequenceID " << seqID <<
" returning newly initialized buffer " << new_buffer;
// Reports whether the given buffer holds any data beyond its RawEventHeader.
//
// buffer - buffer index; -1 is answered with true without touching any buffer.
//
// NOTE(review): the statement executed when CheckBuffer(..., Writing) fails is
// missing from this listing.
903 bool artdaq::SharedMemoryEventManager::hasFragments_(
int buffer)
905 if (buffer == -1)
return true;
906 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
// Rewind the read position, skip the event header, then ask whether data remains.
910 ResetReadPos(buffer);
911 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
912 return MoreDataInBuffer(buffer);
// Handles a buffer whose event header is flagged complete: drops its data request,
// sends a routing token, and moves it from the active set to the pending set.
// Finishes by running check_pending_buffers_.
//
// buffer - index of the buffer to examine.
//
// NOTE(review): braces are missing from this listing, so which of the statements
// below are inside the is_complete branch (and whether requests_ is null-checked)
// cannot be confirmed here.
915 void artdaq::SharedMemoryEventManager::complete_buffer_(
int buffer)
917 auto hdr = getEventHeader_(buffer);
918 if (hdr->is_complete)
920 TLOG(TLVL_DEBUG) <<
"complete_buffer_: This fragment completes event " << hdr->sequence_id <<
".";
924 requests_->RemoveRequest(hdr->sequence_id);
925 requests_->SendRoutingToken(1);
// The active/pending sets are modified under sequence_id_mutex_.
928 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
929 active_buffers_.erase(buffer);
930 pending_buffers_.insert(buffer);
// Called with no argument, so the declaration must supply a default for the lock
// parameter (declaration not shown here).
933 check_pending_buffers_();
// Ordering predicate for buffers: true when the event in bufA has a strictly
// smaller sequence ID than the event in bufB. Used to sort pending buffers.
936 bool artdaq::SharedMemoryEventManager::bufferComparator(
int bufA,
int bufB)
938 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
// Housekeeping pass over the event buffers. In order, the visible statements:
//   1. move manager-owned buffers for which ResetBuffer() returns true into the
//      pending set, recording them as incomplete events;
//   2. determine the lowest sequence ID still being assembled;
//   3. release pending buffers to art in sequence-ID order, stopping at the first
//      one whose sequence ID exceeds that lowest active ID;
//   4. publish event and shared-memory metrics.
//
// lock - a lock object supplied by the caller; in the visible code it is only
//        inspected (owns_lock) for the trace message.
//
// NOTE(review): this listing omits braces and several statements (the declaration
// of `counter`, run_event_count_ updates, the subrun-rollover actions, and the
// guards around the metric calls). Comments below describe only what is visible.
941 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex>
const& lock)
943 TLOG(TLVL_TRACE) <<
"check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
// --- Step 1: sweep buffers that ResetBuffer() flags and that are not already pending.
// What ResetBuffer tests is defined elsewhere; the warning below calls such events stale.
945 auto buffers = GetBuffersOwnedByManager();
946 for (
auto buf : buffers)
948 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
950 auto hdr = getEventHeader_(buf);
951 if (active_buffers_.count(buf))
// NOTE(review): any null check on requests_ here is not visible.
955 requests_->RemoveRequest(hdr->sequence_id);
956 requests_->SendRoutingToken(1);
958 active_buffers_.erase(buf);
959 pending_buffers_.insert(buf);
960 subrun_incomplete_event_count_++;
961 run_incomplete_event_count_++;
962 if (metricMan) metricMan->sendMetric(
"Incomplete Event Rate", 1,
"events/s", 3, MetricMode::Rate);
// First release of this sequence ID: remember how many Fragments were missing.
963 if (!released_incomplete_events_.count(hdr->sequence_id))
965 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
// Otherwise reduce the outstanding count by what this buffer contains.
// NOTE(review): the `else` separating this from the assignment above is not visible.
969 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
971 TLOG(TLVL_WARNING) <<
"Active event " << hdr->sequence_id <<
" is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] <<
" Fragments) to art.";
// --- Step 2: find the lowest sequence ID among active buffers. It is only computed
// when ReadyForWrite(false) holds; otherwise it keeps the InvalidSequenceID sentinel.
977 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
980 if (ReadyForWrite(
false))
982 for (
auto buf : active_buffers_)
984 auto hdr = getEventHeader_(buf);
985 TLOG(TLVL_TRACE) <<
"Buffer: " << buf <<
", SeqID: " << hdr->sequence_id <<
", ACTIVE";
986 if (hdr->sequence_id < lowestSeqId)
988 lowestSeqId = hdr->sequence_id;
991 TLOG(TLVL_TRACE) <<
"Lowest SeqID held: " << lowestSeqId;
// --- Step 3: release pending buffers in ascending sequence-ID order.
994 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
995 sorted_buffers.sort([
this](
int a,
int b) {
return bufferComparator(a, b); });
998 double eventSize = 0;
999 for (
auto buf : sorted_buffers)
1001 auto hdr = getEventHeader_(buf);
// Hold back anything newer than the oldest event still being assembled.
1002 if (hdr->sequence_id > lowestSeqId)
break;
// NOTE(review): only the log message of the rollover branch is visible here.
1004 if (hdr->sequence_id >= subrun_rollover_event_)
1006 TLOG(TLVL_INFO) <<
"Subrun rollover reached at event " << hdr->sequence_id <<
" (boundary=" << subrun_rollover_event_ <<
"), last released event is " << last_released_event_ <<
".";
// Track the highest sequence ID released so far.
1010 if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
1012 TLOG(TLVL_DEBUG) <<
"Releasing event " << hdr->sequence_id <<
" in buffer " << buf <<
" to art.";
1013 MarkBufferFull(buf);
1014 subrun_event_count_++;
1017 eventSize += BufferDataSize(buf);
1018 pending_buffers_.erase(buf);
// Convert the accumulated size into a per-event average.
// NOTE(review): `counter` is declared and updated on lines missing from this listing;
// confirm it cannot be zero here.
1020 eventSize /= counter;
// --- Step 4: metrics.
// NOTE(review): the metricMan null check guarding these calls is not visible here.
1022 TLOG(TLVL_TRACE) <<
"check_pending_buffers_: Sending Metrics";
1026 metricMan->sendMetric(
"Event Rate", counter,
"Events/s", 1, MetricMode::Rate);
1027 metricMan->sendMetric(
"Events Released to art this run", run_event_count_,
"Events", 1, MetricMode::LastPoint);
1028 metricMan->sendMetric(
"Incomplete Events Released to art this run", run_incomplete_event_count_,
"Events", 1, MetricMode::LastPoint);
1029 metricMan->sendMetric(
"Events Released to art this subrun", subrun_event_count_,
"Events", 2, MetricMode::LastPoint);
1030 metricMan->sendMetric(
"Incomplete Events Released to art this subrun", subrun_incomplete_event_count_,
"Events", 2, MetricMode::LastPoint);
1031 metricMan->sendMetric(
"Event Size", eventSize,
"Bytes", 1, MetricMode::Average);
// Buffer-occupancy metrics are throttled to at most one update per 500 ms.
1033 if (TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500)
1035 last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1036 auto full = ReadReadyCount();
1037 auto empty = WriteReadyCount(overwrite_mode_);
1038 auto total = size();
1040 metricMan->sendMetric(
"Shared Memory Full Buffers", full,
"buffers", 2, MetricMode::LastPoint);
1041 metricMan->sendMetric(
"Shared Memory Available Buffers", empty,
"buffers", 2, MetricMode::LastPoint);
// The cast to double keeps the percentage from being truncated by integer division.
1042 metricMan->sendMetric(
"Shared Memory Full %", full * 100 / static_cast<double>(total),
"%", 2, MetricMode::LastPoint);
1043 metricMan->sendMetric(
"Shared Memory Available %", empty * 100 / static_cast<double>(total),
"%", 2, MetricMode::LastPoint);
1046 TLOG(TLVL_TRACE) <<
"check_pending_buffers_ END";
// Broadcasts the stored init Fragment to art, if one has been received.
// When none is stored but send_init_fragments_ is set, only a warning is logged.
//
// NOTE(review): lines surrounding the file-dump statements are missing from this
// listing, so whether that dump is unconditional or compiled out cannot be told here.
1049 void artdaq::SharedMemoryEventManager::send_init_frag_()
1051 if (init_fragment_ !=
nullptr)
1053 TLOG(TLVL_TRACE) <<
"Sending init Fragment to art...";
// Writes the init Fragment's payload bytes to receiveInitMessage_<rank>.bin in the
// current working directory.
1056 std::string fileName =
"receiveInitMessage_" + std::to_string(my_rank) +
".bin";
1057 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1058 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
// The Fragment is moved into the call while the same member is also passed as the
// out-parameter; what broadcastFragment_ stores back into it is not visible in this listing.
1062 broadcastFragment_(std::move(init_fragment_), init_fragment_);
1063 TLOG(TLVL_TRACE) <<
"Init Fragment sent";
1065 else if (send_init_fragments_)
1067 TLOG(TLVL_WARNING) <<
"Cannot send init fragment because I haven't yet received one!";
1073 if (!init_fragment_ || init_fragment_ ==
nullptr)
1075 init_fragment_.swap(frag);
1080 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
void RunArt(std::shared_ptr< art_config_file > config_file, pid_t &pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
The SharedMemoryEventManager is a SharedMemoryManager which tracks events as they are built...
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
The RequestSender contains methods used to send data requests and Routing tokens. ...
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
static int GetPartitionNumber()
Get the current partition number, as defined by the ARTDAQ_PARTITION_NUMBER environment variable...
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void StartArt()
Start all the art processes.
void ShutdownArtProcesses(std::set< pid_t > pids)
Shutdown a set of art processes.
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
void rolloverSubrun(sequence_id_t boundary)
Rollover the subrun after the specified event.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
void startSubrun()
Start a new Subrun, incrementing the subrun number.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endSubrun()
Send an EndOfSubRunFragment to the art thread.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
bool endOfData()
Indicate that the end of input has been reached to the art processes.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.