2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
9 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
// SharedMemoryEventManager constructor: member-initializer list followed by the constructor body.
// The signature line is not part of this listing (the leading numbers are the original file's line
// numbers; lines that held only braces, `else`, blanks or comments are absent).
// Settings are read from `pset`; `art_pset` is stored as the current art configuration.
//
// Base SharedMemoryManager arguments, in order: "shared_memory_key" (default 0xBEE70000 + getpid()),
// "buffer_count", a buffer size, a stale-buffer timeout, and the negation of "broadcast_mode".
12 : SharedMemoryManager(pset.get<uint32_t>(
"shared_memory_key", 0xBEE70000 + getpid()),
13 pset.get<size_t>(
"buffer_count"),
// Buffer size: "max_event_size_bytes" when that key exists, otherwise
// "expected_fragments_per_event" * "max_fragment_size_bytes".
14 pset.has_key(
"max_event_size_bytes") ? pset.get<size_t>(
"max_event_size_bytes") : pset.get<size_t>(
"expected_fragments_per_event") * pset.get<size_t>(
"max_fragment_size_bytes"),
// Timeout: "stale_buffer_timeout_usec", defaulting to "event_queue_wait_time" (default 5) * 1000000.
15 pset.get<size_t>(
"stale_buffer_timeout_usec", pset.get<size_t>(
"event_queue_wait_time", 5) * 1000000),
16 !pset.get<bool>(
"broadcast_mode", false))
17 , num_art_processes_(pset.get<size_t>(
"art_analyzer_count", 1))
18 , num_fragments_per_event_(pset.get<size_t>(
"expected_fragments_per_event"))
19 , queue_size_(pset.get<size_t>(
"buffer_count"))
22 , subrun_rollover_event_(Fragment::InvalidSequenceID)
23 , last_released_event_(0)
24 , update_run_ids_(pset.get<bool>(
"update_run_ids_on_new_fragment", true))
// Overwrite mode is enabled when "use_art" is false, or when either "overwrite_mode" or
// "broadcast_mode" is true.
25 , overwrite_mode_(!pset.get<bool>(
"use_art", true) || pset.get<bool>(
"overwrite_mode", false) || pset.get<bool>(
"broadcast_mode", false))
26 , send_init_fragments_(pset.get<bool>(
"send_init_fragments", true))
28 , buffer_writes_pending_()
29 , incomplete_event_report_interval_ms_(pset.get<int>(
"incomplete_event_report_interval_ms", -1))
30 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
31 , broadcast_timeout_ms_(pset.get<int>(
"fragment_broadcast_timeout_ms", 3000))
33 , run_incomplete_event_count_(0)
34 , subrun_event_count_(0)
35 , subrun_incomplete_event_count_(0)
38 , always_restart_art_(pset.get<bool>(
"restart_crashed_art_processes", true))
39 , current_art_pset_(art_pset)
40 , minimum_art_lifetime_s_(pset.get<double>(
"minimum_art_lifetime_s", 2.0))
41 , art_event_processing_time_us_(pset.get<size_t>(
"expected_art_event_processing_time_us", 100000))
// broadcasts_ is a second shared-memory segment with its own key, buffer count and buffer size.
// Its fourth argument is "expected_art_event_processing_time_us" * "buffer_count".
// NOTE(review): that key is read as size_t for art_event_processing_time_us_ above but as int
// here; confirm the differing type is intentional.
44 , broadcasts_(pset.get<uint32_t>(
"broadcast_shared_memory_key", 0xCEE70000 + getpid()),
45 pset.get<size_t>(
"broadcast_buffer_count", 10),
46 pset.get<size_t>(
"broadcast_buffer_size", 0x100000),
47 pset.get<int>(
"expected_art_event_processing_time_us", 100000) * pset.get<size_t>(
"buffer_count"), false)
// Both segments get a minimum write size of one event header plus one fragment header.
49 SetMinWriteSize(
sizeof(detail::RawEventHeader) +
sizeof(detail::RawFragmentHeader));
50 broadcasts_.SetMinWriteSize(
sizeof(detail::RawEventHeader) +
sizeof(detail::RawFragmentHeader));
// With "use_art" false, no art processes are configured.
52 if (pset.get<
bool>(
"use_art",
true) ==
false)
54 TLOG(TLVL_INFO) <<
"BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
55 num_art_processes_ = 0;
59 TLOG(TLVL_INFO) <<
"BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
60 TLOG(TLVL_TRACE) <<
"art_pset is " << art_pset.to_string();
62 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
// Overwrite mode together with configured art processes is only warned about, not rejected.
64 if (overwrite_mode_ && num_art_processes_ > 0)
66 TLOG(TLVL_WARNING) <<
"Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
68 else if (overwrite_mode_)
70 TLOG(TLVL_INFO) <<
"Overwrite Mode enabled, no configured art processes at startup";
// Start every buffer with zero writes pending.
73 for (
size_t ii = 0; ii < size(); ++ii)
75 buffer_writes_pending_[ii] = 0;
// Construction fails with a cet::exception when IsValid() reports false.
78 if (!IsValid())
throw cet::exception(app_name +
"_SharedMemoryEventManager") <<
"Unable to attach to Shared Memory!";
80 TLOG(TLVL_TRACE) <<
"Setting Writer rank to " << my_rank;
82 TLOG(TLVL_DEBUG) <<
"Writer Rank is " << GetRank();
85 TLOG(TLVL_TRACE) <<
"END CONSTRUCTOR";
// Destructor body (signature line absent from this listing): when running_ is still set,
// endOfData() is called before the destructor finishes.
90 TLOG(TLVL_TRACE) <<
"DESTRUCTOR";
91 if (running_) endOfData();
92 TLOG(TLVL_TRACE) <<
"Destructor END";
// Copy one complete Fragment (header plus payload, starting at dataPtr) into the buffer that holds
// its event.
//   frag    - header of the Fragment; sequence_id selects the buffer, word_count gives the size.
//   dataPtr - start of the Fragment data to copy (frag.word_count * sizeof(RawDataType) bytes).
// Returns false when no buffer could be obtained (buffer == -1), so the caller may retry.
// Several lines of the original (braces, the condition guarding the "Dropping event" message and
// the remaining return statements) are absent from this listing.
95 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag,
void* dataPtr)
97 TLOG(TLVL_TRACE) <<
"AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
98 <<
", sequence_id=" << std::to_string(frag.sequence_id);
// Look up the buffer for this sequence ID, creating one if needed.
99 auto buffer = getBufferForSequenceID_(frag.sequence_id,
true, frag.timestamp);
100 TLOG(TLVL_TRACE) <<
"Using buffer " << std::to_string(buffer);
101 if (buffer == -1)
return false;
// NOTE(review): presumably guarded by `buffer == -2` (event already released) in the original;
// the guard is not visible here.
104 TLOG(TLVL_ERROR) <<
"Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id);
108 auto hdr = getEventHeader_(buffer);
111 hdr->run_id = run_id_;
112 hdr->subrun_id = subrun_id_;
115 TLOG(TLVL_TRACE) <<
"AddFragment before Write calls";
116 Write(buffer, dataPtr, frag.word_count *
sizeof(RawDataType));
118 TLOG(TLVL_TRACE) <<
"Checking for complete event";
// The event is complete once it holds the expected number of Fragments and no other writer has
// a write pending on this buffer.
119 auto fragmentCount = GetFragmentCount(frag.sequence_id);
120 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
121 TLOG(TLVL_TRACE) <<
"hdr->is_complete=" << std::boolalpha << hdr->is_complete
122 <<
", fragmentCount=" << std::to_string(fragmentCount)
123 <<
", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
124 <<
", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]);
126 complete_buffer_(buffer);
127 if (requests_) requests_->SendRequest(
true);
129 TLOG(TLVL_TRACE) <<
"AddFragment END";
// Retry wrapper around AddFragment(header, ptr) for an owned Fragment.
//   frag         - Fragment to add; its header address is used both as header and as data start.
//   timeout_usec - retry budget in microseconds, measured from entry to this call.
//   outfrag      - receives `frag` by move (the condition guarding that move is not visible in this
//                  listing; presumably it applies only when the add did not succeed).
// The loop retries every 1000 us until AddFragment succeeds or the budget is exhausted.
133 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag,
size_t timeout_usec, FragmentPtr& outfrag)
135 TLOG(TLVL_TRACE) <<
"AddFragment(FragmentPtr) BEGIN";
// Copy the raw header out of the Fragment; `data` points at the same address.
136 auto hdr = *
reinterpret_cast<detail::RawFragmentHeader*
>(frag->headerAddress());
137 auto data = frag->headerAddress();
138 auto start = std::chrono::steady_clock::now();
140 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
142 sts = AddFragment(hdr, data);
143 if (!sts) usleep(1000);
147 outfrag = std::move(frag);
149 TLOG(TLVL_TRACE) <<
"AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
// Body of WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
// (signature line absent from this listing). Writes the Fragment header into the event's buffer,
// reserves room for the payload and yields a position for the caller to write the payload to.
// Returns nullptr when no buffer is available and dropIfNoBuffersAvailable is false.
// When the Fragment is dropped, a scratch Fragment (dropped_data_) sized for the payload is
// allocated and its dataBegin() is returned instead.
155 TLOG(14) <<
"WriteFragmentHeader BEGIN";
156 auto buffer = getBufferForSequenceID_(frag.sequence_id,
true, frag.timestamp);
160 if (buffer == -1 && !dropIfNoBuffersAvailable)
return nullptr;
// NOTE(review): the conditions selecting between the two drop messages below are not visible here.
163 TLOG(TLVL_ERROR) <<
"Dropping fragment with sequence id " << std::to_string(frag.sequence_id) <<
" and fragment id " << std::to_string(frag.fragment_id) <<
" because data taking has already passed this event.";
167 TLOG(TLVL_ERROR) <<
"Dropping fragment with sequence id " << std::to_string(frag.sequence_id) <<
" and fragment id " << std::to_string(frag.fragment_id) <<
" because there is no room in the queue and reliable mode is off.";
// Scratch Fragment holding word_count minus header words, used as a sink for the dropped payload.
169 dropped_data_.reset(
new Fragment(frag.word_count - frag.num_words()));
170 return dropped_data_->dataBegin();
175 metricMan->sendMetric(
"Input Fragment Rate", 1,
"Fragments/s", 1, MetricMode::Rate);
// Count this writer as pending; DoneWritingFragment decrements the counter again.
178 buffer_writes_pending_[buffer]++;
179 TraceLock lk(buffer_mutexes_[buffer], 50,
"WriteFragmentHeader");
// Write only the header words, then remember where the payload starts.
180 Write(buffer, &frag, frag.num_words() *
sizeof(RawDataType));
182 auto pos =
reinterpret_cast<RawDataType*
>(GetWritePos(buffer));
// Advance the write position past the payload so the space is reserved for the caller.
// NOTE(review): if word_count and num_words() are unsigned, this `> 0` test is equivalent to
// `!=` and would also pass when word_count is smaller than num_words() — confirm the types.
183 if (frag.word_count - frag.num_words() > 0)
185 IncrementWritePos(buffer, (frag.word_count - frag.num_words()) *
sizeof(RawDataType));
188 TLOG(14) <<
"WriteFragmentHeader END";
// Body of DoneWritingFragment(detail::RawFragmentHeader frag) (signature line absent from this
// listing). Marks one writer as finished with the event's buffer and, when no other writer is
// pending, updates the completeness flag and hands the buffer to complete_buffer_.
195 TLOG(TLVL_TRACE) <<
"DoneWritingFragment BEGIN";
// The buffer must already exist (create_new = false): -1 is treated as a logic error and -2 is
// silently ignored.
196 auto buffer = getBufferForSequenceID_(frag.sequence_id,
false, frag.timestamp);
197 if (buffer == -1) Detach(
true,
"SharedMemoryEventManager",
"getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
198 if (buffer == -2)
return;
199 TraceLock lk(buffer_mutexes_[buffer], 50,
"DoneWritingFragment");
201 auto hdr = getEventHeader_(buffer);
204 hdr->run_id = run_id_;
205 hdr->subrun_id = subrun_id_;
208 buffer_writes_pending_[buffer]--;
209 if (buffer_writes_pending_[buffer] != 0)
211 TLOG(TLVL_TRACE) <<
"Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
214 auto frag_count = GetFragmentCount(frag.sequence_id);
215 hdr->is_complete = frag_count == num_fragments_per_event_;
// When duplicate events are supported, an event previously released as incomplete is complete
// once the recorded number of missing Fragments has arrived.
216 #if ART_SUPPORTS_DUPLICATE_EVENTS
217 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
219 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
223 complete_buffer_(buffer);
224 if (requests_) requests_->SendRequest(
true);
225 TLOG(TLVL_TRACE) <<
"DoneWritingFragment END";
// Body of GetFragmentCount(seqID, type): resolves the buffer for seqID without creating one and
// delegates the counting to GetFragmentCountInBuffer.
230 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID,
false), type);
// Body of GetFragmentCountInBuffer(buffer, type) (signature, counter declaration and return are
// absent from this listing). Walks the Fragments stored in `buffer` and counts those matching
// `type`; Fragment::InvalidFragmentType matches every Fragment. A buffer of -1 yields 0.
235 if (buffer == -1)
return 0;
// Rewind and skip the event header so the read position sits on the first Fragment header.
236 ResetReadPos(buffer);
237 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
241 while (MoreDataInBuffer(buffer))
243 auto fragHdr =
reinterpret_cast<artdaq::detail::RawFragmentHeader*
>(GetReadPos(buffer));
// Step over the whole Fragment (word_count words) before deciding whether it is counted.
244 IncrementReadPos(buffer, fragHdr->word_count *
sizeof(RawDataType));
245 if (type != Fragment::InvalidFragmentType && fragHdr->type != type)
continue;
246 TLOG(TLVL_TRACE) <<
"Adding Fragment with size=" << std::to_string(fragHdr->word_count) <<
" to Fragment count";
// Body of RunArt(config_file, pid_out) (signature, the opening `do`, the fork and several braces
// are absent from this listing). Launches `art -c <config file>` via execvp, records the PID in
// art_processes_, waits for the process to exit, logs how it ended, and repeats while restart_art_
// remains true.
257 auto start_time = std::chrono::steady_clock::now();
259 TLOG(TLVL_INFO) <<
"Starting art process with config file " << config_file->getFileName();
// NOTE(review): this array is allocated with new[] and no matching delete[] appears in the
// visible lines; if the allocation sits inside the restart loop it is leaked on every
// iteration — confirm and consider a std::string/std::vector<char> owner.
261 char* filename =
new char[config_file->getFileName().length() + 1];
262 strcpy(filename, config_file->getFileName().c_str());
// Null-terminated argv for execvp: "art", "-c", <config file name>.
264 std::vector<char*> args{ (
char*)
"art", (
char*)
"-c", filename, NULL };
270 execvp(
"art", &args[0]);
277 TLOG(TLVL_INFO) <<
"PID of new art process is " << pid;
278 art_processes_.insert(pid);
// Block until the process terminates, then drop it from the tracked set.
280 auto sts = waitid(P_PID, pid, &status, WEXITED);
281 TLOG(TLVL_INFO) <<
"Removing PID " << pid <<
" from process list";
282 art_processes_.erase(pid);
285 TLOG(TLVL_WARNING) <<
"Error occurred in waitid for art process " << pid <<
": " << errno <<
" (" << strerror(errno) <<
").";
287 else if (status.si_code == CLD_EXITED && status.si_status == 0)
289 TLOG(TLVL_INFO) <<
"art process " << pid <<
" exited normally, " << (restart_art_ ?
"restarting" :
"not restarting");
// An art process that ran for less than minimum_art_lifetime_s_ is not restarted.
293 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
294 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ =
false;
296 auto exit_type =
"exited with status code";
297 switch (status.si_code)
301 exit_type =
"was killed with signal";
// Logged as a warning when a restart follows, as an error otherwise.
308 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
309 <<
"art process " << pid <<
" " << exit_type <<
" " << status.si_status
310 << (status.si_code == CLD_DUMPED ?
" (core dumped)" :
"")
311 <<
" after " << std::setprecision(2) << art_lifetime <<
" seconds, "
312 << (restart_art_ ?
"restarting" :
"not restarting");
314 }
while (restart_art_);
// Body of StartArt(): re-arms restart_art_ from always_restart_art_ and starts num_art_processes_
// art processes with the current art configuration. Does nothing further when the count is zero.
319 restart_art_ = always_restart_art_;
320 if (num_art_processes_ == 0)
return;
321 for (
size_t ii = 0; ii < num_art_processes_; ++ii)
323 StartArtProcess(current_art_pset_);
// Body of StartArtProcess(fhicl::ParameterSet pset) (signature, the `pid` declaration, the wait
// loop body and the return are absent from this listing). Starts one art process on a separate
// thread running RunArt, then waits up to 5 seconds for the attached count to increase.
// A function-local static mutex serialises concurrent callers.
329 static std::mutex start_art_mutex;
330 TraceLock lk(start_art_mutex, 15,
"StartArtLock");
331 restart_art_ = always_restart_art_;
332 auto initialCount = GetAttachedCount();
333 auto startTime = std::chrono::steady_clock::now();
// Regenerate the art configuration file only when the parameter set changed or none exists yet.
335 if (pset != current_art_pset_ || !current_art_config_file_)
337 current_art_pset_ = pset;
338 current_art_config_file_ = std::make_shared<art_config_file>(pset);
// NOTE(review): the lambda captures the local `pid` by reference. If this thread outlives the
// call (how the thread is joined or detached is not visible here) and RunArt writes its pid_out
// again on a restart, that write targets a dead stack variable — confirm.
341 boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
345 while (GetAttachedCount() - initialCount < 1 && TimeUtils::GetElapsedTime(startTime) < 5)
349 if (GetAttachedCount() - initialCount < 1 || pid <= 0)
351 TLOG(TLVL_WARNING) <<
"art process has not started after 5s. Check art configuration!"
352 <<
" (pid=" << pid <<
", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) <<
")";
357 TLOG(TLVL_INFO) << std::setw(4) << std::fixed <<
"art initialization took "
358 << TimeUtils::GetElapsedTime(startTime) <<
" seconds.";
// Body of ShutdownArtProcesses(std::set<pid_t> pids) (signature, loop bodies that send the
// signals, sleeps and braces are absent from this listing). Stops the given art processes in
// escalating stages, pruning `pids` after each stage with kill(pid, 0) probes:
//   1. drop PIDs that no longer exist;
//   2. ask the processes to stop and wait up to graceful_wait_ms;
//   3. insist and wait up to int_wait_ms more;
//   4. send SIGKILL to whatever remains until the set is empty.
// Restarting is disabled first so RunArt does not relaunch the processes being stopped.
367 restart_art_ =
false;
// kill(pid, 0) sends no signal; a negative result is taken to mean the process is gone.
371 for (
auto pid = pids.begin(); pid != pids.end();)
373 if (kill(*pid, 0) < 0)
375 pid = pids.erase(pid);
382 if (pids.size() == 0)
384 TLOG(14) <<
"All art processes already exited, nothing to do.";
389 TLOG(TLVL_TRACE) <<
"Gently informing art processes that it is time to shut down";
390 for (
auto pid : pids)
395 int graceful_wait_ms = 5000;
396 int int_wait_ms = 1000;
398 TLOG(TLVL_TRACE) <<
"Waiting up to " << graceful_wait_ms <<
" ms for all art processes to exit gracefully";
399 for (
int ii = 0; ii < graceful_wait_ms; ++ii)
403 for (
auto pid = pids.begin(); pid != pids.end();)
405 if (kill(*pid, 0) < 0)
407 pid = pids.erase(pid);
414 if (pids.size() == 0)
416 TLOG(TLVL_TRACE) <<
"All art processes exited after " << ii <<
" ms.";
421 TLOG(TLVL_TRACE) <<
"Insisting that the art processes shut down";
422 for (
auto pid : pids)
427 TLOG(TLVL_TRACE) <<
"Waiting up to " << int_wait_ms <<
" ms for all art processes to exit";
// The counter continues from graceful_wait_ms so the logged elapsed time is cumulative.
428 for (
int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
432 for (
auto pid = pids.begin(); pid != pids.end();)
434 if (kill(*pid, 0) < 0)
436 pid = pids.erase(pid);
444 if (pids.size() == 0)
446 TLOG(TLVL_TRACE) <<
"All art processes exited after " << ii <<
" ms.";
451 TLOG(TLVL_TRACE) <<
"Killing remaning art processes with extreme prejudice";
452 while (pids.size() > 0)
454 kill(*pids.begin(), SIGKILL);
457 for (
auto pid = pids.begin(); pid != pids.end();)
459 if (kill(*pid, 0) < 0)
461 pid = pids.erase(pid);
// Body of ReconfigureArt(art_pset, newRun, n_art_processes) (signature and several statements,
// including whatever follows the first `if`, are absent from this listing). Clears the broadcast
// buffers, picks the next run number, swaps in the new art configuration and optionally changes
// the number of art processes.
473 TLOG(TLVL_DEBUG) <<
"ReconfigureArt BEGIN";
474 if (restart_art_ || !always_restart_art_)
// Empty every broadcast buffer; note the bound is broadcasts_.size().
478 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
480 broadcasts_.MarkBufferEmpty(ii,
true);
// A newRun of 0 means "the run after the current one".
482 if (newRun == 0) newRun = run_id_ + 1;
484 if (art_pset != current_art_pset_ || !current_art_config_file_)
486 current_art_pset_ = art_pset;
487 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
// -1 leaves the configured number of art processes unchanged.
490 if (n_art_processes != -1)
492 TLOG(TLVL_INFO) <<
"Setting number of art processes to " << n_art_processes;
493 num_art_processes_ = n_art_processes;
496 TLOG(TLVL_DEBUG) <<
"ReconfigureArt END";
// Body of endOfData() (signature, several conditions, sleeps, braces and the return are absent
// from this listing). Shutdown sequence, in order:
//   1. drop the stored init Fragment and disable art restarts;
//   2. push the still-active buffers through complete_buffer_;
//   3. wait for art to drain outstanding buffers, restarting the timer whenever progress is seen;
//   4. broadcast an EndOfData Fragment, clearing broadcast buffers and retrying if needed;
//   5. give art processes time to exit, then shut down any that remain;
//   6. reset the attached count, empty all buffers and release the request sender.
501 init_fragment_.reset(
nullptr);
502 TLOG(TLVL_TRACE) <<
"SharedMemoryEventManager::endOfData";
503 restart_art_ =
false;
505 size_t initialStoreSize = GetIncompleteEventCount();
506 TLOG(TLVL_TRACE) <<
"endOfData: Flushing " << initialStoreSize
507 <<
" stale events from the SharedMemoryEventManager.";
// `counter` bounds the flush loop to the number of incomplete events seen at entry.
508 int counter = initialStoreSize;
509 while (active_buffers_.size() > 0 && counter > 0)
511 complete_buffer_(*active_buffers_.begin());
514 TLOG(TLVL_TRACE) <<
"endOfData: Done flushing, there are now " << GetIncompleteEventCount()
515 <<
" stale events in the SharedMemoryEventManager.";
// Outstanding = buffers ready to read + buffers not ready for writing.
518 TLOG(TLVL_TRACE) <<
"Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) <<
" outstanding buffers...";
519 auto start = std::chrono::steady_clock::now();
520 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
521 auto end_of_data_wait_us = art_event_processing_time_us_ * size();
// A wait budget of 0 means "no time limit" here; the loop also ends once no art process is left.
524 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && art_processes_.size() > 0)
526 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
527 if (temp != lastReadCount)
529 TLOG(TLVL_TRACE) <<
"Waiting for " << std::to_string(temp) <<
" outstanding buffers...";
530 lastReadCount = temp;
531 start = std::chrono::steady_clock::now();
533 if (lastReadCount > 0) usleep(art_event_processing_time_us_);
535 TLOG(TLVL_TRACE) <<
"endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount <<
", time waited: " << TimeUtils::GetElapsedTime(start) <<
" s / " << (end_of_data_wait_us / 1000000.0) <<
" s, art process count: " << art_processes_.size();
537 TLOG(TLVL_TRACE) <<
"endOfData: Broadcasting EndOfData Fragment";
// outFrag is moved into the first argument and also passed as the out-parameter; the retry
// below reuses it, so presumably broadcastFragment_ hands the Fragment back on failure.
538 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
539 bool success = broadcastFragment_(std::move(outFrag), outFrag);
542 TLOG(TLVL_TRACE) <<
"endOfData: Clearing buffers to make room for EndOfData Fragment";
// NOTE(review): this loop indexes broadcasts_ but is bounded by size() rather than
// broadcasts_.size() (which ReconfigureArt uses for the same operation). If the two segments
// have different buffer counts this either leaves broadcast buffers uncleared or passes
// out-of-range indices — confirm the intended bound.
543 for (
size_t ii = 0; ii < size(); ++ii)
545 broadcasts_.MarkBufferEmpty(ii,
true);
547 broadcastFragment_(std::move(outFrag), outFrag);
549 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
551 if (art_processes_.size() > 0)
553 TLOG(TLVL_DEBUG) <<
"Allowing " << std::to_string(art_processes_.size()) <<
" art processes the chance to end gracefully";
554 if (end_of_data_wait_us == 0)
556 TLOG(TLVL_DEBUG) <<
"Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
557 end_of_data_wait_us = 100 * 1000000;
560 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
561 for (
size_t ii = 0; ii < sleep_count; ++ii)
564 if (art_processes_.size() == 0)
break;
568 while (art_processes_.size() > 0)
570 TLOG(TLVL_DEBUG) <<
"There are " << std::to_string(art_processes_.size()) <<
" art processes remaining. Proceeding to shutdown.";
571 ShutdownArtProcesses(art_processes_);
573 TLOG(TLVL_INFO) <<
"It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) <<
" s for all art processes to close after sending EndOfData Fragment";
575 ResetAttachedCount();
577 TLOG(TLVL_TRACE) <<
"endOfData: Clearing buffers";
578 for (
size_t ii = 0; ii < size(); ++ii)
580 MarkBufferEmpty(ii,
true);
582 released_incomplete_events_.clear();
584 TLOG(TLVL_TRACE) <<
"endOfData: Shutting down RequestReceiver";
585 requests_.reset(
nullptr);
587 TLOG(TLVL_TRACE) <<
"endOfData END";
588 TLOG(TLVL_INFO) <<
"EndOfData Complete. There were " << GetLastSeenBufferID() <<
" buffers processed.";
// Body of startRun(run_id_t runID) (signature and several statements, including the assignments
// of the run/subrun IDs and part of the log statement, are absent from this listing). Resets the
// per-run state, announces queue_size_ routing tokens and publishes the "Run Number" metric.
596 init_fragment_.reset(
nullptr);
600 subrun_rollover_event_ = Fragment::InvalidSequenceID;
601 last_released_event_ = 0;
603 if (requests_) requests_->SendRoutingToken(queue_size_);
604 TLOG(TLVL_DEBUG) <<
"Starting run " << run_id_
605 <<
", max queue size = "
608 << GetLockedBufferCount();
// Run and subrun are packed into one number as run + subrun / 10000.
611 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
612 metricMan->sendMetric(
"Run Number", runSubrun,
"Run:Subrun", 1, MetricMode::LastPoint);
// Body of startSubrun() (signature and the subrun increment are absent from this listing): clears
// any scheduled subrun rollover and republishes the "Run Number" metric as run + subrun / 10000.
619 subrun_rollover_event_ = Fragment::InvalidSequenceID;
622 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
623 metricMan->sendMetric(
"Run Number", runSubrun,
"Run:Subrun", 1, MetricMode::LastPoint);
// Body of endRun() (signature and return are absent from this listing): broadcasts an EndOfRun
// Fragment whose payload is my_rank, logs the run's event count and resets the per-run counters.
629 TLOG(TLVL_INFO) <<
"Ending run " << run_id_;
// Payload size: enough Fragment::value_type words to hold my_rank, rounded up.
630 FragmentPtr endOfRunFrag(
new
631 Fragment(static_cast<size_t>
632 (ceil(
sizeof(my_rank) /
633 static_cast<double>(
sizeof(Fragment::value_type))))));
635 TLOG(TLVL_DEBUG) <<
"Broadcasting EndOfRun Fragment";
636 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
637 *endOfRunFrag->dataBegin() = my_rank;
638 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
640 TLOG(TLVL_INFO) <<
"Run " << run_id_ <<
" has ended. There were " << run_event_count_ <<
" events in this run.";
641 run_event_count_ = 0;
642 run_incomplete_event_count_ = 0;
// Body of endSubrun() (signature, the line naming the endOfSubrunFrag variable and the return are
// absent from this listing): broadcasts an EndOfSubrun Fragment carrying my_rank and tagged with
// the rollover sequence ID, logs the subrun's event count and resets the per-subrun counters.
648 TLOG(TLVL_INFO) <<
"Ending subrun " << subrun_id_;
// Payload size: enough Fragment::value_type words to hold my_rank, rounded up.
649 std::unique_ptr<artdaq::Fragment>
651 Fragment(static_cast<size_t>
652 (ceil(
sizeof(my_rank) /
653 static_cast<double>(
sizeof(Fragment::value_type))))));
655 TLOG(TLVL_DEBUG) <<
"Broadcasting EndOfSubrun Fragment";
656 endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
657 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
658 *endOfSubrunFrag->dataBegin() = my_rank;
660 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
662 TLOG(TLVL_INFO) <<
"Subrun " << subrun_id_ <<
" in run " << run_id_ <<
" has ended. There were " << subrun_event_count_ <<
" events in this subrun.";
663 subrun_event_count_ = 0;
664 subrun_incomplete_event_count_ = 0;
// Body of rolloverSubrun(sequence_id_t boundary) (signature absent from this listing): schedules
// a subrun rollover at the given sequence ID.
// A boundary of 0 or Fragment::InvalidSequenceID is ignored. A boundary that lies before the last
// released event is accepted with a warning when it is fewer than 100 events in the past, and
// rejected with an error otherwise.
672 if (boundary == 0 || boundary == Fragment::InvalidSequenceID)
return;
674 if (boundary < last_released_event_)
676 auto logLevel = TLVL_ERROR;
677 bool processAnyway =
false;
678 if (last_released_event_ - boundary < 100)
680 logLevel = TLVL_WARNING;
681 processAnyway =
true;
683 TLOG(logLevel) <<
"Subrun rollover requested for event that is in the past. (delta = " << (last_released_event_ - boundary) <<
").";
684 if (!processAnyway)
return;
686 TLOG(TLVL_INFO) <<
"Will roll over when I reach Sequence ID " << boundary;
687 subrun_rollover_event_ = boundary;
// Body of sendMetrics() (signature, guards and braces are absent from this listing): publishes the
// incomplete and pending event counts and, when incomplete_event_report_interval_ms_ is positive
// and buffers are locked, periodically logs each active event with its Fragment count.
694 metricMan->sendMetric(
"Incomplete Event Count", GetIncompleteEventCount(),
"events", 1, MetricMode::LastPoint);
695 metricMan->sendMetric(
"Pending Event Count", GetPendingEventCount(),
"events", 1, MetricMode::LastPoint);
698 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
// Rate limit: presumably returns early while the interval has not elapsed (the statement
// controlled by this `if` is not visible here).
700 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
703 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
// Report format: "Incomplete Events (<expected per event>): <seqID> (<fragments present>), ..."
704 std::ostringstream oss;
705 oss <<
"Incomplete Events (" << num_fragments_per_event_ <<
"): ";
706 for (
auto& ev : active_buffers_)
708 auto hdr = getEventHeader_(ev);
709 oss << hdr->sequence_id <<
" (" << GetFragmentCount(hdr->sequence_id) <<
"), ";
711 TLOG(TLVL_DEBUG) << oss.str();
// Write `frag` as a complete single-Fragment event into the broadcasts_ shared-memory segment.
//   frag    - Fragment to broadcast (taken by value, i.e. ownership is passed in).
//   outFrag - out-parameter; how it is filled on failure is not visible in this listing.
// Waits up to broadcast_timeout_ms_ for a free broadcast buffer and logs an error on timeout.
// The failure handling and the return statements are absent from this listing.
715 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
717 TLOG(TLVL_DEBUG) <<
"Broadcasting Fragment with seqID=" << frag->sequenceID() <<
", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) <<
", size=" << frag->sizeBytes() <<
"B.";
718 auto buffer = broadcasts_.GetBufferForWriting(
false);
719 TLOG(TLVL_DEBUG) <<
"broadcastFragment_: after getting buffer 1st buffer=" << buffer;
// Keep asking for a buffer until one is available or the broadcast timeout expires.
720 auto start_time = std::chrono::steady_clock::now();
721 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
724 buffer = broadcasts_.GetBufferForWriting(
false);
726 TLOG(TLVL_DEBUG) <<
"broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer <<
", elapsed time=" << TimeUtils::GetElapsedTime(start_time) <<
" s.";
729 TLOG(TLVL_ERROR) <<
"Broadcast of fragment type " << frag->typeString() <<
" failed due to timeout waiting for buffer!";
734 TLOG(TLVL_DEBUG) <<
"broadcastFragment_: Filling in RawEventHeader";
// The event header sits at the start of the buffer; broadcast events are always marked complete.
735 auto hdr =
reinterpret_cast<detail::RawEventHeader*
>(broadcasts_.GetBufferStart(buffer));
736 hdr->run_id = run_id_;
737 hdr->subrun_id = subrun_id_;
738 hdr->sequence_id = frag->sequenceID();
739 hdr->is_complete =
true;
740 broadcasts_.IncrementWritePos(buffer,
sizeof(detail::RawEventHeader));
742 TLOG(TLVL_DEBUG) <<
"broadcastFragment_ before Write calls";
// Copy the whole Fragment (header included) after the event header.
743 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() *
sizeof(RawDataType));
745 TLOG(TLVL_DEBUG) <<
"broadcastFragment_ Marking buffer full";
746 broadcasts_.MarkBufferFull(buffer, -1);
748 TLOG(TLVL_DEBUG) <<
"broadcastFragment_ Complete";
752 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(
int buffer)
754 return reinterpret_cast<detail::RawEventHeader*
>(GetBufferStart(buffer));
757 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID,
bool create_new, Fragment::timestamp_t timestamp)
759 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
760 TLOG(14) <<
"getBufferForSequenceID " << std::to_string(seqID) <<
" BEGIN";
761 auto buffers = GetBuffersOwnedByManager();
762 for (
auto& buf : buffers)
764 auto hdr = getEventHeader_(buf);
765 if (hdr->sequence_id == seqID)
767 TLOG(14) <<
"getBufferForSequenceID " << std::to_string(seqID) <<
" returning " << buf;
772 #if !ART_SUPPORTS_DUPLICATE_EVENTS
773 if (released_incomplete_events_.count(seqID))
775 TLOG(TLVL_ERROR) <<
"Event " << std::to_string(seqID) <<
" has already been marked \"Incomplete\" and sent to art!";
780 if (!create_new)
return -1;
782 check_pending_buffers_(lk);
783 int new_buffer = GetBufferForWriting(
false);
785 if (new_buffer == -1)
787 new_buffer = GetBufferForWriting(overwrite_mode_);
790 if (new_buffer == -1)
return -1;
791 TraceLock(buffer_mutexes_[new_buffer], 34,
"getBufferForSequenceID");
792 auto hdr = getEventHeader_(new_buffer);
793 hdr->is_complete =
false;
794 hdr->run_id = run_id_;
795 hdr->subrun_id = subrun_id_;
796 hdr->sequence_id = seqID;
797 buffer_writes_pending_[new_buffer] = 0;
798 IncrementWritePos(new_buffer,
sizeof(detail::RawEventHeader));
799 SetMFIteration(
"Sequence ID " + std::to_string(seqID));
801 active_buffers_.insert(new_buffer);
805 if (timestamp != Fragment::InvalidTimestamp)
807 requests_->AddRequest(seqID, timestamp);
809 requests_->SendRequest();
811 TLOG(14) <<
"getBufferForSequenceID " << std::to_string(seqID) <<
" returning newly initialized buffer " << new_buffer;
// Report whether the given buffer contains any data beyond its event header.
// A buffer index of -1 is reported as true. The value returned when the buffer is not in the
// Writing state is not visible in this listing.
815 bool artdaq::SharedMemoryEventManager::hasFragments_(
int buffer)
817 if (buffer == -1)
return true;
818 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
// Rewind, skip the event header, and see whether anything follows it.
822 ResetReadPos(buffer);
823 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
824 return MoreDataInBuffer(buffer);
// If the event in `buffer` is flagged complete, retire its request, return one routing token and
// move the buffer from the active set to the pending set; then run check_pending_buffers_.
// The `if (requests_)` style guard around the request calls and the braces delimiting the lock's
// scope are absent from this listing.
827 void artdaq::SharedMemoryEventManager::complete_buffer_(
int buffer)
829 auto hdr = getEventHeader_(buffer);
830 if (hdr->is_complete)
832 TLOG(TLVL_DEBUG) <<
"complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) <<
".";
836 requests_->RemoveRequest(hdr->sequence_id);
837 requests_->SendRoutingToken(1);
// The active/pending sets are modified under sequence_id_mutex_.
840 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
841 active_buffers_.erase(buffer);
842 pending_buffers_.insert(buffer);
845 check_pending_buffers_();
848 bool artdaq::SharedMemoryEventManager::bufferComparator(
int bufA,
int bufB)
850 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
// Housekeeping pass over the buffers this manager owns. Three phases:
//   1. buffers for which ResetBuffer() reports true and that are not already pending are moved to
//      the pending set and counted as incomplete events;
//   2. pending buffers are released to art in ascending sequence-ID order, stopping at the lowest
//      sequence ID still active (that bound is only computed when a buffer is free for writing);
//   3. event-rate, size and buffer-occupancy metrics are published.
//   lock - used here only to log whether the caller holds it.
// Braces, guards (e.g. around requests_ and metricMan), the declarations of `counter` and `total`
// and the statements that actually mark buffers full are absent from this listing.
853 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex>
const& lock)
855 TLOG(TLVL_TRACE) <<
"check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
857 auto buffers = GetBuffersOwnedByManager();
858 for (
auto buf : buffers)
860 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
862 auto hdr = getEventHeader_(buf);
863 if (active_buffers_.count(buf))
867 requests_->RemoveRequest(hdr->sequence_id);
868 requests_->SendRoutingToken(1);
870 active_buffers_.erase(buf);
871 pending_buffers_.insert(buf);
872 subrun_incomplete_event_count_++;
873 run_incomplete_event_count_++;
874 if (metricMan) metricMan->sendMetric(
"Incomplete Event Rate", 1,
"events/s", 3, MetricMode::Rate);
// released_incomplete_events_ tracks, per sequence ID, how many Fragments are still missing:
// initialised from the expected count on first release, reduced on later releases.
875 if (!released_incomplete_events_.count(hdr->sequence_id))
877 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
881 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
883 TLOG(TLVL_WARNING) <<
"Active event " << std::to_string(hdr->sequence_id) <<
" is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] <<
" Fragments) to art.";
// Starts at InvalidSequenceID; lowered to the smallest active sequence ID only when at least one
// buffer is ready for writing.
889 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
892 if (WriteReadyCount(
false) != 0)
894 for (
auto buf : active_buffers_)
896 auto hdr = getEventHeader_(buf);
897 TLOG(TLVL_TRACE) <<
"Buffer: " << buf <<
", SeqID: " << std::to_string(hdr->sequence_id) <<
", ACTIVE";
898 if (hdr->sequence_id < lowestSeqId)
900 lowestSeqId = hdr->sequence_id;
903 TLOG(TLVL_TRACE) <<
"Lowest SeqID held: " << std::to_string(lowestSeqId);
// Release pending buffers in sequence-ID order.
906 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
907 sorted_buffers.sort([
this](
int a,
int b) {
return bufferComparator(a, b); });
910 double eventSize = 0;
911 for (
auto buf : sorted_buffers)
913 auto hdr = getEventHeader_(buf);
// Stop at the first pending event that is newer than the oldest still-active event.
914 if (hdr->sequence_id > lowestSeqId)
break;
916 if (hdr->sequence_id >= subrun_rollover_event_)
918 TLOG(TLVL_INFO) <<
"Subrun rollover reached at event " << hdr->sequence_id <<
" (boundary=" << subrun_rollover_event_ <<
").";
922 if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
924 TLOG(TLVL_DEBUG) <<
"Releasing event " << std::to_string(hdr->sequence_id) <<
" in buffer " << buf <<
" to art.";
926 subrun_event_count_++;
929 eventSize += BufferDataSize(buf);
930 pending_buffers_.erase(buf);
// NOTE(review): when no buffer was released in this pass `counter` is presumably 0, which makes
// this a floating-point division by zero and feeds NaN into the "Event Size" metric — confirm
// and guard if so.
932 eventSize /= counter;
934 TLOG(TLVL_TRACE) <<
"check_pending_buffers_: Sending Metrics";
937 auto full = ReadReadyCount();
938 auto empty = WriteReadyCount(overwrite_mode_);
941 metricMan->sendMetric(
"Event Rate", counter,
"Events/s", 1, MetricMode::Rate);
942 metricMan->sendMetric(
"Events Released to art (run)", run_event_count_,
"Events", 1, MetricMode::LastPoint);
943 metricMan->sendMetric(
"Incomplete Events Released to art (run)", run_incomplete_event_count_,
"Events", 1, MetricMode::LastPoint);
944 metricMan->sendMetric(
"Events Released to art (subrun)", subrun_event_count_,
"Events", 2, MetricMode::LastPoint);
945 metricMan->sendMetric(
"Incomplete Events Released to art (subrun)", subrun_incomplete_event_count_,
"Events", 2, MetricMode::LastPoint);
946 metricMan->sendMetric(
"Event Size", eventSize,
"Bytes", 1, MetricMode::Average);
948 metricMan->sendMetric(
"Shared Memory Full Buffers", full,
"buffers", 2, MetricMode::LastPoint);
949 metricMan->sendMetric(
"Shared Memory Available Buffers", empty,
"buffers", 2, MetricMode::LastPoint);
950 metricMan->sendMetric(
"Shared Memory Full %", full * 100 / static_cast<double>(total),
"%", 2, MetricMode::LastPoint);
951 metricMan->sendMetric(
"Shared Memory Available %", empty * 100 / static_cast<double>(total),
"%", 2, MetricMode::LastPoint);
953 TLOG(TLVL_TRACE) <<
"check_pending_buffers_ END";
// Broadcast the stored init Fragment, if one has been received. When none is stored but
// send_init_fragments_ is set, only a warning is logged.
// The condition (if any) around the debug file dump below is not visible in this listing.
956 void artdaq::SharedMemoryEventManager::send_init_frag_()
958 if (init_fragment_ !=
nullptr)
960 TLOG(TLVL_TRACE) <<
"Sending init Fragment to art...";
// Dump the init Fragment payload to receiveInitMessage_<rank>.bin in the working directory.
963 std::string fileName =
"receiveInitMessage_" + std::to_string(my_rank) +
".bin";
964 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
965 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
// init_fragment_ is moved into the call and also passed as the out-parameter.
969 broadcastFragment_(std::move(init_fragment_), init_fragment_);
970 TLOG(TLVL_TRACE) <<
"Init Fragment sent";
972 else if (send_init_fragments_)
974 TLOG(TLVL_WARNING) <<
"Cannot send init fragment because I haven't yet received one!";
// Body of SetInitFragment(FragmentPtr frag) (signature and any following statements are absent
// from this listing): stores `frag` as the init Fragment only when none is stored yet.
// NOTE(review): both halves of this condition test the same thing (a null init_fragment_), so one
// of them is redundant.
980 if (!init_fragment_ || init_fragment_ ==
nullptr)
982 init_fragment_.swap(frag);
987 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
void RunArt(std::shared_ptr< art_config_file > config_file, pid_t &pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
The SharedMemoryEventManager is a SharedMemoryManager which tracks events as they are built...
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
The RequestSender contains methods used to send data requests and Routing tokens. ...
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void StartArt()
Start all the art processes.
void ShutdownArtProcesses(std::set< pid_t > pids)
Shutdown a set of art processes.
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
void rolloverSubrun(sequence_id_t boundary)
Rollover the subrun after the specified event.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
void startSubrun()
Start a new Subrun, incrementing the subrun number.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endSubrun()
Send an EndOfSubRunFragment to the art thread.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
bool endOfData()
Indicate that the end of input has been reached to the art processes.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.