2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
8 #include "artdaq-core/Core/StatisticsCollection.hh"
9 #include "artdaq-core/Utilities/TraceLock.hh"
// Name under which TLOG messages from this file are reported: app_name followed by
// "_SharedMemoryEventManager".
11 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
// File-local trace levels. TLVL_BUFLCK is used for the "obtaining"/"obtained buffer_mutexes lock"
// messages in this file.
13 #define TLVL_BUFFER 40
14 #define TLVL_BUFLCK 41
// Builds a default shared-memory key from a seed: seed + ((partition number + 1) << 16) + the low
// 16 bits of this process's PID. Used as the fallback for "shared_memory_key" (seed 0xEE000000)
// and "broadcast_shared_memory_key" (seed 0xBB000000).
16 #define build_key(seed) ((seed) + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF))
// Definitions of the class's static mutexes. sequence_id_mutex_ is taken in
// getBufferForSequenceID_ and around back-pressure reporting; subrun_event_map_mutex_ is taken
// around accesses to subrun_event_map_.
18 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
19 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
26 : SharedMemoryManager(pset.get<uint32_t>(
"shared_memory_key", build_key(0xEE000000)),
27 pset.get<size_t>(
"buffer_count"),
28 pset.has_key(
"max_event_size_bytes") ? pset.get<size_t>(
"max_event_size_bytes") : pset.get<size_t>(
"expected_fragments_per_event") * pset.get<size_t>(
"max_fragment_size_bytes"),
29 pset.get<size_t>(
"stale_buffer_timeout_usec", pset.get<size_t>(
"event_queue_wait_time", 5) * 1000000),
30 !pset.get<bool>(
"broadcast_mode", false))
31 , num_art_processes_(pset.get<size_t>(
"art_analyzer_count", 1))
32 , num_fragments_per_event_(pset.get<size_t>(
"expected_fragments_per_event"))
33 , queue_size_(pset.get<size_t>(
"buffer_count"))
35 , max_subrun_event_map_length_(pset.get<size_t>(
"max_subrun_lookup_table_size", 100))
36 , max_event_list_length_(pset.get<size_t>(
"max_event_list_length", 100))
37 , update_run_ids_(pset.get<bool>(
"update_run_ids_on_new_fragment", true))
38 , use_sequence_id_for_event_number_(pset.get<bool>(
"use_sequence_id_for_event_number", true))
39 , overwrite_mode_(!pset.get<bool>(
"use_art", true) || pset.get<bool>(
"overwrite_mode", false) || pset.get<bool>(
"broadcast_mode", false))
40 , init_fragment_count_(pset.get<size_t>(
"init_fragment_count", pset.get<bool>(
"send_init_fragments", true) ? 1 : 0))
42 , buffer_writes_pending_()
43 , open_event_report_interval_ms_(pset.get<int>(
"open_event_report_interval_ms", pset.get<int>(
"incomplete_event_report_interval_ms", -1)))
44 , last_open_event_report_time_(std::chrono::steady_clock::now())
45 , last_backpressure_report_time_(std::chrono::steady_clock::now())
46 , last_fragment_header_write_time_(std::chrono::steady_clock::now())
47 , event_timing_(pset.get<size_t>(
"buffer_count"))
48 , broadcast_timeout_ms_(pset.get<int>(
"fragment_broadcast_timeout_ms", 3000))
50 , run_incomplete_event_count_(0)
51 , subrun_event_count_(0)
52 , subrun_incomplete_event_count_(0)
53 , oversize_fragment_count_(0)
54 , maximum_oversize_fragment_count_(pset.get<int>(
"maximum_oversize_fragment_count", 1))
56 , always_restart_art_(pset.get<bool>(
"restart_crashed_art_processes", true))
57 , manual_art_(pset.get<bool>(
"manual_art", false))
58 , current_art_pset_(art_pset)
59 , art_cmdline_(pset.get<std::string>(
"art_command_line",
"art -c #CONFIG_FILE#"))
60 , art_process_index_offset_(pset.get<size_t>(
"art_index_offset", 0))
61 , minimum_art_lifetime_s_(pset.get<double>(
"minimum_art_lifetime_s", 2.0))
62 , art_event_processing_time_us_(pset.get<size_t>(
"expected_art_event_processing_time_us", 1000000))
// Second shared-memory segment, used for broadcast Fragments. Arguments, in order: the key
// (default build_key(0xBB000000)), "broadcast_buffer_count" (default 10),
// "broadcast_buffer_size" (default 0x100000), a time value computed as
// expected_art_event_processing_time_us * buffer_count, and false.
// NOTE(review): by position the fourth argument looks like the stale-buffer timeout in
// microseconds (the base-class initializer passes "stale_buffer_timeout_usec" there) - confirm.
66 , broadcasts_(pset.get<uint32_t>(
"broadcast_shared_memory_key", build_key(0xBB000000)),
67 pset.get<size_t>(
"broadcast_buffer_count", 10),
68 pset.get<size_t>(
"broadcast_buffer_size", 0x100000),
// NOTE(review): the default for "expected_art_event_processing_time_us" is 100000 here, but
// art_event_processing_time_us_ reads the same key with a default of 1000000 - confirm which
// default is intended.
69 pset.get<int>(
"expected_art_event_processing_time_us", 100000) * pset.get<size_t>(
"buffer_count"), false)
71 subrun_event_map_[0] = 1;
72 SetMinWriteSize(
sizeof(detail::RawEventHeader) +
sizeof(detail::RawFragmentHeader));
73 broadcasts_.SetMinWriteSize(
sizeof(detail::RawEventHeader) +
sizeof(detail::RawFragmentHeader));
75 if (!pset.get<
bool>(
"use_art",
true))
77 TLOG(TLVL_INFO) <<
"BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
78 num_art_processes_ = 0;
82 TLOG(TLVL_INFO) <<
"BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
83 TLOG(TLVL_DEBUG + 33) <<
"art_pset is " << art_pset.to_string();
87 current_art_config_file_ = std::make_shared<art_config_file>(art_pset, GetKey(),
GetBroadcastKey());
89 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
91 if (overwrite_mode_ && num_art_processes_ > 0)
93 TLOG(TLVL_WARNING) <<
"Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
95 else if (overwrite_mode_)
97 TLOG(TLVL_INFO) <<
"Overwrite Mode enabled, no configured art processes at startup";
100 for (
size_t ii = 0; ii < size(); ++ii)
102 buffer_writes_pending_[ii] = 0;
104 std::lock_guard<std::mutex> lk(buffer_mutexes_[ii]);
109 throw cet::exception(app_name +
"_SharedMemoryEventManager") <<
"Unable to attach to Shared Memory!";
112 TLOG(TLVL_DEBUG + 33) <<
"Setting Writer rank to " << my_rank;
114 TLOG(TLVL_DEBUG + 32) <<
"Writer Rank is " << GetRank();
122 TLOG(TLVL_DEBUG + 33) <<
"END CONSTRUCTOR";
127 TLOG(TLVL_DEBUG + 33) <<
"DESTRUCTOR";
139 TLOG(TLVL_DEBUG + 33) <<
"Destructor END";
142 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag,
void* dataPtr)
144 if (!running_)
return true;
146 TLOG(TLVL_DEBUG + 33) <<
"AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
147 <<
", sequence_id=" << frag.sequence_id;
148 auto buffer = getBufferForSequenceID_(frag.sequence_id,
true, frag.timestamp);
149 TLOG(TLVL_DEBUG + 33) <<
"Using buffer " << buffer <<
" for seqid=" << frag.sequence_id;
156 TLOG(TLVL_ERROR) <<
"Dropping event because data taking has already passed this event number: " << frag.sequence_id;
160 auto hdr = getEventHeader_(buffer);
163 hdr->run_id = run_id_;
165 hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
167 TLOG(TLVL_DEBUG + 33) <<
"AddFragment before Write calls";
168 Write(buffer, dataPtr, frag.word_count *
sizeof(RawDataType));
170 TLOG(TLVL_DEBUG + 33) <<
"Checking for complete event";
171 auto fragmentCount = GetFragmentCount(frag.sequence_id);
172 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
173 TLOG(TLVL_DEBUG + 33) <<
"hdr->is_complete=" << std::boolalpha << hdr->is_complete
174 <<
", fragmentCount=" << fragmentCount
175 <<
", num_fragments_per_event=" << num_fragments_per_event_
176 <<
", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
178 complete_buffer_(buffer);
181 requests_->SendRequest(
true);
184 TLOG(TLVL_DEBUG + 33) <<
"AddFragment END";
185 statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count *
sizeof(RawDataType));
189 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag,
size_t timeout_usec, FragmentPtr& outfrag)
191 TLOG(TLVL_DEBUG + 33) <<
"AddFragment(FragmentPtr) BEGIN";
192 auto hdr = *
reinterpret_cast<detail::RawFragmentHeader*
>(frag->headerAddress());
193 auto data = frag->headerAddress();
194 auto start = std::chrono::steady_clock::now();
196 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
198 sts = AddFragment(hdr, data);
206 outfrag = std::move(frag);
208 TLOG(TLVL_DEBUG + 33) <<
"AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
214 if (!running_)
return nullptr;
215 TLOG(TLVL_DEBUG + 34) <<
"WriteFragmentHeader BEGIN";
216 auto buffer = getBufferForSequenceID_(frag.sequence_id,
true, frag.timestamp);
220 if (buffer == -1 && !dropIfNoBuffersAvailable)
222 std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
223 if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
225 TLOG(TLVL_WARNING) << app_name <<
": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) <<
" s!";
226 last_backpressure_report_time_ = std::chrono::steady_clock::now();
230 metricMan->sendMetric(
"Back-pressure wait time", TimeUtils::GetElapsedTime(last_fragment_header_write_time_),
"s", 1, MetricMode::LastPoint);
236 TLOG(TLVL_ERROR) <<
"Dropping fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" because data taking has already passed this event.";
240 TLOG(TLVL_INFO) <<
"Dropping fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" because there is no room in the queue and reliable mode is off.";
242 dropped_data_.emplace_back(frag, std::make_unique<Fragment>(frag.word_count - frag.num_words()));
243 auto it = dropped_data_.rbegin();
245 TLOG(TLVL_DEBUG + 35) <<
"Dropping fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" into "
246 <<
static_cast<void*
>(it->second->dataBegin()) <<
" sz=" << it->second->dataSizeBytes();
248 return it->second->dataBegin();
251 last_backpressure_report_time_ = std::chrono::steady_clock::now();
252 last_fragment_header_write_time_ = std::chrono::steady_clock::now();
254 buffer_writes_pending_[buffer]++;
258 metricMan->sendMetric(
"Input Fragment Rate", 1,
"Fragments/s", 1, MetricMode::Rate);
261 TLOG(TLVL_BUFLCK) <<
"WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
263 std::unique_lock<std::mutex> lk(buffer_mutexes_.at(buffer));
265 TLOG(TLVL_BUFLCK) <<
"WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
267 auto hdrpos =
reinterpret_cast<RawDataType*
>(GetWritePos(buffer));
268 Write(buffer, &frag, frag.num_words() *
sizeof(RawDataType));
270 auto pos =
reinterpret_cast<RawDataType*
>(GetWritePos(buffer));
271 if (frag.word_count - frag.num_words() > 0)
273 auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) *
sizeof(RawDataType));
277 reinterpret_cast<detail::RawFragmentHeader*
>(hdrpos)->word_count = frag.num_words();
278 reinterpret_cast<detail::RawFragmentHeader*
>(hdrpos)->type = Fragment::InvalidFragmentType;
279 TLOG(TLVL_ERROR) <<
"Dropping over-size fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" because there is no room in the current buffer for this Fragment! (Keeping header)";
280 dropped_data_.emplace_back(frag, std::make_unique<Fragment>(frag.word_count - frag.num_words()));
281 auto it = dropped_data_.rbegin();
283 oversize_fragment_count_++;
285 if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
287 throw cet::exception(
"Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
290 TLOG(TLVL_DEBUG + 35) <<
"Dropping over-size fragment with sequence id " << frag.sequence_id <<
" and fragment id " << frag.fragment_id
291 <<
" into " <<
static_cast<void*
>(it->second->dataBegin());
292 return it->second->dataBegin();
295 TLOG(TLVL_DEBUG + 34) <<
"WriteFragmentHeader END";
301 TLOG(TLVL_DEBUG + 33) <<
"DoneWritingFragment BEGIN";
303 auto buffer = getBufferForSequenceID_(frag.sequence_id,
false, frag.timestamp);
306 for (
auto it = dropped_data_.begin(); it != dropped_data_.end(); ++it)
308 if (it->first == frag)
310 dropped_data_.erase(it);
316 Detach(
true,
"SharedMemoryEventManager",
317 "getBufferForSequenceID_ returned -1 in DoneWritingFragment. This indicates a possible mismatch between expected Fragment count and the actual number of Fragments received.");
324 UpdateFragmentHeader(buffer, frag);
327 statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count *
sizeof(RawDataType));
329 TLOG(TLVL_BUFLCK) <<
"DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
331 std::unique_lock<std::mutex> lk(buffer_mutexes_.at(buffer));
333 TLOG(TLVL_BUFLCK) <<
"DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
335 TLOG(TLVL_DEBUG + 32) <<
"DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
" (type " <<
static_cast<int>(frag.type) <<
")";
336 auto hdr = getEventHeader_(buffer);
339 hdr->run_id = run_id_;
341 hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
343 TLOG(TLVL_DEBUG + 33) <<
"DoneWritingFragment: Updating buffer touch time";
346 if (buffer_writes_pending_[buffer] > 1)
348 TLOG(TLVL_DEBUG + 33) <<
"Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
349 buffer_writes_pending_[buffer]--;
352 TLOG(TLVL_DEBUG + 33) <<
"Done writing fragment, and no other writer. Doing bookkeeping steps.";
353 auto frag_count = GetFragmentCount(frag.sequence_id);
354 hdr->is_complete = frag_count >= num_fragments_per_event_;
356 if (frag_count > num_fragments_per_event_)
358 TLOG(TLVL_WARNING) <<
"DoneWritingFragment: This Event has more Fragments ( " << frag_count <<
" ) than specified in configuration ( " << num_fragments_per_event_ <<
" )!"
359 <<
" This is probably due to a misconfiguration and is *not* a reliable mode!";
362 TLOG(TLVL_DEBUG + 33) <<
"DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id <<
" and fragment id " << frag.fragment_id <<
", count/expected = " << frag_count <<
"/" << num_fragments_per_event_;
363 #if ART_SUPPORTS_DUPLICATE_EVENTS
364 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
366 hdr->is_complete = frag_count >= released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
370 complete_buffer_(buffer);
373 buffer_writes_pending_[buffer]--;
377 requests_->SendRequest(
true);
379 TLOG(TLVL_DEBUG + 33) <<
"DoneWritingFragment END";
384 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID,
false), type);
393 ResetReadPos(buffer);
394 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
398 while (MoreDataInBuffer(buffer))
400 auto fragHdr =
reinterpret_cast<artdaq::detail::RawFragmentHeader*
>(GetReadPos(buffer));
401 IncrementReadPos(buffer, fragHdr->word_count *
sizeof(RawDataType));
402 if (type != Fragment::InvalidFragmentType && fragHdr->type != type)
406 TLOG(TLVL_DEBUG + 33) <<
"Adding Fragment with size=" << fragHdr->word_count <<
" to Fragment count";
413 void artdaq::SharedMemoryEventManager::UpdateFragmentHeader(
int buffer, artdaq::detail::RawFragmentHeader hdr)
419 ResetReadPos(buffer);
420 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
422 while (MoreDataInBuffer(buffer))
424 auto fragHdr =
reinterpret_cast<artdaq::detail::RawFragmentHeader*
>(GetReadPos(buffer));
425 if (hdr.fragment_id == fragHdr->fragment_id)
439 auto start_time = std::chrono::steady_clock::now();
441 TLOG(TLVL_INFO) <<
"Starting art process with config file " << config_file->getFileName();
454 std::string envVarKey =
"ARTDAQ_PARTITION_NUMBER";
455 std::string envVarValue = std::to_string(GetPartitionNumber());
456 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
458 TLOG(TLVL_ERROR) <<
"Error setting environment variable \"" << envVarKey
459 <<
"\" in the environment of a child art process. "
460 <<
"This may result in incorrect TCP port number "
461 <<
"assignments or other issues, and data may "
462 <<
"not flow through the system correctly.";
464 envVarKey =
"ARTDAQ_APPLICATION_NAME";
465 envVarValue = app_name;
466 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
468 TLOG(TLVL_DEBUG + 32) <<
"Error setting environment variable \"" << envVarKey
469 <<
"\" in the environment of a child art process. ";
471 envVarKey =
"ARTDAQ_RANK";
472 envVarValue = std::to_string(my_rank);
473 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
475 TLOG(TLVL_DEBUG + 32) <<
"Error setting environment variable \"" << envVarKey
476 <<
"\" in the environment of a child art process. ";
479 TLOG(TLVL_DEBUG + 33) <<
"Parsing art command line";
480 auto args = parse_art_command_line_(config_file, process_index);
482 TLOG(TLVL_DEBUG + 33) <<
"Calling execvp with application name " << args[0];
483 execvp(args[0], &args[0]);
485 TLOG(TLVL_DEBUG + 33) <<
"Application exited, cleaning up";
486 for (
auto& arg : args)
497 std::cout <<
"Please run the following command in a separate terminal:" << std::endl
498 <<
"art -c " << config_file->getFileName() << std::endl
499 <<
"Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() <<
"\" and note the PID of the art process." << std::endl
500 <<
"Finally, return to this window and enter the pid: " << std::endl;
505 TLOG(TLVL_INFO) <<
"PID of new art process is " << pid;
507 std::unique_lock<std::mutex> lk(art_process_mutex_);
508 art_processes_.insert(pid);
514 sts = waitid(P_PID, pid, &status, WEXITED);
518 while (kill(pid, 0) >= 0) usleep(10000);
520 TLOG(TLVL_INFO) <<
"Faking good exit status, please see art process for actual exit status!";
521 status.si_code = CLD_EXITED;
522 status.si_status = 0;
524 TLOG(TLVL_INFO) <<
"Removing PID " << pid <<
" from process list";
526 std::unique_lock<std::mutex> lk(art_process_mutex_);
527 art_processes_.erase(pid);
531 TLOG(TLVL_WARNING) <<
"Error occurred in waitid for art process " << pid <<
": " << errno <<
" (" << strerror(errno) <<
").";
533 else if (status.si_code == CLD_EXITED && status.si_status == 0)
535 TLOG(TLVL_INFO) <<
"art process " << pid <<
" exited normally, " << (restart_art_ ?
"restarting" :
"not restarting");
539 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
540 if (art_lifetime < minimum_art_lifetime_s_)
542 restart_art_ =
false;
545 auto exit_type =
"exited with status code";
546 switch (status.si_code)
550 exit_type =
"was killed with signal";
557 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
558 <<
"art process " << pid <<
" " << exit_type <<
" " << status.si_status
559 << (status.si_code == CLD_DUMPED ?
" (core dumped)" :
"")
560 <<
" after running for " << std::setprecision(2) << std::fixed << art_lifetime <<
" seconds, "
561 << (restart_art_ ?
"restarting" :
"not restarting");
563 }
while (restart_art_);
568 restart_art_ = always_restart_art_;
569 if (num_art_processes_ == 0)
573 for (
size_t ii = 0; ii < num_art_processes_; ++ii)
575 StartArtProcess(current_art_pset_, ii);
581 static std::mutex start_art_mutex;
582 std::unique_lock<std::mutex> lk(start_art_mutex);
584 restart_art_ = always_restart_art_;
585 auto initialCount = GetAttachedCount();
586 auto startTime = std::chrono::steady_clock::now();
588 if (pset != current_art_pset_ || !current_art_config_file_)
590 current_art_pset_ = pset;
592 current_art_config_file_ = std::make_shared<art_config_file>(pset, GetKey(), GetBroadcastKey());
594 current_art_config_file_ = std::make_shared<art_config_file>(pset);
// PID holder shared with the RunArt thread started just below. It starts at -1 and is polled
// until it becomes positive. make_shared avoids the raw `new` and the separate control-block
// allocation.
auto pid = std::make_shared<std::atomic<pid_t>>(-1);
597 boost::thread thread([=] { RunArt(current_art_config_file_, process_index, pid); });
600 auto currentCount = GetAttachedCount() - initialCount;
601 while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
604 currentCount = GetAttachedCount() - initialCount;
606 if ((currentCount < 1 || *pid <= 0) && manual_art_)
608 TLOG(TLVL_WARNING) <<
"Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount <<
", PID:" << pid;
611 if (currentCount < 1 || *pid <= 0)
613 TLOG(TLVL_WARNING) <<
"art process has not started after 5s. Check art configuration!"
614 <<
" (pid=" << *pid <<
", attachedCount=" << currentCount <<
")";
618 TLOG(TLVL_INFO) << std::setw(4) << std::fixed <<
"art initialization took "
619 << TimeUtils::GetElapsedTime(startTime) <<
" seconds.";
626 restart_art_ =
false;
630 auto check_pids = [&](
bool print) {
631 std::unique_lock<std::mutex> lk(art_process_mutex_);
632 for (
auto pid = pids.begin(); pid != pids.end();)
638 TLOG(TLVL_WARNING) <<
"Removing an invalid PID (" << *pid
639 <<
") from the shutdown list.";
640 pid = pids.erase(pid);
642 else if (kill(*pid, 0) < 0)
644 pid = pids.erase(pid);
650 std::cout << *pid <<
" ";
656 auto count_pids = [&]() {
657 std::unique_lock<std::mutex> lk(art_process_mutex_);
661 if (count_pids() == 0)
663 TLOG(TLVL_DEBUG + 34) <<
"All art processes already exited, nothing to do.";
// Wait budgets in milliseconds for the three shutdown stages, each derived from
// art_event_processing_time_us_ * size() (the /1000 converts microseconds to milliseconds):
//   graceful_wait_ms - 10x, waiting for art processes to exit on their own;
//   gentle_wait_ms   -  2x, waiting after SIGQUIT has been sent;
//   int_wait_ms      -  1x, waiting after SIGINT has been sent.
// The products are size_t values narrowed to int on assignment.
670 int graceful_wait_ms = art_event_processing_time_us_ * size() * 10 / 1000;
671 int gentle_wait_ms = art_event_processing_time_us_ * size() * 2 / 1000;
672 int int_wait_ms = art_event_processing_time_us_ * size() / 1000;
673 auto shutdown_start = std::chrono::steady_clock::now();
677 TLOG(TLVL_DEBUG + 33) <<
"Waiting up to " << graceful_wait_ms <<
" ms for all art processes to exit gracefully";
678 for (
int ii = 0; ii < graceful_wait_ms; ++ii)
683 if (count_pids() == 0)
685 TLOG(TLVL_INFO) <<
"All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) <<
" ms.";
692 TLOG(TLVL_DEBUG + 33) <<
"Gently informing art processes that it is time to shut down";
693 std::unique_lock<std::mutex> lk(art_process_mutex_);
694 for (
auto pid : pids)
696 TLOG(TLVL_DEBUG + 33) <<
"Sending SIGQUIT to pid " << pid;
701 TLOG(TLVL_DEBUG + 33) <<
"Waiting up to " << gentle_wait_ms <<
" ms for all art processes to exit from SIGQUIT";
702 for (
int ii = 0; ii < gentle_wait_ms; ++ii)
707 if (count_pids() == 0)
709 TLOG(TLVL_INFO) <<
"All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) <<
" ms (SIGQUIT).";
715 TLOG(TLVL_DEBUG + 33) <<
"Insisting that the art processes shut down";
716 std::unique_lock<std::mutex> lk(art_process_mutex_);
717 for (
auto pid : pids)
723 TLOG(TLVL_DEBUG + 33) <<
"Waiting up to " << int_wait_ms <<
" ms for all art processes to exit from SIGINT";
724 for (
int ii = 0; ii < int_wait_ms; ++ii)
730 if (count_pids() == 0)
732 TLOG(TLVL_INFO) <<
"All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) <<
" ms (SIGINT).";
737 TLOG(TLVL_DEBUG + 33) <<
"Killing remaning art processes with extreme prejudice";
738 while (count_pids() > 0)
741 std::unique_lock<std::mutex> lk(art_process_mutex_);
742 kill(*pids.begin(), SIGKILL);
747 TLOG(TLVL_INFO) <<
"All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) <<
" ms (SIGKILL).";
751 std::cout <<
"Please shut down all art processes, then hit return/enter" << std::endl;
752 while (count_pids() > 0)
754 std::cout <<
"The following PIDs are running: ";
756 std::cout << std::endl;
764 TLOG(TLVL_DEBUG + 32) <<
"ReconfigureArt BEGIN";
765 if (restart_art_ || !always_restart_art_)
769 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
771 broadcasts_.MarkBufferEmpty(ii,
true);
775 newRun = run_id_ + 1;
778 if (art_pset != current_art_pset_ || !current_art_config_file_)
780 current_art_pset_ = art_pset;
782 current_art_config_file_ = std::make_shared<art_config_file>(art_pset, GetKey(), GetBroadcastKey());
784 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
787 if (n_art_processes != -1)
789 TLOG(TLVL_INFO) <<
"Setting number of art processes to " << n_art_processes;
790 num_art_processes_ = n_art_processes;
793 TLOG(TLVL_DEBUG + 32) <<
"ReconfigureArt END";
799 init_fragments_.clear();
800 received_init_frags_.clear();
801 TLOG(TLVL_DEBUG + 32) <<
"SharedMemoryEventManager::endOfData";
802 restart_art_ =
false;
804 auto start = std::chrono::steady_clock::now();
805 auto pendingWriteCount = std::accumulate(buffer_writes_pending_.begin(), buffer_writes_pending_.end(), 0, [](
int a,
auto& b) {
return a + b.second.load(); });
806 TLOG(TLVL_DEBUG + 32) <<
"endOfData: Waiting for " << pendingWriteCount <<
" pending writes to complete";
807 while (pendingWriteCount > 0 && TimeUtils::GetElapsedTimeMicroseconds(start) < 1000000)
810 pendingWriteCount = std::accumulate(buffer_writes_pending_.begin(), buffer_writes_pending_.end(), 0, [](
int a,
auto& b) {
return a + b.second.load(); });
813 size_t initialStoreSize = GetOpenEventCount();
814 TLOG(TLVL_DEBUG + 32) <<
"endOfData: Flushing " << initialStoreSize
815 <<
" stale events from the SharedMemoryEventManager.";
816 int counter = initialStoreSize;
817 while (!active_buffers_.empty() && counter > 0)
819 complete_buffer_(*active_buffers_.begin());
822 TLOG(TLVL_DEBUG + 32) <<
"endOfData: Done flushing, there are now " << GetOpenEventCount()
823 <<
" stale events in the SharedMemoryEventManager.";
825 TLOG(TLVL_DEBUG + 32) <<
"Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) <<
" outstanding buffers...";
826 start = std::chrono::steady_clock::now();
827 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
828 auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1);
830 auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
833 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
835 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
836 if (temp != lastReadCount)
838 TLOG(TLVL_DEBUG + 33) <<
"Waiting for " << temp <<
" outstanding buffers...";
839 lastReadCount = temp;
840 start = std::chrono::steady_clock::now();
842 if (lastReadCount > 0)
844 TLOG(TLVL_DEBUG + 38) <<
"About to sleep " << outstanding_buffer_wait_time <<
" us - lastReadCount=" << lastReadCount <<
" size=" << size() <<
" end_of_data_wait_us=" << end_of_data_wait_us;
845 usleep(outstanding_buffer_wait_time);
849 TLOG(TLVL_DEBUG + 32) <<
"endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount <<
", time waited: "
850 << TimeUtils::GetElapsedTime(start) <<
" s / " << (end_of_data_wait_us / 1000000.0) <<
" s, art process count: " << get_art_process_count_();
852 TLOG(TLVL_DEBUG + 32) <<
"endOfData: Broadcasting EndOfData Fragment";
853 FragmentPtrs broadcast;
854 broadcast.emplace_back(Fragment::eodFrag(GetBufferCount()));
855 bool success = broadcastFragments_(broadcast);
858 TLOG(TLVL_DEBUG + 32) <<
"endOfData: Clearing buffers to make room for EndOfData Fragment";
859 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
861 broadcasts_.MarkBufferEmpty(ii,
true);
863 broadcastFragments_(broadcast);
865 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
866 while (get_art_process_count_() > 0)
868 TLOG(TLVL_DEBUG + 32) <<
"There are " << get_art_process_count_() <<
" art processes remaining. Proceeding to shutdown.";
870 ShutdownArtProcesses(art_processes_);
872 TLOG(TLVL_DEBUG + 32) <<
"It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) <<
" s for all art processes to close after sending EndOfData Fragment";
874 ResetAttachedCount();
876 TLOG(TLVL_DEBUG + 32) <<
"endOfData: Clearing buffers";
877 for (
size_t ii = 0; ii < size(); ++ii)
879 MarkBufferEmpty(ii,
true);
887 released_events_.clear();
888 released_incomplete_events_.clear();
890 TLOG(TLVL_DEBUG + 32) <<
"endOfData END";
891 TLOG(TLVL_INFO) <<
"EndOfData Complete. There were " << GetLastSeenBufferID() <<
" buffers processed.";
898 init_fragments_.clear();
899 received_init_frags_.clear();
900 statsHelper_.resetStatistics();
901 TLOG(TLVL_DEBUG + 33) <<
"startRun: Clearing broadcast buffers";
902 for (
size_t ii = 0; ii < broadcasts_.size(); ++ii)
904 broadcasts_.MarkBufferEmpty(ii,
true);
906 released_events_.clear();
907 released_incomplete_events_.clear();
911 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
912 subrun_event_map_.clear();
913 subrun_event_map_[0] = 1;
915 run_event_count_ = 0;
916 run_incomplete_event_count_ = 0;
917 requests_ = std::make_unique<RequestSender>(data_pset_);
920 requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
922 if (data_pset_.has_key(
"routing_token_config"))
924 auto rmPset = data_pset_.get<fhicl::ParameterSet>(
"routing_token_config");
925 if (rmPset.get<
bool>(
"use_routing_manager",
false))
927 tokens_ = std::make_unique<TokenSender>(rmPset);
928 tokens_->SetRunNumber(static_cast<uint32_t>(run_id_));
929 tokens_->SendRoutingToken(queue_size_, run_id_);
932 TLOG(TLVL_DEBUG + 32) <<
"Starting run " << run_id_
933 <<
", max queue size = "
936 << GetLockedBufferCount();
939 metricMan->sendMetric(
"Run Number", static_cast<uint64_t>(run_id_),
"Run", 1, MetricMode::LastPoint | MetricMode::Persist);
945 TLOG(TLVL_INFO) <<
"Ending run " << run_id_;
946 FragmentPtr endOfRunFrag(
new Fragment(static_cast<size_t>(ceil(
sizeof(my_rank) /
947 static_cast<double>(
sizeof(Fragment::value_type))))));
949 TLOG(TLVL_DEBUG + 32) <<
"Shutting down RequestSender";
950 requests_.reset(
nullptr);
951 TLOG(TLVL_DEBUG + 32) <<
"Shutting down TokenSender";
952 tokens_.reset(
nullptr);
954 TLOG(TLVL_DEBUG + 32) <<
"Broadcasting EndOfRun Fragment";
955 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
956 *endOfRunFrag->dataBegin() = my_rank;
957 FragmentPtrs broadcast;
958 broadcast.emplace_back(std::move(endOfRunFrag));
959 broadcastFragments_(broadcast);
961 TLOG(TLVL_INFO) <<
"Run " << run_id_ <<
" has ended. There were " << run_event_count_ <<
" events in this run.";
962 run_event_count_ = 0;
963 run_incomplete_event_count_ = 0;
964 oversize_fragment_count_ = 0;
966 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
967 subrun_event_map_.clear();
968 subrun_event_map_[0] = 1;
976 if (boundary == 0 || boundary == Fragment::InvalidSequenceID)
981 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
984 if (!subrun_event_map_.empty() && subrun_event_map_.rbegin()->second == subrun)
988 TLOG(TLVL_INFO) <<
"Will roll over to subrun " << subrun <<
" when I reach Sequence ID " << boundary;
989 subrun_event_map_[boundary] = subrun;
990 while (subrun_event_map_.size() > max_subrun_event_map_length_)
992 subrun_event_map_.erase(subrun_event_map_.begin());
998 Fragment::sequence_id_t seqID = 0;
1001 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
1002 for (
auto& it : subrun_event_map_)
1004 if (it.first >= seqID)
1006 seqID = it.first + 1;
1008 if (it.second >= subrun)
1010 subrun = it.second + 1;
1014 rolloverSubrun(seqID, subrun);
1021 metricMan->sendMetric(
"Open Event Count", GetOpenEventCount(),
"events", 1, MetricMode::LastPoint);
1022 metricMan->sendMetric(
"Pending Event Count", GetPendingEventCount(),
"events", 1, MetricMode::LastPoint);
1025 if (open_event_report_interval_ms_ > 0 && GetLockedBufferCount() != 0u)
1027 if (TimeUtils::GetElapsedTimeMilliseconds(last_open_event_report_time_) < static_cast<size_t>(open_event_report_interval_ms_))
1032 last_open_event_report_time_ = std::chrono::steady_clock::now();
1033 std::ostringstream oss;
1034 oss <<
"Open Events (expecting " << num_fragments_per_event_ <<
" Fragments): ";
1035 for (
auto& ev : active_buffers_)
1037 auto hdr = getEventHeader_(ev);
1038 oss << hdr->sequence_id <<
" (has " << GetFragmentCount(hdr->sequence_id) <<
" Fragments), ";
1040 TLOG(TLVL_DEBUG + 32) << oss.str();
1044 bool artdaq::SharedMemoryEventManager::broadcastFragments_(FragmentPtrs& frags)
1048 TLOG(TLVL_ERROR) <<
"Requested broadcast but no Fragments given!";
1051 if (!broadcasts_.IsValid())
1053 TLOG(TLVL_ERROR) <<
"Broadcast attempted but broadcast shared memory is unavailable!";
1056 TLOG(TLVL_DEBUG + 32) <<
"Broadcasting Fragments with seqID=" << frags.front()->sequenceID()
1057 <<
", type " << detail::RawFragmentHeader::SystemTypeToString(frags.front()->type())
1058 <<
", size=" << frags.front()->sizeBytes() <<
"B.";
1059 auto buffer = broadcasts_.GetBufferForWriting(
false);
1060 TLOG(TLVL_DEBUG + 32) <<
"broadcastFragments_: after getting buffer 1st buffer=" << buffer;
1061 auto start_time = std::chrono::steady_clock::now();
1062 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
1065 buffer = broadcasts_.GetBufferForWriting(
false);
1067 TLOG(TLVL_DEBUG + 32) <<
"broadcastFragments_: after getting buffer w/timeout, buffer=" << buffer <<
", elapsed time=" << TimeUtils::GetElapsedTime(start_time) <<
" s.";
1070 TLOG(TLVL_ERROR) <<
"Broadcast of fragment type " << frags.front()->typeString() <<
" failed due to timeout waiting for buffer!";
1074 TLOG(TLVL_DEBUG + 32) <<
"broadcastFragments_: Filling in RawEventHeader";
1075 auto hdr =
reinterpret_cast<detail::RawEventHeader*
>(broadcasts_.GetBufferStart(buffer));
1076 hdr->run_id = run_id_;
1077 hdr->subrun_id = GetSubrunForSequenceID(frags.front()->sequenceID());
1078 hdr->sequence_id = frags.front()->sequenceID();
1079 hdr->is_complete =
true;
1080 broadcasts_.IncrementWritePos(buffer,
sizeof(detail::RawEventHeader));
1082 for (
auto& frag : frags)
1084 TLOG(TLVL_DEBUG + 32) <<
"broadcastFragments_ before Write calls";
1085 if (frag->sequenceID() != hdr->sequence_id || frag->type() != frags.front()->type())
1087 TLOG(TLVL_WARNING) <<
"Not sending fragment because its SequenceID or Type disagrees with leading Fragment";
1090 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() *
sizeof(RawDataType));
1093 TLOG(TLVL_DEBUG + 32) <<
"broadcastFragments_ Marking buffer full";
1094 broadcasts_.MarkBufferFull(buffer, -1);
1095 TLOG(TLVL_DEBUG + 32) <<
"broadcastFragments_ Complete";
1099 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(
int buffer)
1101 return reinterpret_cast<detail::RawEventHeader*
>(GetBufferStart(buffer));
// GetSubrunForSequenceID(seqID): walk subrun_event_map_ from its beginning and remember the
// subrun of every entry whose key (a sequence ID) is <= seqID; the last one remembered is
// the subrun reported for seqID.
// The whole lookup runs under subrun_event_map_mutex_.
1106 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
1108 TLOG(TLVL_DEBUG + 33) <<
"GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
1109 auto it = subrun_event_map_.begin();
// The end() test must come first: && short-circuits, so it->first is only read while
// `it` refers to a real element. With the operands the other way round, an empty map
// (begin() == end()) or a walk that reaches the end dereferenced the past-the-end
// iterator, which is undefined behaviour.
1112 while (it != subrun_event_map_.end() && it->first <= seqID)
1114 TLOG(TLVL_DEBUG + 33) <<
"Map has sequence ID " << it->first <<
", subrun " << it->second <<
" (looking for <= " << seqID <<
")";
// Remember the subrun of the most recent entry that does not exceed seqID.
1115 subrun = it->second;
1119 TLOG(TLVL_DEBUG + 32) <<
"GetSubrunForSequenceID returning subrun " << subrun <<
" for sequence ID " << seqID;
// Find the shared-memory buffer already holding the event with the given sequence ID, or
// obtain and initialize a new buffer for it.
//
// @param seqID      Sequence ID of the event.
// @param create_new NOTE(review): not referenced in the lines shown here; presumably gates
//                   whether a new buffer may be set up -- confirm against the full function.
// @param timestamp  Stored in the new event header; also used as the event number when
//                   use_sequence_id_for_event_number_ is false.
// @return Buffer index; per the trace messages, either the matching existing buffer or the
//         newly initialized one.
1123 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID,
bool create_new, Fragment::timestamp_t timestamp)
1125 TLOG(TLVL_DEBUG + 34) <<
"getBufferForSequenceID " << seqID <<
" BEGIN";
// Lookup and allocation are done while holding sequence_id_mutex_; the same lock object
// is handed to check_pending_buffers_ below.
1126 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1128 TLOG(TLVL_DEBUG + 34) <<
"getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
// First look for a manager-owned buffer whose header already carries this sequence ID.
1130 auto buffers = GetBuffersOwnedByManager();
1131 for (
auto& buf : buffers)
1133 auto hdr = getEventHeader_(buf);
1134 if (hdr->sequence_id == seqID)
1136 TLOG(TLVL_DEBUG + 34) <<
"getBufferForSequenceID " << seqID <<
" returning " << buf;
// Unless art supports duplicate events, a sequence ID that was already released (either
// as an incomplete event or as a completed one) is reported as an error.
1141 #if !ART_SUPPORTS_DUPLICATE_EVENTS
1142 if (released_incomplete_events_.count(seqID) != 0u)
1144 TLOG(TLVL_ERROR) <<
"Event " << seqID <<
" has already been marked \"Incomplete\" and sent to art!";
1147 if (released_events_.count(seqID) != 0u)
1149 TLOG(TLVL_ERROR) <<
"Event " << seqID <<
" has already been completed and released to art! Check configuration for inconsistent Fragment count per event!";
// Process pending buffers first, then ask for a write buffer without overwriting; if that
// yields -1, ask again with the configured overwrite_mode_.
1159 check_pending_buffers_(lk);
1160 int new_buffer = GetBufferForWriting(
false);
1162 if (new_buffer == -1)
1164 new_buffer = GetBufferForWriting(overwrite_mode_);
// NOTE(review): the handling of "still no buffer" is not visible in these lines.
1167 if (new_buffer == -1)
// Hold the per-buffer mutex while the new buffer's header is initialized.
1171 TLOG(TLVL_BUFLCK) <<
"getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
1172 std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_.at(new_buffer));
1173 TLOG(TLVL_BUFLCK) <<
"getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
// Remember when this event was opened; check_pending_buffers_ reports elapsed time from it.
1175 event_timing_[new_buffer] = std::chrono::steady_clock::now();
// Fill in the event header for the new (not yet complete) event.
1177 auto hdr = getEventHeader_(new_buffer);
1178 hdr->is_complete =
false;
1179 hdr->run_id = run_id_;
1180 hdr->subrun_id = GetSubrunForSequenceID(seqID);
// The event number is the sequence ID or the timestamp (truncated to 32 bits), by configuration.
1181 hdr->event_id = use_sequence_id_for_event_number_ ?
static_cast<uint32_t
>(seqID) : static_cast<uint32_t>(timestamp);
1182 hdr->sequence_id = seqID;
1183 hdr->timestamp = timestamp;
// No fragment writes are in flight yet; advance the write position past the header.
1184 buffer_writes_pending_[new_buffer] = 0;
1185 IncrementWritePos(new_buffer,
sizeof(detail::RawEventHeader));
1186 SetMFIteration(
"Sequence ID " + std::to_string(seqID));
// Track the buffer as active and trace the resulting occupancy.
1188 TLOG(TLVL_BUFFER) <<
"getBufferForSequenceID placing " << new_buffer <<
" to active.";
1189 active_buffers_.insert(new_buffer);
1190 TLOG(TLVL_BUFFER) <<
"Buffer occupancy now (total,full,reading,empty,pending,active)=("
1192 << ReadReadyCount() <<
","
1193 << WriteReadyCount(
true) - WriteReadyCount(
false) - ReadReadyCount() <<
","
1194 << WriteReadyCount(
false) <<
","
1195 << pending_buffers_.size() <<
","
1196 << active_buffers_.size() <<
")";
// Register this sequence ID and timestamp with requests_.
1200 requests_->AddRequest(seqID, timestamp);
1202 TLOG(TLVL_DEBUG + 34) <<
"getBufferForSequenceID " << seqID <<
" returning newly initialized buffer " << new_buffer;
// Report whether the given buffer contains any data after its RawEventHeader.
// @param buffer Index of the buffer to inspect.
// @return Result of MoreDataInBuffer(buffer) once the read position sits just past the header.
1206 bool artdaq::SharedMemoryEventManager::hasFragments_(
int buffer)
// NOTE(review): what happens when the buffer is not in the Writing state is not visible here.
1212 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
// Rewind the read position, then skip over the event header.
1216 ResetReadPos(buffer);
1217 IncrementReadPos(buffer,
sizeof(detail::RawEventHeader));
1218 return MoreDataInBuffer(buffer);
// If the event in the given buffer is marked complete, move the buffer from the active set
// to the pending set, record its sequence ID as released, drop its request, and then run
// CheckPendingBuffers().
// @param buffer Index of the buffer to examine.
1221 void artdaq::SharedMemoryEventManager::complete_buffer_(
int buffer)
1223 auto hdr = getEventHeader_(buffer);
1224 if (hdr->is_complete)
1226 TLOG(TLVL_DEBUG + 32) <<
"complete_buffer_: This fragment completes event " << hdr->sequence_id <<
".";
1229 TLOG(TLVL_BUFFER) <<
"complete_buffer_ moving " << buffer <<
" from active to pending.";
// The active/pending/released bookkeeping is updated under sequence_id_mutex_.
1231 TLOG(TLVL_BUFLCK) <<
"complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1232 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1233 TLOG(TLVL_BUFLCK) <<
"complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1234 active_buffers_.erase(buffer);
1235 pending_buffers_.insert(buffer);
1236 released_events_.insert(hdr->sequence_id);
// Keep released_events_ bounded: erase from begin() until at most max_event_list_length_ remain.
1237 while (released_events_.size() > max_event_list_length_)
1239 released_events_.erase(released_events_.begin());
1242 TLOG(TLVL_BUFFER) <<
"Buffer occupancy now (total,full,reading,empty,pending,active)=("
1244 << ReadReadyCount() <<
","
1245 << WriteReadyCount(
true) - WriteReadyCount(
false) - ReadReadyCount() <<
","
1246 << WriteReadyCount(
false) <<
","
1247 << pending_buffers_.size() <<
","
1248 << active_buffers_.size() <<
")";
// The event is complete, so its request is removed from requests_.
1252 requests_->RemoveRequest(hdr->sequence_id);
// NOTE(review): CheckPendingBuffers() locks sequence_id_mutex_ itself, so `lk` must already
// have gone out of scope here; the scoping braces are not visible in these lines -- confirm.
1255 CheckPendingBuffers();
1258 bool artdaq::SharedMemoryEventManager::bufferComparator(
int bufA,
int bufB)
1260 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
// CheckPendingBuffers(): public entry point that acquires sequence_id_mutex_ and then
// delegates to check_pending_buffers_, passing the held lock along.
1265 TLOG(TLVL_BUFLCK) <<
"CheckPendingBuffers: Obtaining sequence_id_mutex_";
1266 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1267 TLOG(TLVL_BUFLCK) <<
"CheckPendingBuffers: Obtained sequence_id_mutex_";
1268 check_pending_buffers_(lk);
// Housekeeping pass over the event buffers. In order, the visible code:
//   1. moves active buffers flagged by ResetBuffer() (reported as timed-out, incomplete
//      events) to the pending set,
//   2. releases all pending buffers in ascending sequence-ID order by marking them full,
//   3. sends routing tokens when more write-ready buffers exist than tokens outstanding,
//   4. reports statistics and metrics.
// @param lock The caller's lock; in the lines shown it is only inspected (owns_lock())
//             for the trace message.
1271 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex>
const& lock)
1273 TLOG(TLVL_DEBUG + 34) <<
"check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
// Step 1: look at every manager-owned buffer for which ResetBuffer() returns true and which
// is not already pending.
1275 auto buffers = GetBuffersOwnedByManager();
1276 for (
auto buf : buffers)
1278 if (ResetBuffer(buf) && (pending_buffers_.count(buf) == 0u))
1280 TLOG(TLVL_DEBUG + 36) <<
"check_pending_buffers_ Incomplete buffer detected, buf=" << buf <<
" active_bufers_.count(buf)=" << active_buffers_.count(buf) <<
" buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1281 auto hdr = getEventHeader_(buf);
// Only act on buffers that are active and have no fragment writes still in flight.
1282 if ((active_buffers_.count(buf) != 0u) && buffer_writes_pending_[buf].load() == 0)
1286 requests_->RemoveRequest(hdr->sequence_id);
1288 TLOG(TLVL_BUFFER) <<
"check_pending_buffers_ moving buffer " << buf <<
" from active to pending";
1289 active_buffers_.erase(buf);
1290 pending_buffers_.insert(buf);
1291 TLOG(TLVL_BUFFER) <<
"Buffer occupancy now (total,full,reading,empty,pending,active)=("
1293 << ReadReadyCount() <<
","
1294 << WriteReadyCount(
true) - WriteReadyCount(
false) - ReadReadyCount() <<
","
1295 << WriteReadyCount(
false) <<
","
1296 << pending_buffers_.size() <<
","
1297 << active_buffers_.size() <<
")";
1299 run_incomplete_event_count_++;
1302 metricMan->sendMetric(
"Incomplete Event Rate", 1,
"events/s", 3, MetricMode::Rate);
// released_incomplete_events_ maps a sequence ID to the number of fragments still missing:
// on first release it is the expected count minus what the buffer holds.
1304 if (released_incomplete_events_.count(hdr->sequence_id) == 0u)
1306 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
// NOTE(review): presumably the else branch (sequence ID seen before) -- the missing count
// is reduced by the fragments in this buffer; the `else` is not visible here.
1310 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
// NOTE(review): the last two literals concatenate as "Fragments).Scheduling" with no
// space in between.
1313 TLOG(TLVL_WARNING) <<
"Event " << hdr->sequence_id
1314 <<
" was opened " << TimeUtils::GetElapsedTime(event_timing_[buf]) <<
" s ago"
1315 <<
" and has timed out (missing " << released_incomplete_events_[hdr->sequence_id] <<
" Fragments)."
1316 <<
"Scheduling release to art.";
// Step 2: copy the pending set and sort it by header sequence ID (via bufferComparator) so
// that events are released in ascending sequence-ID order.
1321 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1322 sorted_buffers.sort([
this](
int a,
int b) {
return bufferComparator(a, b); });
// Running totals over the buffers released in this pass, used for the average metrics below.
1325 double eventSize = 0;
1326 double eventTime = 0;
1327 for (
auto buf : sorted_buffers)
1329 auto hdr = getEventHeader_(buf);
1330 auto thisEventSize = BufferDataSize(buf);
1332 TLOG(TLVL_DEBUG + 32) <<
"Releasing event " << std::to_string(hdr->sequence_id) <<
" in buffer " << buf <<
" to art, "
1333 <<
"event_size=" << thisEventSize <<
", buffer_size=" << BufferSize();
1334 statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1336 TLOG(TLVL_BUFFER) <<
"check_pending_buffers_ removing buffer " << buf <<
" moving from pending to full";
1337 MarkBufferFull(buf);
1340 eventSize += thisEventSize;
1341 eventTime += TimeUtils::GetElapsedTime(event_timing_[buf]);
1342 pending_buffers_.erase(buf);
1343 TLOG(TLVL_BUFFER) <<
"Buffer occupancy now (total,full,reading,empty,pending,active)=("
1345 << ReadReadyCount() <<
","
1346 << WriteReadyCount(
true) - WriteReadyCount(
false) - ReadReadyCount() <<
","
1347 << WriteReadyCount(
false) <<
","
1348 << pending_buffers_.size() <<
","
1349 << active_buffers_.size() <<
")";
// Step 3: routing tokens. Outstanding tokens = tokens sent minus events counted this run;
// when more buffers are write-ready than tokens are outstanding, the difference is sent
// one token at a time.
1352 if (tokens_ && tokens_->RoutingTokenSendsEnabled())
1354 TLOG(TLVL_DEBUG + 33) <<
"Sent tokens: " << tokens_->GetSentTokenCount() <<
", Event count: " << run_event_count_;
1355 auto outstanding_tokens = tokens_->GetSentTokenCount() - run_event_count_;
1356 auto available_buffers = WriteReadyCount(overwrite_mode_);
// NOTE(review): this trace computes available_buffers - outstanding_tokens before the
// ">" check below; if these are unsigned types the printed value wraps when
// outstanding_tokens is larger -- confirm the types.
1358 TLOG(TLVL_DEBUG + 33) <<
"check_pending_buffers_: outstanding_tokens: " << outstanding_tokens <<
", available_buffers: " << available_buffers
1359 <<
", tokens_to_send: " << available_buffers - outstanding_tokens;
1361 if (available_buffers > outstanding_tokens)
1363 auto tokens_to_send = available_buffers - outstanding_tokens;
1365 while (tokens_to_send > 0)
1367 TLOG(35) <<
"check_pending_buffers_: Sending a Routing Token";
1368 tokens_->SendRoutingToken(1, run_id_);
// Step 4: statistics and metrics.
1374 if (statsHelper_.readyToReport())
1376 std::string statString = buildStatisticsString_();
1377 TLOG(TLVL_INFO) << statString;
// NOTE(review): `counter` is used below but its declaration and any guard against a zero
// value (it is a divisor for the averages) are not visible in these lines.
1382 TLOG(TLVL_DEBUG + 34) <<
"check_pending_buffers_: Sending Metrics";
1383 metricMan->sendMetric(
"Event Rate", counter,
"Events", 1, MetricMode::Rate);
1384 metricMan->sendMetric(
"Data Rate", eventSize,
"Bytes", 1, MetricMode::Rate);
1387 metricMan->sendMetric(
"Average Event Size", eventSize / counter,
"Bytes", 1, MetricMode::Average);
1388 metricMan->sendMetric(
"Average Event Building Time", eventTime / counter,
"s", 1, MetricMode::Average);
1391 metricMan->sendMetric(
"Events Released to art this run", run_event_count_,
"Events", 1, MetricMode::LastPoint);
1392 metricMan->sendMetric(
"Incomplete Events Released to art this run", run_incomplete_event_count_,
"Events", 1, MetricMode::LastPoint);
1393 if (tokens_ && tokens_->RoutingTokenSendsEnabled())
1395 metricMan->sendMetric(
"Tokens sent", tokens_->GetSentTokenCount(),
"Tokens", 2, MetricMode::LastPoint);
// Tally the buffers by semaphore state from the buffer report.
1398 auto bufferReport = GetBufferReport();
1399 int full = 0, empty = 0, writing = 0, reading = 0;
1400 for (
auto& buf : bufferReport)
1404 case BufferSemaphoreFlags::Full:
1407 case BufferSemaphoreFlags::Empty:
1410 case BufferSemaphoreFlags::Writing:
1413 case BufferSemaphoreFlags::Reading:
1418 auto total = size();
1419 TLOG(TLVL_DEBUG + 36) <<
"Buffer usage: full=" << full <<
", empty=" << empty <<
", writing=" << writing <<
", reading=" << reading <<
", total=" << total;
// Buffers in the Writing state are published under the "Pending Buffers" metric name.
1421 metricMan->sendMetric(
"Shared Memory Full Buffers", full,
"buffers", 2, MetricMode::LastPoint);
1422 metricMan->sendMetric(
"Shared Memory Available Buffers", empty,
"buffers", 2, MetricMode::LastPoint);
1423 metricMan->sendMetric(
"Shared Memory Pending Buffers", writing,
"buffers", 2, MetricMode::LastPoint);
1424 metricMan->sendMetric(
"Shared Memory Reading Buffers", reading,
"buffers", 2, MetricMode::LastPoint);
// Percentages divide by `total` as a double. NOTE(review): a guard for total == 0 is not
// visible in these lines.
1427 metricMan->sendMetric(
"Shared Memory Full %", full * 100 / static_cast<double>(total),
"%", 2, MetricMode::LastPoint);
1428 metricMan->sendMetric(
"Shared Memory Available %", empty * 100 / static_cast<double>(total),
"%", 2, MetricMode::LastPoint);
1431 TLOG(TLVL_DEBUG + 34) <<
"check_pending_buffers_ END";
// Build an argv-style argument list from the art_cmdline_ template.
// The first "#CONFIG_FILE#" is replaced by the config file's name and the first
// "#PROCESS_INDEX#" by process_index + art_process_index_offset_; the result is split on
// whitespace and each token is copied into a new[]-allocated, NUL-terminated char array.
// @param config_file   Supplies the file name substituted for "#CONFIG_FILE#".
// @param process_index Index of the art process, before the offset is applied.
// @return NOTE(review): the return statement is not visible here; `output` (the token
//         pointers followed by a terminating nullptr) is presumably what is returned.
//         The char arrays are raw new[] allocations, so whoever receives them owns them.
1434 std::vector<char*> artdaq::SharedMemoryEventManager::parse_art_command_line_(
const std::shared_ptr<art_config_file>& config_file,
size_t process_index)
1436 auto offset_index = process_index + art_process_index_offset_;
1437 TLOG(TLVL_DEBUG + 37) <<
"parse_art_command_line_: Parsing command line " << art_cmdline_ <<
", config_file: " << config_file->getFileName() <<
", index: " << process_index <<
" (w/offset: " << offset_index <<
")";
// Work on a copy so the art_cmdline_ template itself is left untouched.
1438 std::string art_cmdline_tmp = art_cmdline_;
// 13 is the length of "#CONFIG_FILE#".
1439 auto filenameit = art_cmdline_tmp.find(
"#CONFIG_FILE#");
1440 if (filenameit != std::string::npos)
1442 art_cmdline_tmp.replace(filenameit, 13, config_file->getFileName());
// 15 is the length of "#PROCESS_INDEX#".
1444 auto indexit = art_cmdline_tmp.find(
"#PROCESS_INDEX#");
1445 if (indexit != std::string::npos)
1447 art_cmdline_tmp.replace(indexit, 15, std::to_string(offset_index));
1449 TLOG(TLVL_DEBUG + 37) <<
"parse_art_command_line_: After replacing index and config parameters, command line is " << art_cmdline_tmp;
// Split on whitespace using stream extraction (no quoting or escaping is interpreted).
1451 std::istringstream iss(art_cmdline_tmp);
1452 auto tokens = std::vector<std::string>{std::istream_iterator<std::string>{iss}, std::istream_iterator<std::string>{}};
1453 std::vector<char*> output;
1455 for (
auto& token : tokens)
1457 TLOG(TLVL_DEBUG + 37) <<
"parse_art_command_line_: Adding cmdline token " << token <<
" to output list";
// Allocate length + 1 bytes, copy the characters, and terminate the string explicitly.
1458 output.emplace_back(
new char[token.length() + 1]);
1459 memcpy(output.back(), token.c_str(), token.length());
1460 output.back()[token.length()] =
'\0';
// Terminate the list with a null pointer.
1462 output.emplace_back(
nullptr);
// Broadcast the collected Init Fragments once at least init_fragment_count_ of them
// (and more than zero) are available; otherwise log why they cannot be sent yet.
1467 void artdaq::SharedMemoryEventManager::send_init_frags_()
1469 if (init_fragments_.size() >= init_fragment_count_ && init_fragment_count_ > 0)
1471 TLOG(TLVL_INFO) <<
"Broadcasting " << init_fragments_.size() <<
" Init Fragment(s) to all art subprocesses...";
// NOTE(review): the next three statements dump a fragment to "receiveInitMessage_<rank>.bin"
// but reference init_fragment_ (singular), while the rest of the function uses
// init_fragments_; confirm whether they are actually compiled (a disabling preprocessor
// guard may exist that is not visible here).
1474 std::string fileName =
"receiveInitMessage_" + std::to_string(my_rank) +
".bin";
1475 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1476 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1480 broadcastFragments_(init_fragments_);
1481 TLOG(TLVL_DEBUG + 33) <<
"Init Fragment sent";
// Init Fragments are expected but none has been received yet: warn.
1483 else if (init_fragment_count_ > 0 && init_fragments_.size() == 0)
1485 TLOG(TLVL_WARNING) <<
"Cannot send Init Fragment(s) because I haven't yet received them! Set send_init_fragments to false or init_fragment_count to 0 if this process does not receive serialized art events to avoid potentially lengthy timeouts!";
// Some, but not enough, Init Fragments have been received: informational only.
1487 else if (init_fragment_count_ > 0)
1489 TLOG(TLVL_INFO) <<
"Cannot send Init Fragment(s) because I haven't yet received them (have " << init_fragments_.size() <<
" of " << init_fragment_count_ <<
")!";
// NOTE(review): presumably the remaining case (init_fragment_count_ == 0); the enclosing
// `else` is not visible here. A single default-constructed Fragment tagged as
// InitFragmentType is broadcast in place of real Init Fragments.
1494 artdaq::FragmentPtrs begin_run_fragments_;
1495 begin_run_fragments_.emplace_back(
new artdaq::Fragment());
1496 begin_run_fragments_.back()->setSystemType(artdaq::Fragment::InitFragmentType);
1497 broadcastFragments_(begin_run_fragments_);
// AddInitFragment(frag): store an Init Fragment, keeping at most one per fragment ID.
// Calls are serialized by a function-local static mutex.
1503 static std::mutex init_fragment_mutex;
1504 std::lock_guard<std::mutex> lk(init_fragment_mutex);
// First Init Fragment seen for this fragment ID.
1505 if (received_init_frags_.count(frag->fragmentID()) == 0)
// The count is logged as size() + 1 because the fragment has not been appended yet.
1507 TLOG(TLVL_DEBUG + 32) <<
"Received Init Fragment from rank " << frag->fragmentID() <<
". Now have " << init_fragments_.size() + 1 <<
" of " << init_fragment_count_;
1508 received_init_frags_.insert(frag->fragmentID());
// Ownership moves into init_fragments_; `frag` must not be dereferenced after this line.
1509 init_fragments_.push_back(std::move(frag));
// NOTE(review): the action taken once enough Init Fragments are held is not visible here.
1512 if (init_fragments_.size() >= init_fragment_count_)
// NOTE(review): presumably the else branch of the fragment-ID check above (where `frag`
// has not been moved from); the `else` itself is not visible here.
1519 TLOG(TLVL_DEBUG + 33) <<
"Ignoring duplicate Init Fragment from rank " << frag->fragmentID();
// UpdateArtConfiguration(art_pset): refresh the stored art configuration when the
// ParameterSet differs from the current one or no config file object exists yet.
1525 TLOG(TLVL_DEBUG + 32) <<
"UpdateArtConfiguration BEGIN";
1526 if (art_pset != current_art_pset_ || !current_art_config_file_)
1528 current_art_pset_ = art_pset;
// NOTE(review): two alternative constructions follow (with and without the shared memory
// keys); the condition selecting between them is not visible here.
1530 current_art_config_file_ = std::make_shared<art_config_file>(art_pset, GetKey(), GetBroadcastKey());
1532 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
1534 TLOG(TLVL_DEBUG + 32) <<
"UpdateArtConfiguration END";
1537 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_()
const
1539 std::ostringstream oss;
1540 oss << app_name <<
" statistics:" << std::endl;
1542 artdaq::MonitoredQuantityPtr mqPtr =
1543 artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1544 if (mqPtr.get() !=
nullptr)
1546 artdaq::MonitoredQuantityStats stats;
1547 mqPtr->getStats(stats);
1548 oss <<
" Event statistics: " << stats.recentSampleCount <<
" events released at " << stats.recentSampleRate
1549 <<
" events/sec, effective data rate = "
1550 << (stats.recentValueRate / 1024.0 / 1024.0)
1551 <<
" MB/sec, monitor window = " << stats.recentDuration
1552 <<
" sec, min::max event size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1553 <<
"::" << (stats.recentValueMax / 1024.0 / 1024.0) <<
" MB" << std::endl;
1554 if (stats.recentSampleRate > 0.0)
1556 oss <<
" Average time per event: ";
1557 oss <<
" elapsed time = " << (1.0 / stats.recentSampleRate) <<
" sec" << std::endl;
1561 mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1562 if (mqPtr.get() !=
nullptr)
1564 artdaq::MonitoredQuantityStats stats;
1565 mqPtr->getStats(stats);
1566 oss <<
" Fragment statistics: " << stats.recentSampleCount <<
" fragments received at " << stats.recentSampleRate
1567 <<
" fragments/sec, effective data rate = "
1568 << (stats.recentValueRate / 1024.0 / 1024.0)
1569 <<
" MB/sec, monitor window = " << stats.recentDuration
1570 <<
" sec, min::max fragment size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1571 <<
"::" << (stats.recentValueMax / 1024.0 / 1024.0) <<
" MB" << std::endl;
1574 oss <<
" Event counts: Run -- " << run_event_count_ <<
" Total, " << run_incomplete_event_count_ <<
" Incomplete."
1575 <<
" Subrun -- " << subrun_event_count_ <<
" Total, " << subrun_incomplete_event_count_ <<
" Incomplete. "
void addMonitoredQuantityName(std::string const &statKey)
Add a MonitoredQuantity name to the list.
void AddInitFragment(FragmentPtr &frag)
Set the stored Init fragment, if one has not already been set.
void ShutdownArtProcesses(std::set< pid_t > &pids)
Shutdown a set of art processes.
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void UpdateArtConfiguration(fhicl::ParameterSet art_pset)
Updates the internally-stored copy of the art configuration.
pid_t StartArtProcess(fhicl::ParameterSet pset, size_t process_index)
Start one art process.
void StartArt()
Start all the art processes.
subrun_id_t GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
Get the subrun number that the given Sequence ID would be assigned to.
SharedMemoryEventManager(const fhicl::ParameterSet &pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
void RunArt(const std::shared_ptr< art_config_file > &config_file, size_t process_index, const std::shared_ptr< std::atomic< pid_t >> &pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
void rolloverSubrun()
Add a subrun transition immediately after the highest currently defined sequence ID.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
static const std::string FRAGMENTS_RECEIVED_STAT_KEY
Key for Fragments Received MonitoredQuantity.
bool createCollectors(fhicl::ParameterSet const &pset, int defaultReportIntervalFragments, double defaultReportIntervalSeconds, double defaultMonitorWindow, std::string const &primaryStatKeyName)
Create MonitoredQuantity objects for all names registered with the StatisticsHelper.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
static const std::string EVENTS_RELEASED_STAT_KEY
Key for the Events Released MonitoredQuantity.
uint32_t GetBroadcastKey()
Gets the shared memory key of the broadcast SharedMemoryManager.
bool endOfData()
Indicate that the end of input has been reached to the art processes.
RawEvent::subrun_id_t subrun_id_t
Copy RawEvent::subrun_id_t into local scope.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.
void CheckPendingBuffers()
Check for buffers which are ready to be marked incomplete and released to art and issue tokens for an...