00001
00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
00003
00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
00005 #include "artdaq-core/Core/StatisticsCollection.hh"
00006 #include "artdaq-core/Utilities/TraceLock.hh"
00007 #include <sys/wait.h>
00008
// Custom TRACE level; not referenced in the visible part of this file.
00009 #define TLVL_BUFFER 40
// Custom TRACE level used by the "obtaining/obtained buffer_mutexes lock" messages below.
00010 #define TLVL_BUFLCK 41
00011
// Out-of-class definition providing storage for SharedMemoryEventManager::sequence_id_mutex_.
00012 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00013
00014 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00015 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00016 pset.get<size_t>("buffer_count"),
00017 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00018 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00019 !pset.get<bool>("broadcast_mode", false))
00020 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00021 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00022 , queue_size_(pset.get<size_t>("buffer_count"))
00023 , run_id_(0)
00024 , subrun_id_(0)
00025 , subrun_rollover_event_(Fragment::InvalidSequenceID)
00026 , last_released_event_(0)
00027 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00028 , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
00029 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00030 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00031 , running_(false)
00032 , buffer_writes_pending_()
00033 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00034 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00035 , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
00036 , metric_data_()
00037 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00038 , run_event_count_(0)
00039 , run_incomplete_event_count_(0)
00040 , subrun_event_count_(0)
00041 , subrun_incomplete_event_count_(0)
00042 , oversize_fragment_count_(0)
00043 , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
00044 , art_processes_()
00045 , restart_art_(false)
00046 , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
00047 , manual_art_(pset.get<bool>("manual_art", false))
00048 , current_art_pset_(art_pset)
00049 , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
00050 , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
00051 , requests_(nullptr)
00052 , data_pset_(pset)
00053 , dropped_data_()
00054 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00055 pset.get<size_t>("broadcast_buffer_count", 10),
00056 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00057 pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
00058 {
00059 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00060 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00061
00062 if (pset.get<bool>("use_art", true) == false)
00063 {
00064 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
00065 num_art_processes_ = 0;
00066 }
00067 else
00068 {
00069 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
00070 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
00071 }
00072 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00073
00074 if (overwrite_mode_ && num_art_processes_ > 0)
00075 {
00076 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
00077 }
00078 else if (overwrite_mode_)
00079 {
00080 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
00081 }
00082
00083 for (size_t ii = 0; ii < size(); ++ii)
00084 {
00085 buffer_writes_pending_[ii] = 0;
00086 }
00087
00088 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00089
00090 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
00091 SetRank(my_rank);
00092 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
00093
00094
00095 TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
00096 }
00097
00098 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00099 {
00100 TLOG(TLVL_TRACE) << "DESTRUCTOR";
00101 if (running_) endOfData();
00102 TLOG(TLVL_TRACE) << "Destructor END";
00103 }
00104
00105 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00106 {
00107 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
00108 << ", sequence_id=" << frag.sequence_id;
00109 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00110 TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
00111 if (buffer == -1) return false;
00112 if (buffer == -2)
00113 {
00114 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
00115 return true;
00116 }
00117
00118 auto hdr = getEventHeader_(buffer);
00119 if (update_run_ids_)
00120 {
00121 hdr->run_id = run_id_;
00122 hdr->subrun_id = subrun_id_;
00123 }
00124
00125 TLOG(TLVL_TRACE) << "AddFragment before Write calls";
00126 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00127
00128 TLOG(TLVL_TRACE) << "Checking for complete event";
00129 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00130 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00131 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00132 << ", fragmentCount=" << fragmentCount
00133 << ", num_fragments_per_event=" << num_fragments_per_event_
00134 << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
00135
00136 complete_buffer_(buffer);
00137 if (requests_) requests_->SendRequest(true);
00138
00139 TLOG(TLVL_TRACE) << "AddFragment END";
00140 return true;
00141 }
00142
00143 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00144 {
00145 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
00146 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00147 auto data = frag->headerAddress();
00148 auto start = std::chrono::steady_clock::now();
00149 bool sts = false;
00150 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00151 {
00152 sts = AddFragment(hdr, data);
00153 if (!sts) usleep(1000);
00154 }
00155 if (!sts)
00156 {
00157 outfrag = std::move(frag);
00158 }
00159 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
00160 return sts;
00161 }
00162
00163 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00164 {
00165 TLOG(14) << "WriteFragmentHeader BEGIN";
00166 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00167
00168 if (buffer < 0)
00169 {
00170 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00171 if (buffer == -2)
00172 {
00173 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
00174 }
00175 else
00176 {
00177 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
00178 }
00179 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
00180
00181 TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
00182 return dropped_data_[frag.fragment_id]->dataBegin();
00183 }
00184
00185
00186 buffer_writes_pending_[buffer]++;
00187
00188 if (metricMan)
00189 {
00190 metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
00191 }
00192
00193 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
00194
00195 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
00196
00197 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
00198
00199
00200 auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00201 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00202
00203 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00204 if (frag.word_count - frag.num_words() > 0)
00205 {
00206 auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00207
00208 if (!sts)
00209 {
00210 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
00211 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
00212 TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
00213 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
00214
00215 oversize_fragment_count_++;
00216
00217 if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
00218 {
00219 throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
00220 }
00221
00222 TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
00223 return dropped_data_[frag.fragment_id]->dataBegin();
00224 }
00225 }
00226 TLOG(14) << "WriteFragmentHeader END";
00227 return pos;
00228
00229 }
00230
00231 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00232 {
00233 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
00234 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00235 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00236 if (buffer == -2) { return; }
00237
00238 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
00239
00240 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
00241
00242 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
00243
00244
00245
00246 TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
00247 auto hdr = getEventHeader_(buffer);
00248 if (update_run_ids_)
00249 {
00250 hdr->run_id = run_id_;
00251 hdr->subrun_id = subrun_id_;
00252 }
00253
00254 buffer_writes_pending_[buffer]--;
00255 if (buffer_writes_pending_[buffer] != 0)
00256 {
00257 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
00258 return;
00259 }
00260 TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
00261 auto frag_count = GetFragmentCount(frag.sequence_id);
00262 hdr->is_complete = frag_count == num_fragments_per_event_;
00263 TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
00264 #if ART_SUPPORTS_DUPLICATE_EVENTS
00265 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
00266 {
00267 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00268 }
00269 #endif
00270
00271 complete_buffer_(buffer);
00272 if (requests_) requests_->SendRequest(true);
00273 TLOG(TLVL_TRACE) << "DoneWritingFragment END";
00274 }
00275
00276 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00277 {
00278 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00279 }
00280
00281 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00282 {
00283 if (buffer == -1) return 0;
00284 ResetReadPos(buffer);
00285 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00286
00287 size_t count = 0;
00288
00289 while (MoreDataInBuffer(buffer))
00290 {
00291 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00292 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00293 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00294 TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
00295 ++count;
00296 }
00297
00298 return count;
00299 }
00300
00301 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
00302 {
00303 do
00304 {
00305 auto start_time = std::chrono::steady_clock::now();
00306 send_init_frag_();
00307 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
00308
00309 pid_t pid = 0;
00310
00311 if (!manual_art_)
00312 {
00313 char* filename = new char[config_file->getFileName().length() + 1];
00314 strcpy(filename, config_file->getFileName().c_str());
00315
00316 std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
00317 pid = fork();
00318 if (pid == 0)
00319 {
00320
00321
00322
00323
00324 std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
00325 std::string envVarValue = std::to_string(GetPartitionNumber());
00326 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
00327 {
00328 TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
00329 << "\" in the environment of a child art process. "
00330 << "This may result in incorrect TCP port number "
00331 << "assignments or other issues, and data may "
00332 << "not flow through the system correctly.";
00333 }
00334
00335 execvp("art", &args[0]);
00336 delete[] filename;
00337 exit(1);
00338 }
00339 delete[] filename;
00340 }
00341 else
00342 {
00343
00344 std::cout << "Please run the following command in a separate terminal:" << std::endl
00345 << "art -c " << config_file->getFileName() << std::endl
00346 << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
00347 << "Finally, return to this window and enter the pid: " << std::endl;
00348 std::cin >> pid;
00349 }
00350 *pid_out = pid;
00351
00352 TLOG(TLVL_INFO) << "PID of new art process is " << pid;
00353 {
00354 std::unique_lock<std::mutex> lk(art_process_mutex_);
00355 art_processes_.insert(pid);
00356 }
00357 siginfo_t status;
00358 auto sts = waitid(P_PID, pid, &status, WEXITED);
00359 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
00360 {
00361 std::unique_lock<std::mutex> lk(art_process_mutex_);
00362 art_processes_.erase(pid);
00363 }
00364 if (sts < 0)
00365 {
00366 TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
00367 }
00368 else if (status.si_code == CLD_EXITED && status.si_status == 0)
00369 {
00370 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
00371 }
00372 else
00373 {
00374 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
00375 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
00376
00377 auto exit_type = "exited with status code";
00378 switch (status.si_code)
00379 {
00380 case CLD_DUMPED:
00381 case CLD_KILLED:
00382 exit_type = "was killed with signal";
00383 break;
00384 case CLD_EXITED:
00385 default:
00386 break;
00387 }
00388
00389 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
00390 << "art process " << pid << " " << exit_type << " " << status.si_status
00391 << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
00392 << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
00393 << (restart_art_ ? "restarting" : "not restarting");
00394 }
00395 } while (restart_art_);
00396 }
00397
00398 void artdaq::SharedMemoryEventManager::StartArt()
00399 {
00400 restart_art_ = always_restart_art_;
00401 if (num_art_processes_ == 0) return;
00402 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00403 {
00404 StartArtProcess(current_art_pset_);
00405 }
00406 }
00407
00408 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00409 {
00410 static std::mutex start_art_mutex;
00411 std::unique_lock<std::mutex> lk(start_art_mutex);
00412
00413 restart_art_ = always_restart_art_;
00414 auto initialCount = GetAttachedCount();
00415 auto startTime = std::chrono::steady_clock::now();
00416
00417 if (pset != current_art_pset_ || !current_art_config_file_)
00418 {
00419 current_art_pset_ = pset;
00420 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00421 }
00422 std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
00423 boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
00424 thread.detach();
00425
00426 auto currentCount = GetAttachedCount() - initialCount;
00427 while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
00428 {
00429 usleep(10000);
00430 currentCount = GetAttachedCount() - initialCount;
00431 }
00432 if ((currentCount < 1 || *pid <= 0) && manual_art_)
00433 {
00434 TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
00435 return 0;
00436 }
00437 else if (currentCount < 1 || *pid <= 0)
00438 {
00439 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00440 << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
00441 return 0;
00442 }
00443 else
00444 {
00445 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00446 << TimeUtils::GetElapsedTime(startTime) << " seconds.";
00447
00448 return *pid;
00449 }
00450
00451 }
00452
00453 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
00454 {
00455 restart_art_ = false;
00456
00457
00458
00459 auto check_pids = [&](bool print) {
00460
00461 for (auto pid = pids.begin(); pid != pids.end();)
00462 {
00463
00464
00465 std::unique_lock<std::mutex> lk(art_process_mutex_);
00466 if (*pid <= 0)
00467 {
00468 TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
00469 << ") from the shutdown list.";
00470 pid = pids.erase(pid);
00471 }
00472 else if (kill(*pid, 0) < 0)
00473 {
00474 pid = pids.erase(pid);
00475 }
00476 else
00477 {
00478 if (print) std::cout << *pid << " ";
00479 ++pid;
00480 }
00481 }
00482 };
00483 check_pids(false);
00484 if (pids.size() == 0)
00485 {
00486 TLOG(14) << "All art processes already exited, nothing to do.";
00487 usleep(1000);
00488 return;
00489 }
00490
00491 if (!manual_art_)
00492 {
00493 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
00494 for (auto pid : pids)
00495 {
00496 TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
00497 kill(pid, SIGQUIT);
00498 }
00499
00500 int graceful_wait_ms = 5000;
00501 int int_wait_ms = 1000;
00502
00503 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
00504 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00505 {
00506 usleep(1000);
00507
00508 check_pids(false);
00509 if (pids.size() == 0)
00510 {
00511 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00512 return;
00513 }
00514 }
00515
00516 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
00517 for (auto pid : pids)
00518 {
00519 kill(pid, SIGINT);
00520 }
00521
00522 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
00523 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00524 {
00525 usleep(1000);
00526
00527 check_pids(false);
00528
00529 if (pids.size() == 0)
00530 {
00531 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00532 return;
00533 }
00534 }
00535
00536 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
00537 while (pids.size() > 0)
00538 {
00539 kill(*pids.begin(), SIGKILL);
00540 usleep(1000);
00541
00542 check_pids(false);
00543 }
00544 }
00545 else
00546 {
00547 std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
00548 while (pids.size() > 0)
00549 {
00550 std::cout << "The following PIDs are running: ";
00551 check_pids(true);
00552 std::cout << std::endl;
00553 std::string ignored;
00554 std::cin >> ignored;
00555 }
00556 }
00557 }
00558
00559 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00560 {
00561 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
00562 if (restart_art_ || !always_restart_art_)
00563 {
00564 endOfData();
00565 }
00566 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00567 {
00568 broadcasts_.MarkBufferEmpty(ii, true);
00569 }
00570 if (newRun == 0) newRun = run_id_ + 1;
00571
00572 if (art_pset != current_art_pset_ || !current_art_config_file_)
00573 {
00574 current_art_pset_ = art_pset;
00575 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00576 }
00577
00578 if (n_art_processes != -1)
00579 {
00580 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
00581 num_art_processes_ = n_art_processes;
00582 }
00583 startRun(newRun);
00584 TLOG(TLVL_DEBUG) << "ReconfigureArt END";
00585 }
00586
00587 bool artdaq::SharedMemoryEventManager::endOfData()
00588 {
00589 running_ = false;
00590 init_fragment_.reset(nullptr);
00591 TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
00592 restart_art_ = false;
00593
00594 size_t initialStoreSize = GetIncompleteEventCount();
00595 TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
00596 << " stale events from the SharedMemoryEventManager.";
00597 int counter = initialStoreSize;
00598 while (active_buffers_.size() > 0 && counter > 0)
00599 {
00600 complete_buffer_(*active_buffers_.begin());
00601 counter--;
00602 }
00603 TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00604 << " stale events in the SharedMemoryEventManager.";
00605
00606
00607 TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
00608 auto start = std::chrono::steady_clock::now();
00609 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00610 auto end_of_data_wait_us = art_event_processing_time_us_ * lastReadCount;
00611
00612 auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
00613
00614
00615 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
00616 {
00617 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00618 if (temp != lastReadCount)
00619 {
00620 TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
00621 lastReadCount = temp;
00622 start = std::chrono::steady_clock::now();
00623 }
00624 if (lastReadCount > 0) {
00625 TRACE(19,"About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu",outstanding_buffer_wait_time,lastReadCount,size(),end_of_data_wait_us );
00626 usleep(outstanding_buffer_wait_time);
00627 }
00628 }
00629
00630 TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
00631 << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
00632
00633 TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
00634 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
00635 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00636 if (!success)
00637 {
00638 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
00639 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00640 {
00641 broadcasts_.MarkBufferEmpty(ii, true);
00642 }
00643 broadcastFragment_(std::move(outFrag), outFrag);
00644 }
00645 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
00646
00647 if (get_art_process_count_() > 0)
00648 {
00649 TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
00650 if (end_of_data_wait_us == 0)
00651 {
00652 TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
00653 end_of_data_wait_us = 100 * 1000000;
00654 }
00655
00656 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
00657 for (size_t ii = 0; ii < sleep_count; ++ii)
00658 {
00659 usleep(10000);
00660 if (get_art_process_count_() == 0) break;
00661 }
00662 }
00663
00664 while (get_art_process_count_() > 0)
00665 {
00666 TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
00667
00668 ShutdownArtProcesses(art_processes_);
00669 }
00670 TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
00671
00672 ResetAttachedCount();
00673
00674 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
00675 for (size_t ii = 0; ii < size(); ++ii)
00676 {
00677 MarkBufferEmpty(ii, true);
00678 }
00679
00680
00681
00682
00683
00684
00685 released_incomplete_events_.clear();
00686
00687 TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestReceiver";
00688 requests_.reset(nullptr);
00689
00690 TLOG(TLVL_DEBUG) << "endOfData END";
00691 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
00692 return true;
00693 }
00694
00695 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00696 {
00697 running_ = true;
00698 init_fragment_.reset(nullptr);
00699 TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
00700 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00701 {
00702 broadcasts_.MarkBufferEmpty(ii, true);
00703 }
00704 StartArt();
00705 run_id_ = runID;
00706 subrun_id_ = 1;
00707 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00708 last_released_event_ = 0;
00709 requests_.reset(new RequestSender(data_pset_));
00710 if (requests_)
00711 {
00712 requests_->SendRoutingToken(queue_size_);
00713 }
00714 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00715 << ", max queue size = "
00716 << queue_size_
00717 << ", queue size = "
00718 << GetLockedBufferCount();
00719 if (metricMan)
00720 {
00721 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00722 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00723 }
00724 }
00725
00726 void artdaq::SharedMemoryEventManager::startSubrun()
00727 {
00728 ++subrun_id_;
00729 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00730 if (metricMan)
00731 {
00732 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00733 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00734 }
00735 }
00736
00737 bool artdaq::SharedMemoryEventManager::endRun()
00738 {
00739 TLOG(TLVL_INFO) << "Ending run " << run_id_;
00740 FragmentPtr endOfRunFrag(new
00741 Fragment(static_cast<size_t>
00742 (ceil(sizeof(my_rank) /
00743 static_cast<double>(sizeof(Fragment::value_type))))));
00744
00745 TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
00746 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00747 *endOfRunFrag->dataBegin() = my_rank;
00748 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00749
00750 TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
00751 run_event_count_ = 0;
00752 run_incomplete_event_count_ = 0;
00753 oversize_fragment_count_ = 0;
00754 return true;
00755 }
00756
00757 bool artdaq::SharedMemoryEventManager::endSubrun()
00758 {
00759 TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
00760 std::unique_ptr<artdaq::Fragment>
00761 endOfSubrunFrag(new
00762 Fragment(static_cast<size_t>
00763 (ceil(sizeof(my_rank) /
00764 static_cast<double>(sizeof(Fragment::value_type))))));
00765
00766 TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
00767 endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
00768 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00769 *endOfSubrunFrag->dataBegin() = my_rank;
00770
00771 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00772
00773 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
00774 subrun_event_count_ = 0;
00775 subrun_incomplete_event_count_ = 0;
00776
00777 return true;
00778 }
00779
00780 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
00781 {
00782
00783 if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
00784
00785 if (boundary < last_released_event_)
00786 {
00787 auto logLevel = TLVL_ERROR;
00788 bool processAnyway = false;
00789 if (last_released_event_ - boundary < 100)
00790 {
00791 logLevel = TLVL_WARNING;
00792 processAnyway = true;
00793 }
00794 TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (last_released_event="
00795 << last_released_event_ << ",requested_rollover_boundary=" << boundary << ").";
00796 if (!processAnyway) return;
00797 }
00798 TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
00799
00800
00801
00802
00803
00804
00805
00806
00807 if (boundary == last_released_event_ + 1) {
00808 TLOG(TLVL_INFO) << "rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
00809 ", boundary is sequence id " << boundary << ", so will start a new subrun here";
00810 endSubrun();
00811 startSubrun();
00812 subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
00813 }
00814 else {
00815 subrun_rollover_event_ = boundary;
00816 }
00817 }
00818
00819 void artdaq::SharedMemoryEventManager::sendMetrics()
00820 {
00821 if (metricMan)
00822 {
00823 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00824 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00825 }
00826
00827 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00828 {
00829 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00830 return;
00831
00832 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00833 std::ostringstream oss;
00834 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00835 for (auto& ev : active_buffers_)
00836 {
00837 auto hdr = getEventHeader_(ev);
00838 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00839 }
00840 TLOG(TLVL_DEBUG) << oss.str();
00841 }
00842 }
00843
00844 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00845 {
00846 TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
00847 auto buffer = broadcasts_.GetBufferForWriting(false);
00848 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
00849 auto start_time = std::chrono::steady_clock::now();
00850 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00851 {
00852 usleep(10000);
00853 buffer = broadcasts_.GetBufferForWriting(false);
00854 }
00855 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
00856 if (buffer == -1)
00857 {
00858 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
00859 outFrag.swap(frag);
00860 return false;
00861 }
00862
00863 TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
00864 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00865 hdr->run_id = run_id_;
00866 hdr->subrun_id = subrun_id_;
00867 hdr->sequence_id = frag->sequenceID();
00868 hdr->is_complete = true;
00869 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00870
00871 TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
00872 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00873
00874 TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
00875 broadcasts_.MarkBufferFull(buffer, -1);
00876 outFrag.swap(frag);
00877 TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
00878 return true;
00879 }
00880
00881 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00882 {
00883 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00884 }
00885
00886 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00887 {
00888 TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
00889 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00890
00891 TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
00892
00893 auto buffers = GetBuffersOwnedByManager();
00894 for (auto& buf : buffers)
00895 {
00896 auto hdr = getEventHeader_(buf);
00897 if (hdr->sequence_id == seqID)
00898 {
00899 TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
00900 return buf;
00901 }
00902 }
00903
00904 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00905 if (released_incomplete_events_.count(seqID))
00906 {
00907 TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
00908 return -2;
00909 }
00910 #endif
00911
00912 if (!create_new) return -1;
00913
00914 check_pending_buffers_(lk);
00915 int new_buffer = GetBufferForWriting(false);
00916
00917 if (new_buffer == -1)
00918 {
00919 new_buffer = GetBufferForWriting(overwrite_mode_);
00920 }
00921
00922 if (new_buffer == -1) return -1;
00923 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
00924 std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
00925 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
00926
00927 auto hdr = getEventHeader_(new_buffer);
00928 hdr->is_complete = false;
00929 hdr->run_id = run_id_;
00930 hdr->subrun_id = subrun_id_;
00931 hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
00932 hdr->sequence_id = seqID;
00933 buffer_writes_pending_[new_buffer] = 0;
00934 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00935 SetMFIteration("Sequence ID " + std::to_string(seqID));
00936
00937 TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
00938 active_buffers_.insert(new_buffer);
00939 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
00940 << size() << ","
00941 << ReadReadyCount() << ","
00942 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
00943 << WriteReadyCount(false) << ","
00944 << pending_buffers_.size() << ","
00945 << active_buffers_.size() << ")";
00946
00947 if (requests_)
00948 {
00949 if (timestamp != Fragment::InvalidTimestamp)
00950 {
00951 requests_->AddRequest(seqID, timestamp);
00952 }
00953
00954
00955 else
00956 {
00957 requests_->SendRequest();
00958 }
00959 }
00960 TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
00961 return new_buffer;
00962 }
00963
00964 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00965 {
00966 if (buffer == -1) return true;
00967 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
00968 {
00969 return true;
00970 }
00971 ResetReadPos(buffer);
00972 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00973 return MoreDataInBuffer(buffer);
00974 }
00975
00976 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
00977 {
00978 auto hdr = getEventHeader_(buffer);
00979 if (hdr->is_complete)
00980 {
00981 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
00982
00983 {
00984 TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
00985
00986 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
00987 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00988 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
00989 active_buffers_.erase(buffer);
00990 pending_buffers_.insert(buffer);
00991
00992 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
00993 << size() << ","
00994 << ReadReadyCount() << ","
00995 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
00996 << WriteReadyCount(false) << ","
00997 << pending_buffers_.size() << ","
00998 << active_buffers_.size() << ")";
00999 }
01000 if (requests_)
01001 {
01002 requests_->RemoveRequest(hdr->sequence_id);
01003 }
01004 }
01005 CheckPendingBuffers();
01006 }
01007
01008 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
01009 {
01010 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
01011 }
01012
01013 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
01014 {
01015 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
01016 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
01017 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
01018 check_pending_buffers_(lk);
01019 }
01020
01021 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
01022 {
01023 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
01024
01025 auto buffers = GetBuffersOwnedByManager();
01026 for (auto buf : buffers)
01027 {
01028 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
01029 {
01030 TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
01031 auto hdr = getEventHeader_(buf);
01032 if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
01033 {
01034 if (requests_)
01035 {
01036 requests_->RemoveRequest(hdr->sequence_id);
01037 }
01038 TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
01039 active_buffers_.erase(buf);
01040 pending_buffers_.insert(buf);
01041 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01042 << size() << ","
01043 << ReadReadyCount() << ","
01044 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01045 << WriteReadyCount(false) << ","
01046 << pending_buffers_.size() << ","
01047 << active_buffers_.size() << ")";
01048
01049 subrun_incomplete_event_count_++;
01050 run_incomplete_event_count_++;
01051 if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
01052 if (!released_incomplete_events_.count(hdr->sequence_id))
01053 {
01054 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
01055 }
01056 else
01057 {
01058 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
01059 }
01060 TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
01061 }
01062
01063 }
01064 }
01065
01066 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
01067
01068
01069 if (ReadyForWrite(false))
01070 {
01071 for (auto buf : active_buffers_)
01072 {
01073 auto hdr = getEventHeader_(buf);
01074 TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << hdr->sequence_id << ", ACTIVE";
01075 if (hdr->sequence_id < lowestSeqId)
01076 {
01077 lowestSeqId = hdr->sequence_id;
01078 }
01079 }
01080 TLOG(TLVL_TRACE) << "Lowest SeqID held: " << lowestSeqId;
01081 }
01082
01083 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
01084 sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
01085
01086 auto counter = 0;
01087 double eventSize = 0;
01088 for (auto buf : sorted_buffers)
01089 {
01090 auto hdr = getEventHeader_(buf);
01091 if (hdr->sequence_id > lowestSeqId) break;
01092
01093 if (hdr->sequence_id >= subrun_rollover_event_)
01094 {
01095 TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << "), last released event is " << last_released_event_ << ".";
01096 endSubrun();
01097 startSubrun();
01098 }
01099 if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
01100
01101 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
01102 << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
01103
01104 TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
01105 MarkBufferFull(buf);
01106 subrun_event_count_++;
01107 run_event_count_++;
01108 counter++;
01109 eventSize += BufferDataSize(buf);
01110 pending_buffers_.erase(buf);
01111 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01112 << size() << ","
01113 << ReadReadyCount() << ","
01114 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01115 << WriteReadyCount(false) << ","
01116 << pending_buffers_.size() << ","
01117 << active_buffers_.size() << ")";
01118 }
01119
01120 if (requests_)
01121 {
01122 auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
01123 auto available_buffers = WriteReadyCount(overwrite_mode_);
01124
01125 TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
01126 << ", tokens_to_send: " << available_buffers - outstanding_tokens;
01127
01128 if (available_buffers > outstanding_tokens)
01129 {
01130 auto tokens_to_send = available_buffers - outstanding_tokens;
01131
01132 while (tokens_to_send > 0)
01133 {
01134 TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
01135 requests_->SendRoutingToken(1);
01136 tokens_to_send--;
01137 }
01138 }
01139 }
01140
01141 metric_data_.event_count += counter;
01142 metric_data_.event_size += eventSize;
01143
01144 if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500)
01145 {
01146 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
01147 metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
01148 if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
01149 metric_data_ = MetricData();
01150
01151 metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
01152 metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
01153 metricMan->sendMetric("Events Released to art this subrun", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
01154 metricMan->sendMetric("Incomplete Events Released to art this subrun", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
01155 if(requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
01156
01157 auto bufferReport = GetBufferReport();
01158 int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
01159 int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
01160 int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
01161 int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
01162 auto total = size();
01163 TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
01164
01165 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
01166 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
01167 metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
01168 metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
01169 if(total > 0)
01170 {
01171 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
01172 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
01173 }
01174
01175 last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
01176 }
01177 TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
01178 }
01179
01180 void artdaq::SharedMemoryEventManager::send_init_frag_()
01181 {
01182 if (init_fragment_ != nullptr)
01183 {
01184 TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
01185
01186 #if 0
01187 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
01188 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
01189 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
01190 ostream.close();
01191 #endif
01192
01193 broadcastFragment_(std::move(init_fragment_), init_fragment_);
01194 TLOG(TLVL_TRACE) << "Init Fragment sent";
01195 }
01196 else if (send_init_fragments_)
01197 {
01198 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
01199 }
01200 }
01201
01202 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
01203 {
01204 if (!init_fragment_ || init_fragment_ == nullptr)
01205 {
01206 init_fragment_.swap(frag);
01207 send_init_frag_();
01208 }
01209 }
01210
01211 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
01212 {
01213 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
01214 if (art_pset != current_art_pset_ || !current_art_config_file_)
01215 {
01216 current_art_pset_ = art_pset;
01217 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
01218 }
01219 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
01220 }
01221
01222 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
01223 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
01224 #endif