00001
00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
00003
00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
00005 #include "artdaq-core/Core/StatisticsCollection.hh"
00006 #include "artdaq-core/Utilities/TraceLock.hh"
00007 #include <sys/wait.h>
00008
00009 #define TLVL_BUFFER 40
00010 #define TLVL_BUFLCK 41
00011
00012 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00013
00014 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00015 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00016 pset.get<size_t>("buffer_count"),
00017 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00018 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00019 !pset.get<bool>("broadcast_mode", false))
00020 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00021 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00022 , queue_size_(pset.get<size_t>("buffer_count"))
00023 , run_id_(0)
00024 , subrun_id_(0)
00025 , subrun_rollover_event_(Fragment::InvalidSequenceID)
00026 , last_released_event_(0)
00027 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00028 , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
00029 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00030 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00031 , running_(false)
00032 , buffer_writes_pending_()
00033 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00034 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00035 , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
00036 , metric_data_()
00037 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00038 , run_event_count_(0)
00039 , run_incomplete_event_count_(0)
00040 , subrun_event_count_(0)
00041 , subrun_incomplete_event_count_(0)
00042 , oversize_fragment_count_(0)
00043 , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
00044 , art_processes_()
00045 , restart_art_(false)
00046 , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
00047 , manual_art_(pset.get<bool>("manual_art", false))
00048 , current_art_pset_(art_pset)
00049 , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
00050 , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
00051 , requests_(nullptr)
00052 , data_pset_(pset)
00053 , dropped_data_()
00054 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00055 pset.get<size_t>("broadcast_buffer_count", 10),
00056 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00057 pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
00058 {
00059 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00060 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00061
00062 if (pset.get<bool>("use_art", true) == false)
00063 {
00064 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
00065 num_art_processes_ = 0;
00066 }
00067 else
00068 {
00069 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
00070 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
00071 }
00072 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00073
00074 if (overwrite_mode_ && num_art_processes_ > 0)
00075 {
00076 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
00077 }
00078 else if (overwrite_mode_)
00079 {
00080 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
00081 }
00082
00083 for (size_t ii = 0; ii < size(); ++ii)
00084 {
00085 buffer_writes_pending_[ii] = 0;
00086 }
00087
00088 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00089
00090 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
00091 SetRank(my_rank);
00092 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
00093
00094
00095 TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
00096 }
00097
00098 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00099 {
00100 TLOG(TLVL_TRACE) << "DESTRUCTOR";
00101 if (running_) endOfData();
00102 TLOG(TLVL_TRACE) << "Destructor END";
00103 }
00104
00105 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00106 {
00107 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
00108 << ", sequence_id=" << frag.sequence_id;
00109 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00110 TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
00111 if (buffer == -1) return false;
00112 if (buffer == -2)
00113 {
00114 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
00115 return true;
00116 }
00117
00118 auto hdr = getEventHeader_(buffer);
00119 if (update_run_ids_)
00120 {
00121 hdr->run_id = run_id_;
00122 hdr->subrun_id = subrun_id_;
00123 }
00124
00125 TLOG(TLVL_TRACE) << "AddFragment before Write calls";
00126 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00127
00128 TLOG(TLVL_TRACE) << "Checking for complete event";
00129 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00130 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00131 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00132 << ", fragmentCount=" << fragmentCount
00133 << ", num_fragments_per_event=" << num_fragments_per_event_
00134 << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
00135
00136 complete_buffer_(buffer);
00137 if (requests_) requests_->SendRequest(true);
00138
00139 TLOG(TLVL_TRACE) << "AddFragment END";
00140 return true;
00141 }
00142
00143 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00144 {
00145 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
00146 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00147 auto data = frag->headerAddress();
00148 auto start = std::chrono::steady_clock::now();
00149 bool sts = false;
00150 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00151 {
00152 sts = AddFragment(hdr, data);
00153 if (!sts) usleep(1000);
00154 }
00155 if (!sts)
00156 {
00157 outfrag = std::move(frag);
00158 }
00159 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
00160 return sts;
00161 }
00162
00163 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00164 {
00165 TLOG(14) << "WriteFragmentHeader BEGIN";
00166 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00167
00168 if (buffer < 0)
00169 {
00170 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00171 if (buffer == -2)
00172 {
00173 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
00174 }
00175 else
00176 {
00177 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
00178 }
00179 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
00180
00181 TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
00182 return dropped_data_[frag.fragment_id]->dataBegin();
00183 }
00184
00185
00186 buffer_writes_pending_[buffer]++;
00187
00188 if (metricMan)
00189 {
00190 metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
00191 }
00192
00193 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
00194
00195 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
00196
00197 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
00198
00199
00200 auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00201 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00202
00203 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00204 if (frag.word_count - frag.num_words() > 0)
00205 {
00206 auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00207
00208 if (!sts)
00209 {
00210 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
00211 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
00212 TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
00213 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
00214
00215 oversize_fragment_count_++;
00216
00217 if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
00218 {
00219 throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
00220 }
00221
00222 TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
00223 return dropped_data_[frag.fragment_id]->dataBegin();
00224 }
00225 }
00226 TLOG(14) << "WriteFragmentHeader END";
00227 return pos;
00228
00229 }
00230
00231 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00232 {
00233 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
00234 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00235 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00236 if (buffer == -2) { return; }
00237
00238 {
00239 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
00240
00241 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
00242
00243 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
00244
00245
00246
00247 TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
00248 auto hdr = getEventHeader_(buffer);
00249 if (update_run_ids_)
00250 {
00251 hdr->run_id = run_id_;
00252 hdr->subrun_id = subrun_id_;
00253 }
00254
00255 TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
00256 TouchBuffer(buffer);
00257
00258 buffer_writes_pending_[buffer]--;
00259 if (buffer_writes_pending_[buffer] != 0)
00260 {
00261 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
00262 return;
00263 }
00264 TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
00265 auto frag_count = GetFragmentCount(frag.sequence_id);
00266 hdr->is_complete = frag_count == num_fragments_per_event_;
00267 TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
00268 #if ART_SUPPORTS_DUPLICATE_EVENTS
00269 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
00270 {
00271 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00272 }
00273 #endif
00274 }
00275
00276 complete_buffer_(buffer);
00277 if (requests_) requests_->SendRequest(true);
00278 TLOG(TLVL_TRACE) << "DoneWritingFragment END";
00279 }
00280
00281 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00282 {
00283 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00284 }
00285
00286 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00287 {
00288 if (buffer == -1) return 0;
00289 ResetReadPos(buffer);
00290 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00291
00292 size_t count = 0;
00293
00294 while (MoreDataInBuffer(buffer))
00295 {
00296 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00297 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00298 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00299 TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
00300 ++count;
00301 }
00302
00303 return count;
00304 }
00305
00306 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
00307 {
00308 do
00309 {
00310 auto start_time = std::chrono::steady_clock::now();
00311 send_init_frag_();
00312 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
00313
00314 pid_t pid = 0;
00315
00316 if (!manual_art_)
00317 {
00318 char* filename = new char[config_file->getFileName().length() + 1];
00319 strcpy(filename, config_file->getFileName().c_str());
00320
00321 std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
00322 pid = fork();
00323 if (pid == 0)
00324 {
00325
00326
00327
00328
00329 std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
00330 std::string envVarValue = std::to_string(GetPartitionNumber());
00331 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
00332 {
00333 TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
00334 << "\" in the environment of a child art process. "
00335 << "This may result in incorrect TCP port number "
00336 << "assignments or other issues, and data may "
00337 << "not flow through the system correctly.";
00338 }
00339 envVarKey = "ARTDAQ_APPLICATION_NAME";
00340 envVarValue = app_name;
00341 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
00342 {
00343 TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
00344 << "\" in the environment of a child art process. ";
00345 }
00346
00347 execvp("art", &args[0]);
00348 delete[] filename;
00349 exit(1);
00350 }
00351 delete[] filename;
00352 }
00353 else
00354 {
00355
00356 std::cout << "Please run the following command in a separate terminal:" << std::endl
00357 << "art -c " << config_file->getFileName() << std::endl
00358 << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
00359 << "Finally, return to this window and enter the pid: " << std::endl;
00360 std::cin >> pid;
00361 }
00362 *pid_out = pid;
00363
00364 TLOG(TLVL_INFO) << "PID of new art process is " << pid;
00365 {
00366 std::unique_lock<std::mutex> lk(art_process_mutex_);
00367 art_processes_.insert(pid);
00368 }
00369 siginfo_t status;
00370 auto sts = waitid(P_PID, pid, &status, WEXITED);
00371 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
00372 {
00373 std::unique_lock<std::mutex> lk(art_process_mutex_);
00374 art_processes_.erase(pid);
00375 }
00376 if (sts < 0)
00377 {
00378 TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
00379 }
00380 else if (status.si_code == CLD_EXITED && status.si_status == 0)
00381 {
00382 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
00383 }
00384 else
00385 {
00386 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
00387 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
00388
00389 auto exit_type = "exited with status code";
00390 switch (status.si_code)
00391 {
00392 case CLD_DUMPED:
00393 case CLD_KILLED:
00394 exit_type = "was killed with signal";
00395 break;
00396 case CLD_EXITED:
00397 default:
00398 break;
00399 }
00400
00401 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
00402 << "art process " << pid << " " << exit_type << " " << status.si_status
00403 << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
00404 << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
00405 << (restart_art_ ? "restarting" : "not restarting");
00406 }
00407 } while (restart_art_);
00408 }
00409
00410 void artdaq::SharedMemoryEventManager::StartArt()
00411 {
00412 restart_art_ = always_restart_art_;
00413 if (num_art_processes_ == 0) return;
00414 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00415 {
00416 StartArtProcess(current_art_pset_);
00417 }
00418 }
00419
00420 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00421 {
00422 static std::mutex start_art_mutex;
00423 std::unique_lock<std::mutex> lk(start_art_mutex);
00424
00425 restart_art_ = always_restart_art_;
00426 auto initialCount = GetAttachedCount();
00427 auto startTime = std::chrono::steady_clock::now();
00428
00429 if (pset != current_art_pset_ || !current_art_config_file_)
00430 {
00431 current_art_pset_ = pset;
00432 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00433 }
00434 std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
00435 boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
00436 thread.detach();
00437
00438 auto currentCount = GetAttachedCount() - initialCount;
00439 while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
00440 {
00441 usleep(10000);
00442 currentCount = GetAttachedCount() - initialCount;
00443 }
00444 if ((currentCount < 1 || *pid <= 0) && manual_art_)
00445 {
00446 TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
00447 return 0;
00448 }
00449 else if (currentCount < 1 || *pid <= 0)
00450 {
00451 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00452 << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
00453 return 0;
00454 }
00455 else
00456 {
00457 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00458 << TimeUtils::GetElapsedTime(startTime) << " seconds.";
00459
00460 return *pid;
00461 }
00462
00463 }
00464
00465 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
00466 {
00467 restart_art_ = false;
00468
00469
00470
00471 auto check_pids = [&](bool print) {
00472
00473 std::unique_lock<std::mutex> lk(art_process_mutex_);
00474 for (auto pid = pids.begin(); pid != pids.end();)
00475 {
00476
00477
00478 if (*pid <= 0)
00479 {
00480 TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
00481 << ") from the shutdown list.";
00482 pid = pids.erase(pid);
00483 }
00484 else if (kill(*pid, 0) < 0)
00485 {
00486 pid = pids.erase(pid);
00487 }
00488 else
00489 {
00490 if (print) std::cout << *pid << " ";
00491 ++pid;
00492 }
00493 }
00494 };
00495 auto count_pids = [&]() {
00496 std::unique_lock<std::mutex> lk(art_process_mutex_);
00497 return pids.size();
00498 };
00499 check_pids(false);
00500 if (count_pids() == 0)
00501 {
00502 TLOG(14) << "All art processes already exited, nothing to do.";
00503 usleep(1000);
00504 return;
00505 }
00506
00507 if (!manual_art_)
00508 {
00509 {
00510 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
00511 std::unique_lock<std::mutex> lk(art_process_mutex_);
00512 for (auto pid : pids)
00513 {
00514 TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
00515 kill(pid, SIGQUIT);
00516 }
00517 }
00518
00519 int graceful_wait_ms = 5000;
00520 int int_wait_ms = 1000;
00521
00522 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
00523 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00524 {
00525 usleep(1000);
00526
00527 check_pids(false);
00528 if (count_pids() == 0)
00529 {
00530 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00531 return;
00532 }
00533 }
00534
00535 {
00536 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
00537 std::unique_lock<std::mutex> lk(art_process_mutex_);
00538 for (auto pid : pids)
00539 {
00540 kill(pid, SIGINT);
00541 }
00542 }
00543
00544 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
00545 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00546 {
00547 usleep(1000);
00548
00549 check_pids(false);
00550
00551 if (count_pids() == 0)
00552 {
00553 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00554 return;
00555 }
00556 }
00557
00558 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
00559 while (count_pids() > 0)
00560 {
00561 {
00562 std::unique_lock<std::mutex> lk(art_process_mutex_);
00563 kill(*pids.begin(), SIGKILL);
00564 usleep(1000);
00565 }
00566 check_pids(false);
00567 }
00568 }
00569 else
00570 {
00571 std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
00572 while (count_pids() > 0)
00573 {
00574 std::cout << "The following PIDs are running: ";
00575 check_pids(true);
00576 std::cout << std::endl;
00577 std::string ignored;
00578 std::cin >> ignored;
00579 }
00580 }
00581 }
00582
00583 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00584 {
00585 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
00586 if (restart_art_ || !always_restart_art_)
00587 {
00588 endOfData();
00589 }
00590 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00591 {
00592 broadcasts_.MarkBufferEmpty(ii, true);
00593 }
00594 if (newRun == 0) newRun = run_id_ + 1;
00595
00596 if (art_pset != current_art_pset_ || !current_art_config_file_)
00597 {
00598 current_art_pset_ = art_pset;
00599 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00600 }
00601
00602 if (n_art_processes != -1)
00603 {
00604 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
00605 num_art_processes_ = n_art_processes;
00606 }
00607 startRun(newRun);
00608 TLOG(TLVL_DEBUG) << "ReconfigureArt END";
00609 }
00610
00611 bool artdaq::SharedMemoryEventManager::endOfData()
00612 {
00613 running_ = false;
00614 init_fragment_.reset(nullptr);
00615 TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
00616 restart_art_ = false;
00617
00618 size_t initialStoreSize = GetIncompleteEventCount();
00619 TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
00620 << " stale events from the SharedMemoryEventManager.";
00621 int counter = initialStoreSize;
00622 while (active_buffers_.size() > 0 && counter > 0)
00623 {
00624 complete_buffer_(*active_buffers_.begin());
00625 counter--;
00626 }
00627 TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00628 << " stale events in the SharedMemoryEventManager.";
00629
00630
00631 TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
00632 auto start = std::chrono::steady_clock::now();
00633 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00634 auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1);
00635
00636 auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
00637
00638
00639 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
00640 {
00641 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00642 if (temp != lastReadCount)
00643 {
00644 TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
00645 lastReadCount = temp;
00646 start = std::chrono::steady_clock::now();
00647 }
00648 if (lastReadCount > 0) {
00649 TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
00650 usleep(outstanding_buffer_wait_time);
00651 }
00652 }
00653
00654 TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
00655 << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
00656
00657 TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
00658 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
00659 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00660 if (!success)
00661 {
00662 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
00663 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00664 {
00665 broadcasts_.MarkBufferEmpty(ii, true);
00666 }
00667 broadcastFragment_(std::move(outFrag), outFrag);
00668 }
00669 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
00670
00671 if (get_art_process_count_() > 0)
00672 {
00673 TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
00674 if (end_of_data_wait_us == 0)
00675 {
00676 TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
00677 end_of_data_wait_us = 100 * 1000000;
00678 }
00679
00680 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
00681 for (size_t ii = 0; ii < sleep_count; ++ii)
00682 {
00683 usleep(10000);
00684 if (get_art_process_count_() == 0) break;
00685 }
00686 }
00687
00688 while (get_art_process_count_() > 0)
00689 {
00690 TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
00691
00692 ShutdownArtProcesses(art_processes_);
00693 }
00694 TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
00695
00696 ResetAttachedCount();
00697
00698 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
00699 for (size_t ii = 0; ii < size(); ++ii)
00700 {
00701 MarkBufferEmpty(ii, true);
00702 }
00703
00704
00705
00706
00707
00708
00709 released_incomplete_events_.clear();
00710
00711 TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestReceiver";
00712 requests_.reset(nullptr);
00713
00714 TLOG(TLVL_DEBUG) << "endOfData END";
00715 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
00716 return true;
00717 }
00718
00719 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00720 {
00721 running_ = true;
00722 init_fragment_.reset(nullptr);
00723 TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
00724 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00725 {
00726 broadcasts_.MarkBufferEmpty(ii, true);
00727 }
00728 StartArt();
00729 run_id_ = runID;
00730 subrun_id_ = 1;
00731 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00732 last_released_event_ = 0;
00733 run_event_count_ = 0;
00734 run_incomplete_event_count_ = 0;
00735 subrun_event_count_ = 0;
00736 subrun_incomplete_event_count_ = 0;
00737 requests_.reset(new RequestSender(data_pset_));
00738 if (requests_)
00739 {
00740 requests_->SendRoutingToken(queue_size_, run_id_);
00741 }
00742 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00743 << ", max queue size = "
00744 << queue_size_
00745 << ", queue size = "
00746 << GetLockedBufferCount();
00747 if (metricMan)
00748 {
00749 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00750 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00751 }
00752 }
00753
00754 void artdaq::SharedMemoryEventManager::startSubrun()
00755 {
00756 ++subrun_id_;
00757 subrun_event_count_ = 0;
00758 subrun_incomplete_event_count_ = 0;
00759 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00760 if (metricMan)
00761 {
00762 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00763 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00764 }
00765 }
00766
00767 bool artdaq::SharedMemoryEventManager::endRun()
00768 {
00769 TLOG(TLVL_INFO) << "Ending run " << run_id_;
00770 FragmentPtr endOfRunFrag(new
00771 Fragment(static_cast<size_t>
00772 (ceil(sizeof(my_rank) /
00773 static_cast<double>(sizeof(Fragment::value_type))))));
00774
00775 TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
00776 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00777 *endOfRunFrag->dataBegin() = my_rank;
00778 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00779
00780 TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
00781 run_event_count_ = 0;
00782 run_incomplete_event_count_ = 0;
00783 subrun_event_count_ = 0;
00784 subrun_incomplete_event_count_ = 0;
00785 oversize_fragment_count_ = 0;
00786 return true;
00787 }
00788
00789 bool artdaq::SharedMemoryEventManager::endSubrun()
00790 {
00791 TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
00792 std::unique_ptr<artdaq::Fragment>
00793 endOfSubrunFrag(new
00794 Fragment(static_cast<size_t>
00795 (ceil(sizeof(my_rank) /
00796 static_cast<double>(sizeof(Fragment::value_type))))));
00797
00798 TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
00799 endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
00800 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00801 *endOfSubrunFrag->dataBegin() = my_rank;
00802
00803 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00804
00805 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
00806 subrun_event_count_ = 0;
00807 subrun_incomplete_event_count_ = 0;
00808
00809 return true;
00810 }
00811
00812 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
00813 {
00814
00815 if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
00816
00817 if (boundary < last_released_event_)
00818 {
00819 auto logLevel = TLVL_ERROR;
00820 bool processAnyway = false;
00821 if (last_released_event_ - boundary < 100)
00822 {
00823 logLevel = TLVL_WARNING;
00824 processAnyway = true;
00825 }
00826 TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (last_released_event="
00827 << last_released_event_ << ",requested_rollover_boundary=" << boundary << ").";
00828 if (!processAnyway) return;
00829 }
00830 TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
00831
00832
00833
00834
00835
00836
00837
00838
00839 if (boundary == last_released_event_ + 1) {
00840 TLOG(TLVL_INFO) << "rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
00841 ", boundary is sequence id " << boundary << ", so will start a new subrun here";
00842 endSubrun();
00843 startSubrun();
00844 subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
00845 }
00846 else {
00847 subrun_rollover_event_ = boundary;
00848 }
00849 }
00850
00851 void artdaq::SharedMemoryEventManager::sendMetrics()
00852 {
00853 if (metricMan)
00854 {
00855 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00856 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00857 }
00858
00859 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00860 {
00861 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00862 return;
00863
00864 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00865 std::ostringstream oss;
00866 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00867 for (auto& ev : active_buffers_)
00868 {
00869 auto hdr = getEventHeader_(ev);
00870 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00871 }
00872 TLOG(TLVL_DEBUG) << oss.str();
00873 }
00874 }
00875
00876 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00877 {
00878 TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
00879 auto buffer = broadcasts_.GetBufferForWriting(false);
00880 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
00881 auto start_time = std::chrono::steady_clock::now();
00882 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00883 {
00884 usleep(10000);
00885 buffer = broadcasts_.GetBufferForWriting(false);
00886 }
00887 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
00888 if (buffer == -1)
00889 {
00890 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
00891 outFrag.swap(frag);
00892 return false;
00893 }
00894
00895 TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
00896 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00897 hdr->run_id = run_id_;
00898 hdr->subrun_id = subrun_id_;
00899 hdr->sequence_id = frag->sequenceID();
00900 hdr->is_complete = true;
00901 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00902
00903 TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
00904 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00905
00906 TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
00907 broadcasts_.MarkBufferFull(buffer, -1);
00908 outFrag.swap(frag);
00909 TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
00910 return true;
00911 }
00912
00913 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00914 {
00915 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00916 }
00917
00918 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00919 {
00920 TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
00921 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00922
00923 TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
00924
00925 auto buffers = GetBuffersOwnedByManager();
00926 for (auto& buf : buffers)
00927 {
00928 auto hdr = getEventHeader_(buf);
00929 if (hdr->sequence_id == seqID)
00930 {
00931 TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
00932 return buf;
00933 }
00934 }
00935
00936 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00937 if (released_incomplete_events_.count(seqID))
00938 {
00939 TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
00940 return -2;
00941 }
00942 #endif
00943
00944 if (!create_new) return -1;
00945
00946 check_pending_buffers_(lk);
00947 int new_buffer = GetBufferForWriting(false);
00948
00949 if (new_buffer == -1)
00950 {
00951 new_buffer = GetBufferForWriting(overwrite_mode_);
00952 }
00953
00954 if (new_buffer == -1) return -1;
00955 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
00956 std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
00957 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
00958
00959 auto hdr = getEventHeader_(new_buffer);
00960 hdr->is_complete = false;
00961 hdr->run_id = run_id_;
00962 hdr->subrun_id = subrun_id_;
00963 hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
00964 hdr->sequence_id = seqID;
00965 buffer_writes_pending_[new_buffer] = 0;
00966 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00967 SetMFIteration("Sequence ID " + std::to_string(seqID));
00968
00969 TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
00970 active_buffers_.insert(new_buffer);
00971 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
00972 << size() << ","
00973 << ReadReadyCount() << ","
00974 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
00975 << WriteReadyCount(false) << ","
00976 << pending_buffers_.size() << ","
00977 << active_buffers_.size() << ")";
00978
00979 if (requests_)
00980 {
00981 if (timestamp != Fragment::InvalidTimestamp)
00982 {
00983 requests_->AddRequest(seqID, timestamp);
00984 }
00985
00986
00987 else
00988 {
00989 requests_->SendRequest();
00990 }
00991 }
00992 TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
00993 return new_buffer;
00994 }
00995
00996 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00997 {
00998 if (buffer == -1) return true;
00999 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
01000 {
01001 return true;
01002 }
01003 ResetReadPos(buffer);
01004 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
01005 return MoreDataInBuffer(buffer);
01006 }
01007
01008 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
01009 {
01010 auto hdr = getEventHeader_(buffer);
01011 if (hdr->is_complete)
01012 {
01013 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
01014
01015 {
01016 TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
01017
01018 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
01019 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
01020 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
01021 active_buffers_.erase(buffer);
01022 pending_buffers_.insert(buffer);
01023
01024 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01025 << size() << ","
01026 << ReadReadyCount() << ","
01027 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01028 << WriteReadyCount(false) << ","
01029 << pending_buffers_.size() << ","
01030 << active_buffers_.size() << ")";
01031 }
01032 if (requests_)
01033 {
01034 requests_->RemoveRequest(hdr->sequence_id);
01035 }
01036 }
01037 CheckPendingBuffers();
01038 }
01039
01040 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
01041 {
01042 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
01043 }
01044
01045 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
01046 {
01047 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
01048 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
01049 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
01050 check_pending_buffers_(lk);
01051 }
01052
01053 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
01054 {
01055 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
01056
01057 auto buffers = GetBuffersOwnedByManager();
01058 for (auto buf : buffers)
01059 {
01060 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
01061 {
01062 TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
01063 auto hdr = getEventHeader_(buf);
01064 if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
01065 {
01066 if (requests_)
01067 {
01068 requests_->RemoveRequest(hdr->sequence_id);
01069 }
01070 TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
01071 active_buffers_.erase(buf);
01072 pending_buffers_.insert(buf);
01073 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01074 << size() << ","
01075 << ReadReadyCount() << ","
01076 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01077 << WriteReadyCount(false) << ","
01078 << pending_buffers_.size() << ","
01079 << active_buffers_.size() << ")";
01080
01081 subrun_incomplete_event_count_++;
01082 run_incomplete_event_count_++;
01083 if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
01084 if (!released_incomplete_events_.count(hdr->sequence_id))
01085 {
01086 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
01087 }
01088 else
01089 {
01090 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
01091 }
01092 TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
01093 }
01094
01095 }
01096 }
01097
01098 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
01099 sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
01100
01101 auto counter = 0;
01102 double eventSize = 0;
01103 for (auto buf : sorted_buffers)
01104 {
01105 auto hdr = getEventHeader_(buf);
01106
01107 if (hdr->sequence_id >= subrun_rollover_event_)
01108 {
01109 TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << "), last released event is " << last_released_event_ << ".";
01110 endSubrun();
01111 startSubrun();
01112 }
01113 if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
01114
01115 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
01116 << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
01117
01118 TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
01119 MarkBufferFull(buf);
01120 subrun_event_count_++;
01121 run_event_count_++;
01122 counter++;
01123 eventSize += BufferDataSize(buf);
01124 pending_buffers_.erase(buf);
01125 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01126 << size() << ","
01127 << ReadReadyCount() << ","
01128 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01129 << WriteReadyCount(false) << ","
01130 << pending_buffers_.size() << ","
01131 << active_buffers_.size() << ")";
01132 }
01133
01134 if (requests_)
01135 {
01136 TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
01137 auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
01138 auto available_buffers = WriteReadyCount(overwrite_mode_);
01139
01140 TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
01141 << ", tokens_to_send: " << available_buffers - outstanding_tokens;
01142
01143 if (available_buffers > outstanding_tokens)
01144 {
01145 auto tokens_to_send = available_buffers - outstanding_tokens;
01146
01147 while (tokens_to_send > 0)
01148 {
01149 TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
01150 requests_->SendRoutingToken(1, run_id_);
01151 tokens_to_send--;
01152 }
01153 }
01154 }
01155
01156 metric_data_.event_count += counter;
01157 metric_data_.event_size += eventSize;
01158
01159 if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500)
01160 {
01161 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
01162 metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
01163 if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
01164 metric_data_ = MetricData();
01165
01166 metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
01167 metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
01168 metricMan->sendMetric("Events Released to art this subrun", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
01169 metricMan->sendMetric("Incomplete Events Released to art this subrun", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
01170 if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
01171
01172 auto bufferReport = GetBufferReport();
01173 int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
01174 int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
01175 int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
01176 int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
01177 auto total = size();
01178 TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
01179
01180 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
01181 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
01182 metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
01183 metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
01184 if (total > 0)
01185 {
01186 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
01187 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
01188 }
01189
01190 last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
01191 }
01192 TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
01193 }
01194
01195 void artdaq::SharedMemoryEventManager::send_init_frag_()
01196 {
01197 if (init_fragment_ != nullptr)
01198 {
01199 TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
01200
01201 #if 0
01202 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
01203 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
01204 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
01205 ostream.close();
01206 #endif
01207
01208 broadcastFragment_(std::move(init_fragment_), init_fragment_);
01209 TLOG(TLVL_TRACE) << "Init Fragment sent";
01210 }
01211 else if (send_init_fragments_)
01212 {
01213 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
01214 }
01215 }
01216
01217 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
01218 {
01219 if (!init_fragment_ || init_fragment_ == nullptr)
01220 {
01221 init_fragment_.swap(frag);
01222 send_init_frag_();
01223 }
01224 }
01225
01226 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
01227 {
01228 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
01229 if (art_pset != current_art_pset_ || !current_art_config_file_)
01230 {
01231 current_art_pset_ = art_pset;
01232 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
01233 }
01234 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
01235 }
01236
01237 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
01238 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
01239 #endif