00001
// TRACE_NAME is defined ahead of the includes below; presumably the TRACE/TLOG
// macros pulled in by those headers use it to label messages from this file
// (NOTE(review): confirm against the TRACE headers).
#define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()

#include "artdaq/DAQrate/SharedMemoryEventManager.hh"
#include "artdaq-core/Core/StatisticsCollection.hh"
#include "artdaq-core/Utilities/TraceLock.hh"
#include <sys/wait.h>

// Numeric trace levels passed to TLOG in this file; TLVL_BUFLCK tags the
// messages logged around buffer_mutexes_ lock acquisition.
#define TLVL_BUFFER 40
#define TLVL_BUFLCK 41

// Out-of-class definition of the static mutex member.
std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00013
00014 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00015 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00016 pset.get<size_t>("buffer_count"),
00017 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00018 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00019 !pset.get<bool>("broadcast_mode", false))
00020 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00021 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00022 , queue_size_(pset.get<size_t>("buffer_count"))
00023 , run_id_(0)
00024 , subrun_id_(0)
00025 , subrun_rollover_event_(Fragment::InvalidSequenceID)
00026 , last_released_event_(0)
00027 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00028 , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
00029 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00030 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00031 , running_(false)
00032 , buffer_writes_pending_()
00033 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00034 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00035 , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
00036 , metric_data_()
00037 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00038 , run_event_count_(0)
00039 , run_incomplete_event_count_(0)
00040 , subrun_event_count_(0)
00041 , subrun_incomplete_event_count_(0)
00042 , oversize_fragment_count_(0)
00043 , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
00044 , art_processes_()
00045 , restart_art_(false)
00046 , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
00047 , manual_art_(pset.get<bool>("manual_art", false))
00048 , current_art_pset_(art_pset)
00049 , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
00050 , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
00051 , requests_(nullptr)
00052 , data_pset_(pset)
00053 , dropped_data_()
00054 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00055 pset.get<size_t>("broadcast_buffer_count", 10),
00056 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00057 pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
00058 {
00059 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00060 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00061
00062 if (pset.get<bool>("use_art", true) == false)
00063 {
00064 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
00065 num_art_processes_ = 0;
00066 }
00067 else
00068 {
00069 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
00070 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
00071 }
00072 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00073
00074 if (overwrite_mode_ && num_art_processes_ > 0)
00075 {
00076 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
00077 }
00078 else if (overwrite_mode_)
00079 {
00080 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
00081 }
00082
00083 for (size_t ii = 0; ii < size(); ++ii)
00084 {
00085 buffer_writes_pending_[ii] = 0;
00086 }
00087
00088 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00089
00090 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
00091 SetRank(my_rank);
00092 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
00093
00094
00095 TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
00096 }
00097
00098 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00099 {
00100 TLOG(TLVL_TRACE) << "DESTRUCTOR";
00101 if (running_) endOfData();
00102 TLOG(TLVL_TRACE) << "Destructor END";
00103 }
00104
00105 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00106 {
00107 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
00108 << ", sequence_id=" << frag.sequence_id;
00109 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00110 TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
00111 if (buffer == -1) return false;
00112 if (buffer == -2)
00113 {
00114 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
00115 return true;
00116 }
00117
00118 auto hdr = getEventHeader_(buffer);
00119 if (update_run_ids_)
00120 {
00121 hdr->run_id = run_id_;
00122 hdr->subrun_id = subrun_id_;
00123 }
00124
00125 TLOG(TLVL_TRACE) << "AddFragment before Write calls";
00126 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00127
00128 TLOG(TLVL_TRACE) << "Checking for complete event";
00129 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00130 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00131 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00132 << ", fragmentCount=" << fragmentCount
00133 << ", num_fragments_per_event=" << num_fragments_per_event_
00134 << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
00135
00136 complete_buffer_(buffer);
00137 if (requests_) requests_->SendRequest(true);
00138
00139 TLOG(TLVL_TRACE) << "AddFragment END";
00140 return true;
00141 }
00142
00143 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00144 {
00145 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
00146 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00147 auto data = frag->headerAddress();
00148 auto start = std::chrono::steady_clock::now();
00149 bool sts = false;
00150 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00151 {
00152 sts = AddFragment(hdr, data);
00153 if (!sts) usleep(1000);
00154 }
00155 if (!sts)
00156 {
00157 outfrag = std::move(frag);
00158 }
00159 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
00160 return sts;
00161 }
00162
00163 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00164 {
00165 TLOG(14) << "WriteFragmentHeader BEGIN";
00166 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00167
00168 if (buffer < 0)
00169 {
00170 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00171 if (buffer == -2)
00172 {
00173 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
00174 }
00175 else
00176 {
00177 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
00178 }
00179 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
00180
00181 TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
00182 return dropped_data_[frag.fragment_id]->dataBegin();
00183 }
00184
00185
00186 buffer_writes_pending_[buffer]++;
00187
00188 if (metricMan)
00189 {
00190 metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
00191 }
00192
00193 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
00194
00195 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
00196
00197 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
00198
00199
00200 auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00201 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00202
00203 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00204 if (frag.word_count - frag.num_words() > 0)
00205 {
00206 auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00207
00208 if (!sts)
00209 {
00210 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
00211 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
00212 TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
00213 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
00214
00215 oversize_fragment_count_++;
00216
00217 if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
00218 {
00219 throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
00220 }
00221
00222 TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
00223 return dropped_data_[frag.fragment_id]->dataBegin();
00224 }
00225 }
00226 TLOG(14) << "WriteFragmentHeader END";
00227 return pos;
00228
00229 }
00230
00231 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00232 {
00233 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
00234 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00235 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00236 if (buffer == -2) { return; }
00237
00238 {
00239 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
00240
00241 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
00242
00243 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
00244
00245
00246
00247 TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
00248 auto hdr = getEventHeader_(buffer);
00249 if (update_run_ids_)
00250 {
00251 hdr->run_id = run_id_;
00252 hdr->subrun_id = subrun_id_;
00253 }
00254
00255 buffer_writes_pending_[buffer]--;
00256 if (buffer_writes_pending_[buffer] != 0)
00257 {
00258 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
00259 return;
00260 }
00261 TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
00262 auto frag_count = GetFragmentCount(frag.sequence_id);
00263 hdr->is_complete = frag_count == num_fragments_per_event_;
00264 TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
00265 #if ART_SUPPORTS_DUPLICATE_EVENTS
00266 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
00267 {
00268 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00269 }
00270 #endif
00271 }
00272
00273 complete_buffer_(buffer);
00274 if (requests_) requests_->SendRequest(true);
00275 TLOG(TLVL_TRACE) << "DoneWritingFragment END";
00276 }
00277
00278 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00279 {
00280 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00281 }
00282
00283 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00284 {
00285 if (buffer == -1) return 0;
00286 ResetReadPos(buffer);
00287 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00288
00289 size_t count = 0;
00290
00291 while (MoreDataInBuffer(buffer))
00292 {
00293 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00294 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00295 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00296 TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
00297 ++count;
00298 }
00299
00300 return count;
00301 }
00302
00303 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
00304 {
00305 do
00306 {
00307 auto start_time = std::chrono::steady_clock::now();
00308 send_init_frag_();
00309 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
00310
00311 pid_t pid = 0;
00312
00313 if (!manual_art_)
00314 {
00315 char* filename = new char[config_file->getFileName().length() + 1];
00316 strcpy(filename, config_file->getFileName().c_str());
00317
00318 std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
00319 pid = fork();
00320 if (pid == 0)
00321 {
00322
00323
00324
00325
00326 std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
00327 std::string envVarValue = std::to_string(GetPartitionNumber());
00328 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
00329 {
00330 TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
00331 << "\" in the environment of a child art process. "
00332 << "This may result in incorrect TCP port number "
00333 << "assignments or other issues, and data may "
00334 << "not flow through the system correctly.";
00335 }
00336
00337 execvp("art", &args[0]);
00338 delete[] filename;
00339 exit(1);
00340 }
00341 delete[] filename;
00342 }
00343 else
00344 {
00345
00346 std::cout << "Please run the following command in a separate terminal:" << std::endl
00347 << "art -c " << config_file->getFileName() << std::endl
00348 << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
00349 << "Finally, return to this window and enter the pid: " << std::endl;
00350 std::cin >> pid;
00351 }
00352 *pid_out = pid;
00353
00354 TLOG(TLVL_INFO) << "PID of new art process is " << pid;
00355 {
00356 std::unique_lock<std::mutex> lk(art_process_mutex_);
00357 art_processes_.insert(pid);
00358 }
00359 siginfo_t status;
00360 auto sts = waitid(P_PID, pid, &status, WEXITED);
00361 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
00362 {
00363 std::unique_lock<std::mutex> lk(art_process_mutex_);
00364 art_processes_.erase(pid);
00365 }
00366 if (sts < 0)
00367 {
00368 TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
00369 }
00370 else if (status.si_code == CLD_EXITED && status.si_status == 0)
00371 {
00372 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
00373 }
00374 else
00375 {
00376 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
00377 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
00378
00379 auto exit_type = "exited with status code";
00380 switch (status.si_code)
00381 {
00382 case CLD_DUMPED:
00383 case CLD_KILLED:
00384 exit_type = "was killed with signal";
00385 break;
00386 case CLD_EXITED:
00387 default:
00388 break;
00389 }
00390
00391 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
00392 << "art process " << pid << " " << exit_type << " " << status.si_status
00393 << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
00394 << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
00395 << (restart_art_ ? "restarting" : "not restarting");
00396 }
00397 } while (restart_art_);
00398 }
00399
00400 void artdaq::SharedMemoryEventManager::StartArt()
00401 {
00402 restart_art_ = always_restart_art_;
00403 if (num_art_processes_ == 0) return;
00404 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00405 {
00406 StartArtProcess(current_art_pset_);
00407 }
00408 }
00409
00410 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00411 {
00412 static std::mutex start_art_mutex;
00413 std::unique_lock<std::mutex> lk(start_art_mutex);
00414
00415 restart_art_ = always_restart_art_;
00416 auto initialCount = GetAttachedCount();
00417 auto startTime = std::chrono::steady_clock::now();
00418
00419 if (pset != current_art_pset_ || !current_art_config_file_)
00420 {
00421 current_art_pset_ = pset;
00422 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00423 }
00424 std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
00425 boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
00426 thread.detach();
00427
00428 auto currentCount = GetAttachedCount() - initialCount;
00429 while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
00430 {
00431 usleep(10000);
00432 currentCount = GetAttachedCount() - initialCount;
00433 }
00434 if ((currentCount < 1 || *pid <= 0) && manual_art_)
00435 {
00436 TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
00437 return 0;
00438 }
00439 else if (currentCount < 1 || *pid <= 0)
00440 {
00441 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00442 << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
00443 return 0;
00444 }
00445 else
00446 {
00447 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00448 << TimeUtils::GetElapsedTime(startTime) << " seconds.";
00449
00450 return *pid;
00451 }
00452
00453 }
00454
00455 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
00456 {
00457 restart_art_ = false;
00458
00459
00460
00461 auto check_pids = [&](bool print) {
00462
00463 for (auto pid = pids.begin(); pid != pids.end();)
00464 {
00465
00466
00467 std::unique_lock<std::mutex> lk(art_process_mutex_);
00468 if (*pid <= 0)
00469 {
00470 TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
00471 << ") from the shutdown list.";
00472 pid = pids.erase(pid);
00473 }
00474 else if (kill(*pid, 0) < 0)
00475 {
00476 pid = pids.erase(pid);
00477 }
00478 else
00479 {
00480 if (print) std::cout << *pid << " ";
00481 ++pid;
00482 }
00483 }
00484 };
00485 check_pids(false);
00486 if (pids.size() == 0)
00487 {
00488 TLOG(14) << "All art processes already exited, nothing to do.";
00489 usleep(1000);
00490 return;
00491 }
00492
00493 if (!manual_art_)
00494 {
00495 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
00496 for (auto pid : pids)
00497 {
00498 TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
00499 kill(pid, SIGQUIT);
00500 }
00501
00502 int graceful_wait_ms = 5000;
00503 int int_wait_ms = 1000;
00504
00505 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
00506 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00507 {
00508 usleep(1000);
00509
00510 check_pids(false);
00511 if (pids.size() == 0)
00512 {
00513 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00514 return;
00515 }
00516 }
00517
00518 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
00519 for (auto pid : pids)
00520 {
00521 kill(pid, SIGINT);
00522 }
00523
00524 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
00525 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00526 {
00527 usleep(1000);
00528
00529 check_pids(false);
00530
00531 if (pids.size() == 0)
00532 {
00533 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00534 return;
00535 }
00536 }
00537
00538 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
00539 while (pids.size() > 0)
00540 {
00541 kill(*pids.begin(), SIGKILL);
00542 usleep(1000);
00543
00544 check_pids(false);
00545 }
00546 }
00547 else
00548 {
00549 std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
00550 while (pids.size() > 0)
00551 {
00552 std::cout << "The following PIDs are running: ";
00553 check_pids(true);
00554 std::cout << std::endl;
00555 std::string ignored;
00556 std::cin >> ignored;
00557 }
00558 }
00559 }
00560
00561 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00562 {
00563 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
00564 if (restart_art_ || !always_restart_art_)
00565 {
00566 endOfData();
00567 }
00568 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00569 {
00570 broadcasts_.MarkBufferEmpty(ii, true);
00571 }
00572 if (newRun == 0) newRun = run_id_ + 1;
00573
00574 if (art_pset != current_art_pset_ || !current_art_config_file_)
00575 {
00576 current_art_pset_ = art_pset;
00577 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00578 }
00579
00580 if (n_art_processes != -1)
00581 {
00582 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
00583 num_art_processes_ = n_art_processes;
00584 }
00585 startRun(newRun);
00586 TLOG(TLVL_DEBUG) << "ReconfigureArt END";
00587 }
00588
00589 bool artdaq::SharedMemoryEventManager::endOfData()
00590 {
00591 running_ = false;
00592 init_fragment_.reset(nullptr);
00593 TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
00594 restart_art_ = false;
00595
00596 size_t initialStoreSize = GetIncompleteEventCount();
00597 TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
00598 << " stale events from the SharedMemoryEventManager.";
00599 int counter = initialStoreSize;
00600 while (active_buffers_.size() > 0 && counter > 0)
00601 {
00602 complete_buffer_(*active_buffers_.begin());
00603 counter--;
00604 }
00605 TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00606 << " stale events in the SharedMemoryEventManager.";
00607
00608
00609 TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
00610 auto start = std::chrono::steady_clock::now();
00611 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00612 auto end_of_data_wait_us = art_event_processing_time_us_ * lastReadCount;
00613
00614 auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
00615
00616
00617 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
00618 {
00619 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00620 if (temp != lastReadCount)
00621 {
00622 TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
00623 lastReadCount = temp;
00624 start = std::chrono::steady_clock::now();
00625 }
00626 if (lastReadCount > 0) {
00627 TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
00628 usleep(outstanding_buffer_wait_time);
00629 }
00630 }
00631
00632 TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
00633 << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
00634
00635 TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
00636 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
00637 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00638 if (!success)
00639 {
00640 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
00641 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00642 {
00643 broadcasts_.MarkBufferEmpty(ii, true);
00644 }
00645 broadcastFragment_(std::move(outFrag), outFrag);
00646 }
00647 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
00648
00649 if (get_art_process_count_() > 0)
00650 {
00651 TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
00652 if (end_of_data_wait_us == 0)
00653 {
00654 TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
00655 end_of_data_wait_us = 100 * 1000000;
00656 }
00657
00658 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
00659 for (size_t ii = 0; ii < sleep_count; ++ii)
00660 {
00661 usleep(10000);
00662 if (get_art_process_count_() == 0) break;
00663 }
00664 }
00665
00666 while (get_art_process_count_() > 0)
00667 {
00668 TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
00669
00670 ShutdownArtProcesses(art_processes_);
00671 }
00672 TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
00673
00674 ResetAttachedCount();
00675
00676 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
00677 for (size_t ii = 0; ii < size(); ++ii)
00678 {
00679 MarkBufferEmpty(ii, true);
00680 }
00681
00682
00683
00684
00685
00686
00687 released_incomplete_events_.clear();
00688
00689 TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestReceiver";
00690 requests_.reset(nullptr);
00691
00692 TLOG(TLVL_DEBUG) << "endOfData END";
00693 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
00694 return true;
00695 }
00696
00697 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00698 {
00699 running_ = true;
00700 init_fragment_.reset(nullptr);
00701 TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
00702 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00703 {
00704 broadcasts_.MarkBufferEmpty(ii, true);
00705 }
00706 StartArt();
00707 run_id_ = runID;
00708 subrun_id_ = 1;
00709 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00710 last_released_event_ = 0;
00711 requests_.reset(new RequestSender(data_pset_));
00712 if (requests_)
00713 {
00714 requests_->SendRoutingToken(queue_size_);
00715 }
00716 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00717 << ", max queue size = "
00718 << queue_size_
00719 << ", queue size = "
00720 << GetLockedBufferCount();
00721 if (metricMan)
00722 {
00723 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00724 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00725 }
00726 }
00727
00728 void artdaq::SharedMemoryEventManager::startSubrun()
00729 {
00730 ++subrun_id_;
00731 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00732 if (metricMan)
00733 {
00734 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00735 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00736 }
00737 }
00738
00739 bool artdaq::SharedMemoryEventManager::endRun()
00740 {
00741 TLOG(TLVL_INFO) << "Ending run " << run_id_;
00742 FragmentPtr endOfRunFrag(new
00743 Fragment(static_cast<size_t>
00744 (ceil(sizeof(my_rank) /
00745 static_cast<double>(sizeof(Fragment::value_type))))));
00746
00747 TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
00748 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00749 *endOfRunFrag->dataBegin() = my_rank;
00750 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00751
00752 TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
00753 run_event_count_ = 0;
00754 run_incomplete_event_count_ = 0;
00755 oversize_fragment_count_ = 0;
00756 return true;
00757 }
00758
00759 bool artdaq::SharedMemoryEventManager::endSubrun()
00760 {
00761 TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
00762 std::unique_ptr<artdaq::Fragment>
00763 endOfSubrunFrag(new
00764 Fragment(static_cast<size_t>
00765 (ceil(sizeof(my_rank) /
00766 static_cast<double>(sizeof(Fragment::value_type))))));
00767
00768 TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
00769 endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
00770 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00771 *endOfSubrunFrag->dataBegin() = my_rank;
00772
00773 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00774
00775 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
00776 subrun_event_count_ = 0;
00777 subrun_incomplete_event_count_ = 0;
00778
00779 return true;
00780 }
00781
00782 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
00783 {
00784
00785 if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
00786
00787 if (boundary < last_released_event_)
00788 {
00789 auto logLevel = TLVL_ERROR;
00790 bool processAnyway = false;
00791 if (last_released_event_ - boundary < 100)
00792 {
00793 logLevel = TLVL_WARNING;
00794 processAnyway = true;
00795 }
00796 TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (last_released_event="
00797 << last_released_event_ << ",requested_rollover_boundary=" << boundary << ").";
00798 if (!processAnyway) return;
00799 }
00800 TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
00801
00802
00803
00804
00805
00806
00807
00808
00809 if (boundary == last_released_event_ + 1) {
00810 TLOG(TLVL_INFO) << "rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
00811 ", boundary is sequence id " << boundary << ", so will start a new subrun here";
00812 endSubrun();
00813 startSubrun();
00814 subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
00815 }
00816 else {
00817 subrun_rollover_event_ = boundary;
00818 }
00819 }
00820
00821 void artdaq::SharedMemoryEventManager::sendMetrics()
00822 {
00823 if (metricMan)
00824 {
00825 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00826 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00827 }
00828
00829 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00830 {
00831 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00832 return;
00833
00834 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00835 std::ostringstream oss;
00836 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00837 for (auto& ev : active_buffers_)
00838 {
00839 auto hdr = getEventHeader_(ev);
00840 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00841 }
00842 TLOG(TLVL_DEBUG) << oss.str();
00843 }
00844 }
00845
00846 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00847 {
00848 TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
00849 auto buffer = broadcasts_.GetBufferForWriting(false);
00850 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
00851 auto start_time = std::chrono::steady_clock::now();
00852 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00853 {
00854 usleep(10000);
00855 buffer = broadcasts_.GetBufferForWriting(false);
00856 }
00857 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
00858 if (buffer == -1)
00859 {
00860 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
00861 outFrag.swap(frag);
00862 return false;
00863 }
00864
00865 TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
00866 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00867 hdr->run_id = run_id_;
00868 hdr->subrun_id = subrun_id_;
00869 hdr->sequence_id = frag->sequenceID();
00870 hdr->is_complete = true;
00871 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00872
00873 TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
00874 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00875
00876 TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
00877 broadcasts_.MarkBufferFull(buffer, -1);
00878 outFrag.swap(frag);
00879 TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
00880 return true;
00881 }
00882
00883 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00884 {
00885 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00886 }
00887
00888 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00889 {
00890 TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
00891 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00892
00893 TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
00894
00895 auto buffers = GetBuffersOwnedByManager();
00896 for (auto& buf : buffers)
00897 {
00898 auto hdr = getEventHeader_(buf);
00899 if (hdr->sequence_id == seqID)
00900 {
00901 TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
00902 return buf;
00903 }
00904 }
00905
00906 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00907 if (released_incomplete_events_.count(seqID))
00908 {
00909 TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
00910 return -2;
00911 }
00912 #endif
00913
00914 if (!create_new) return -1;
00915
00916 check_pending_buffers_(lk);
00917 int new_buffer = GetBufferForWriting(false);
00918
00919 if (new_buffer == -1)
00920 {
00921 new_buffer = GetBufferForWriting(overwrite_mode_);
00922 }
00923
00924 if (new_buffer == -1) return -1;
00925 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
00926 std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
00927 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
00928
00929 auto hdr = getEventHeader_(new_buffer);
00930 hdr->is_complete = false;
00931 hdr->run_id = run_id_;
00932 hdr->subrun_id = subrun_id_;
00933 hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
00934 hdr->sequence_id = seqID;
00935 buffer_writes_pending_[new_buffer] = 0;
00936 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00937 SetMFIteration("Sequence ID " + std::to_string(seqID));
00938
00939 TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
00940 active_buffers_.insert(new_buffer);
00941 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
00942 << size() << ","
00943 << ReadReadyCount() << ","
00944 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
00945 << WriteReadyCount(false) << ","
00946 << pending_buffers_.size() << ","
00947 << active_buffers_.size() << ")";
00948
00949 if (requests_)
00950 {
00951 if (timestamp != Fragment::InvalidTimestamp)
00952 {
00953 requests_->AddRequest(seqID, timestamp);
00954 }
00955
00956
00957 else
00958 {
00959 requests_->SendRequest();
00960 }
00961 }
00962 TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
00963 return new_buffer;
00964 }
00965
00966 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00967 {
00968 if (buffer == -1) return true;
00969 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
00970 {
00971 return true;
00972 }
00973 ResetReadPos(buffer);
00974 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00975 return MoreDataInBuffer(buffer);
00976 }
00977
00978 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
00979 {
00980 auto hdr = getEventHeader_(buffer);
00981 if (hdr->is_complete)
00982 {
00983 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
00984
00985 {
00986 TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
00987
00988 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
00989 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00990 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
00991 active_buffers_.erase(buffer);
00992 pending_buffers_.insert(buffer);
00993
00994 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
00995 << size() << ","
00996 << ReadReadyCount() << ","
00997 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
00998 << WriteReadyCount(false) << ","
00999 << pending_buffers_.size() << ","
01000 << active_buffers_.size() << ")";
01001 }
01002 if (requests_)
01003 {
01004 requests_->RemoveRequest(hdr->sequence_id);
01005 }
01006 }
01007 CheckPendingBuffers();
01008 }
01009
01010 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
01011 {
01012 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
01013 }
01014
01015 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
01016 {
01017 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
01018 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
01019 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
01020 check_pending_buffers_(lk);
01021 }
01022
01023 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
01024 {
01025 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
01026
01027 auto buffers = GetBuffersOwnedByManager();
01028 for (auto buf : buffers)
01029 {
01030 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
01031 {
01032 TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
01033 auto hdr = getEventHeader_(buf);
01034 if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
01035 {
01036 if (requests_)
01037 {
01038 requests_->RemoveRequest(hdr->sequence_id);
01039 }
01040 TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
01041 active_buffers_.erase(buf);
01042 pending_buffers_.insert(buf);
01043 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01044 << size() << ","
01045 << ReadReadyCount() << ","
01046 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01047 << WriteReadyCount(false) << ","
01048 << pending_buffers_.size() << ","
01049 << active_buffers_.size() << ")";
01050
01051 subrun_incomplete_event_count_++;
01052 run_incomplete_event_count_++;
01053 if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
01054 if (!released_incomplete_events_.count(hdr->sequence_id))
01055 {
01056 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
01057 }
01058 else
01059 {
01060 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
01061 }
01062 TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
01063 }
01064
01065 }
01066 }
01067
01068 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
01069 sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
01070
01071 auto counter = 0;
01072 double eventSize = 0;
01073 for (auto buf : sorted_buffers)
01074 {
01075 auto hdr = getEventHeader_(buf);
01076
01077 if (hdr->sequence_id >= subrun_rollover_event_)
01078 {
01079 TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << "), last released event is " << last_released_event_ << ".";
01080 endSubrun();
01081 startSubrun();
01082 }
01083 if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
01084
01085 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
01086 << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
01087
01088 TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
01089 MarkBufferFull(buf);
01090 subrun_event_count_++;
01091 run_event_count_++;
01092 counter++;
01093 eventSize += BufferDataSize(buf);
01094 pending_buffers_.erase(buf);
01095 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
01096 << size() << ","
01097 << ReadReadyCount() << ","
01098 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
01099 << WriteReadyCount(false) << ","
01100 << pending_buffers_.size() << ","
01101 << active_buffers_.size() << ")";
01102 }
01103
01104 if (requests_)
01105 {
01106 auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
01107 auto available_buffers = WriteReadyCount(overwrite_mode_);
01108
01109 TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
01110 << ", tokens_to_send: " << available_buffers - outstanding_tokens;
01111
01112 if (available_buffers > outstanding_tokens)
01113 {
01114 auto tokens_to_send = available_buffers - outstanding_tokens;
01115
01116 while (tokens_to_send > 0)
01117 {
01118 TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
01119 requests_->SendRoutingToken(1);
01120 tokens_to_send--;
01121 }
01122 }
01123 }
01124
01125 metric_data_.event_count += counter;
01126 metric_data_.event_size += eventSize;
01127
01128 if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500)
01129 {
01130 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
01131 metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
01132 if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
01133 metric_data_ = MetricData();
01134
01135 metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
01136 metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
01137 metricMan->sendMetric("Events Released to art this subrun", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
01138 metricMan->sendMetric("Incomplete Events Released to art this subrun", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
01139 if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
01140
01141 auto bufferReport = GetBufferReport();
01142 int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
01143 int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
01144 int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
01145 int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
01146 auto total = size();
01147 TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
01148
01149 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
01150 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
01151 metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
01152 metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
01153 if (total > 0)
01154 {
01155 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
01156 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
01157 }
01158
01159 last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
01160 }
01161 TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
01162 }
01163
01164 void artdaq::SharedMemoryEventManager::send_init_frag_()
01165 {
01166 if (init_fragment_ != nullptr)
01167 {
01168 TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
01169
01170 #if 0
01171 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
01172 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
01173 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
01174 ostream.close();
01175 #endif
01176
01177 broadcastFragment_(std::move(init_fragment_), init_fragment_);
01178 TLOG(TLVL_TRACE) << "Init Fragment sent";
01179 }
01180 else if (send_init_fragments_)
01181 {
01182 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
01183 }
01184 }
01185
01186 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
01187 {
01188 if (!init_fragment_ || init_fragment_ == nullptr)
01189 {
01190 init_fragment_.swap(frag);
01191 send_init_frag_();
01192 }
01193 }
01194
01195 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
01196 {
01197 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
01198 if (art_pset != current_art_pset_ || !current_art_config_file_)
01199 {
01200 current_art_pset_ = art_pset;
01201 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
01202 }
01203 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
01204 }
01205
// Invoke FHICL_PROVIDE_ALLOWED_CONFIGURATION for SharedMemoryEventManager, but only
// when MESSAGEFACILITY_HEX_VERSION is at least 0x20103.
// NOTE(review): presumably older messagefacility releases do not provide this macro — confirm.
01206 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
01207 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
01208 #endif