00001
00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
00003
00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
00005 #include "artdaq-core/Core/StatisticsCollection.hh"
00006 #include "artdaq-core/Utilities/TraceLock.hh"
00007 #include <sys/wait.h>
00008
00009 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00010
00011 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00012 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00013 pset.get<size_t>("buffer_count"),
00014 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00015 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00016 !pset.get<bool>("broadcast_mode", false))
00017 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00018 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00019 , queue_size_(pset.get<size_t>("buffer_count"))
00020 , run_id_(0)
00021 , subrun_id_(0)
00022 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00023 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00024 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00025 , running_(false)
00026 , buffer_writes_pending_()
00027 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00028 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00029 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00030 , run_event_count_(0)
00031 , run_incomplete_event_count_(0)
00032 , subrun_event_count_(0)
00033 , subrun_incomplete_event_count_(0)
00034 , art_processes_()
00035 , restart_art_(false)
00036 , current_art_pset_(art_pset)
00037 , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
00038 , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
00039 , requests_(nullptr)
00040 , data_pset_(pset)
00041 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00042 pset.get<size_t>("broadcast_buffer_count", 10),
00043 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00044 pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
00045 {
00046 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00047 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00048
00049 if (pset.get<bool>("use_art", true) == false) {
00050 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
00051 num_art_processes_ = 0;
00052 }
00053 else {
00054 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
00055 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
00056 }
00057 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00058
00059 if (overwrite_mode_ && num_art_processes_ > 0)
00060 {
00061 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
00062 }
00063 else if (overwrite_mode_)
00064 {
00065 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
00066 }
00067
00068 for (size_t ii = 0; ii < size(); ++ii)
00069 {
00070 buffer_writes_pending_[ii] = 0;
00071 }
00072
00073 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00074
00075 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
00076 SetRank(my_rank);
00077 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
00078
00079
00080 TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
00081 }
00082
00083 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00084 {
00085 TLOG(TLVL_TRACE) << "DESTRUCTOR";
00086 if (running_) endOfData();
00087 TLOG(TLVL_TRACE) << "Destructor END";
00088 }
00089
00090 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00091 {
00092 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
00093 << ", sequence_id=" << std::to_string(frag.sequence_id);
00094 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00095 TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer);
00096 if (buffer == -1) return false;
00097 if (buffer == -2)
00098 {
00099 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id);
00100 return true;
00101 }
00102
00103 auto hdr = getEventHeader_(buffer);
00104 if (update_run_ids_)
00105 {
00106 hdr->run_id = run_id_;
00107 hdr->subrun_id = subrun_id_;
00108 }
00109
00110 TLOG(TLVL_TRACE) << "AddFragment before Write calls";
00111 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00112
00113 TLOG(TLVL_TRACE) << "Checking for complete event";
00114 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00115 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00116 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00117 << ", fragmentCount=" << std::to_string(fragmentCount)
00118 << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
00119 << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]);
00120
00121 complete_buffer_(buffer);
00122 if (requests_) requests_->SendRequest(true);
00123
00124 TLOG(TLVL_TRACE) << "AddFragment END";
00125 return true;
00126 }
00127
00128 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00129 {
00130 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
00131 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00132 auto data = frag->headerAddress();
00133 auto start = std::chrono::steady_clock::now();
00134 bool sts = false;
00135 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00136 {
00137 sts = AddFragment(hdr, data);
00138 if (!sts) usleep(1000);
00139 }
00140 if (!sts)
00141 {
00142 outfrag = std::move(frag);
00143 }
00144 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
00145 return sts;
00146 }
00147
00148 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00149 {
00150 TLOG(14) << "WriteFragmentHeader BEGIN";
00151 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00152
00153 if (buffer < 0)
00154 {
00155 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00156 if (buffer == -2)
00157 {
00158 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because data taking has already passed this event.";
00159 }
00160 else
00161 {
00162 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because there is no room in the queue and reliable mode is off.";
00163 }
00164 dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
00165 return dropped_data_->dataBegin();
00166 }
00167
00168 if (metricMan)
00169 {
00170 metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
00171 }
00172
00173 buffer_writes_pending_[buffer]++;
00174 TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
00175 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00176
00177 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00178 if (frag.word_count - frag.num_words() > 0) {
00179 IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00180 }
00181
00182 TLOG(14) << "WriteFragmentHeader END";
00183 return pos;
00184
00185 }
00186
00187 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00188 {
00189 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
00190 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00191 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00192 if (buffer == -2) return;
00193 TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
00194
00195 auto hdr = getEventHeader_(buffer);
00196 if (update_run_ids_)
00197 {
00198 hdr->run_id = run_id_;
00199 hdr->subrun_id = subrun_id_;
00200 }
00201
00202 buffer_writes_pending_[buffer]--;
00203 if (buffer_writes_pending_[buffer] != 0)
00204 {
00205 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
00206 return;
00207 }
00208 auto frag_count = GetFragmentCount(frag.sequence_id);
00209 hdr->is_complete = frag_count == num_fragments_per_event_;
00210 #if ART_SUPPORTS_DUPLICATE_EVENTS
00211 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id)) {
00212 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00213 }
00214 #endif
00215
00216 complete_buffer_(buffer);
00217 if (requests_) requests_->SendRequest(true);
00218 TLOG(TLVL_TRACE) << "DoneWritingFragment END";
00219 }
00220
00221 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00222 {
00223 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00224 }
00225
00226 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00227 {
00228 if (buffer == -1) return 0;
00229 ResetReadPos(buffer);
00230 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00231
00232 size_t count = 0;
00233
00234 while (MoreDataInBuffer(buffer))
00235 {
00236 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00237 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00238 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00239 TLOG(TLVL_TRACE) << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count";
00240 ++count;
00241 }
00242
00243 return count;
00244 }
00245
00246 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
00247 {
00248 while (restart_art_)
00249 {
00250 auto start_time = std::chrono::steady_clock::now();
00251 send_init_frag_();
00252 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
00253 std::vector<char*> args{ (char*)"art", (char*)"-c", &config_file->getFileName()[0], NULL };
00254
00255 auto pid = fork();
00256 if (pid == 0)
00257 {
00258 execvp("art", &args[0]);
00259 exit(1);
00260 }
00261 pid_out = pid;
00262
00263 TLOG(TLVL_INFO) << "PID of new art process is " << pid;
00264 art_processes_.insert(pid);
00265 siginfo_t status;
00266 auto sts = waitid(P_PID, pid, &status, WEXITED);
00267 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
00268 art_processes_.erase(pid);
00269 if (sts < 0) {
00270 TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
00271 }
00272 else if (status.si_code == CLD_EXITED && status.si_status == 0)
00273 {
00274 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
00275 }
00276 else
00277 {
00278 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
00279 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
00280
00281 auto exit_type = "exited with status code";
00282 switch (status.si_code) {
00283 case CLD_DUMPED:
00284 case CLD_KILLED:
00285 exit_type = "was killed with signal";
00286 break;
00287 case CLD_EXITED:
00288 default:
00289 break;
00290 }
00291
00292 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
00293 << "art process " << pid << " " << exit_type << " " << status.si_status
00294 << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
00295 << " after " << std::setprecision(2) << art_lifetime << " seconds, "
00296 << (restart_art_ ? "restarting" : "not restarting");
00297 }
00298 }
00299 }
00300
00301 void artdaq::SharedMemoryEventManager::StartArt()
00302 {
00303 restart_art_ = true;
00304 if (num_art_processes_ == 0) return;
00305 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00306 {
00307 StartArtProcess(current_art_pset_);
00308 }
00309 }
00310
00311 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00312 {
00313 static std::mutex start_art_mutex;
00314 TraceLock lk(start_art_mutex, 15, "StartArtLock");
00315 restart_art_ = true;
00316 auto initialCount = GetAttachedCount();
00317 auto startTime = std::chrono::steady_clock::now();
00318
00319 if (pset != current_art_pset_ || !current_art_config_file_)
00320 {
00321 current_art_pset_ = pset;
00322 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00323 }
00324 pid_t pid = -1;
00325 boost::thread thread([&] {RunArt(current_art_config_file_, pid); });
00326 thread.detach();
00327
00328
00329 while ((GetAttachedCount() - initialCount < 1 || pid <= 0)
00330 && TimeUtils::GetElapsedTime(startTime) < 5)
00331 {
00332 usleep(1000);
00333 }
00334 if (GetAttachedCount() - initialCount < 1 || pid <= 0)
00335 {
00336 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00337 << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")";
00338 return 0;
00339 }
00340 else
00341 {
00342 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00343 << TimeUtils::GetElapsedTime(startTime) << " seconds.";
00344
00345 return pid;
00346 }
00347
00348 }
00349
00350 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t> pids)
00351 {
00352 restart_art_ = false;
00353
00354
00355
00356 for (auto pid = pids.begin(); pid != pids.end();)
00357 {
00358 if (kill(*pid, 0) < 0)
00359 {
00360 pid = pids.erase(pid);
00361 }
00362 else {
00363 ++pid;
00364 }
00365 }
00366 if (pids.size() == 0)
00367 {
00368 TLOG(14) << "All art processes already exited, nothing to do.";
00369 usleep(1000);
00370 return;
00371 }
00372
00373 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
00374 for (auto pid : pids)
00375 {
00376 kill(pid, SIGQUIT);
00377 }
00378
00379 int graceful_wait_ms = 5000;
00380 int int_wait_ms = 1000;
00381
00382 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
00383 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00384 {
00385 usleep(1000);
00386
00387 for (auto pid = pids.begin(); pid != pids.end();)
00388 {
00389 if (kill(*pid, 0) < 0)
00390 {
00391 pid = pids.erase(pid);
00392 }
00393 else {
00394 ++pid;
00395 }
00396 }
00397 if (pids.size() == 0)
00398 {
00399 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00400 return;
00401 }
00402 }
00403
00404 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
00405 for (auto pid : pids)
00406 {
00407 kill(pid, SIGINT);
00408 }
00409
00410 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
00411 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00412 {
00413 usleep(1000);
00414
00415 for (auto pid = pids.begin(); pid != pids.end();)
00416 {
00417 if (kill(*pid, 0) < 0)
00418 {
00419 pid = pids.erase(pid);
00420 }
00421 else {
00422 ++pid;
00423 }
00424 }
00425
00426 if (pids.size() == 0)
00427 {
00428 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00429 return;
00430 }
00431 }
00432
00433 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
00434 while (pids.size() > 0)
00435 {
00436 kill(*pids.begin(), SIGKILL);
00437 usleep(1000);
00438
00439 for (auto pid = pids.begin(); pid != pids.end();)
00440 {
00441 if (kill(*pid, 0) < 0)
00442 {
00443 pid = pids.erase(pid);
00444 }
00445 else {
00446 ++pid;
00447 }
00448 }
00449 }
00450 }
00451
00452 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00453 {
00454 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
00455 if (restart_art_)
00456 {
00457 endOfData();
00458 }
00459 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00460 {
00461 broadcasts_.MarkBufferEmpty(ii, true);
00462 }
00463 if (newRun == 0) newRun = run_id_ + 1;
00464
00465 if (art_pset != current_art_pset_ || !current_art_config_file_) {
00466 current_art_pset_ = art_pset;
00467 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00468 }
00469
00470 if (n_art_processes != -1)
00471 {
00472 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
00473 num_art_processes_ = n_art_processes;
00474 }
00475 startRun(newRun);
00476 TLOG(TLVL_DEBUG) << "ReconfigureArt END";
00477 }
00478
00479 bool artdaq::SharedMemoryEventManager::endOfData()
00480 {
00481 init_fragment_.reset(nullptr);
00482 TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData";
00483 restart_art_ = false;
00484
00485 size_t initialStoreSize = GetIncompleteEventCount();
00486 TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
00487 << " stale events from the SharedMemoryEventManager.";
00488 int counter = initialStoreSize;
00489 while (active_buffers_.size() > 0 && counter > 0)
00490 {
00491 complete_buffer_(*active_buffers_.begin());
00492 counter--;
00493 }
00494 TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00495 << " stale events in the SharedMemoryEventManager.";
00496
00497
00498 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
00499 auto start = std::chrono::steady_clock::now();
00500 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00501 auto end_of_data_wait_us = art_event_processing_time_us_ * size();
00502
00503
00504 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && art_processes_.size() > 0)
00505 {
00506 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00507 if (temp != lastReadCount)
00508 {
00509 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers...";
00510 lastReadCount = temp;
00511 start = std::chrono::steady_clock::now();
00512 }
00513 if (lastReadCount > 0) usleep(art_event_processing_time_us_);
00514 }
00515 TLOG(TLVL_TRACE) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: " << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << art_processes_.size();
00516
00517 TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment";
00518 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
00519 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00520 if (!success)
00521 {
00522 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
00523 for (size_t ii = 0; ii < size(); ++ii)
00524 {
00525 broadcasts_.MarkBufferEmpty(ii, true);
00526 }
00527 broadcastFragment_(std::move(outFrag), outFrag);
00528 }
00529 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
00530
00531 if (art_processes_.size() > 0)
00532 {
00533 TLOG(TLVL_DEBUG) << "Allowing " << std::to_string(art_processes_.size()) << " art processes the chance to end gracefully";
00534 if (end_of_data_wait_us == 0)
00535 {
00536 TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
00537 end_of_data_wait_us = 100 * 1000000;
00538 }
00539
00540 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
00541 for (size_t ii = 0; ii < sleep_count; ++ii) {
00542 usleep(10000);
00543 if (art_processes_.size() == 0) break;
00544 }
00545 }
00546
00547 while (art_processes_.size() > 0)
00548 {
00549 TLOG(TLVL_DEBUG) << "There are " << std::to_string(art_processes_.size()) << " art processes remaining. Proceeding to shutdown.";
00550 ShutdownArtProcesses(art_processes_);
00551 }
00552 TLOG(TLVL_INFO) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
00553
00554 ResetAttachedCount();
00555
00556 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers";
00557 for (size_t ii = 0; ii < size(); ++ii)
00558 {
00559 MarkBufferEmpty(ii, true);
00560 }
00561 released_incomplete_events_.clear();
00562
00563 TLOG(TLVL_TRACE) << "endOfData: Shutting down RequestReceiver";
00564 requests_.reset(nullptr);
00565
00566 TLOG(TLVL_TRACE) << "endOfData END";
00567 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
00568 running_ = false;
00569 return true;
00570 }
00571
00572 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00573 {
00574 running_ = true;
00575 init_fragment_.reset(nullptr);
00576 StartArt();
00577 run_id_ = runID;
00578 subrun_id_ = 1;
00579 requests_.reset(new RequestSender(data_pset_));
00580 if (requests_) requests_->SendRoutingToken(queue_size_);
00581 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00582 << ", max queue size = "
00583 << queue_size_
00584 << ", queue size = "
00585 << GetLockedBufferCount();
00586 if (metricMan)
00587 {
00588 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00589 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00590 }
00591 }
00592
00593 void artdaq::SharedMemoryEventManager::startSubrun()
00594 {
00595 ++subrun_id_;
00596 if (metricMan)
00597 {
00598 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00599 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00600 }
00601 }
00602
00603 bool artdaq::SharedMemoryEventManager::endRun()
00604 {
00605 TLOG(TLVL_INFO) << "Ending run " << run_id_;
00606 FragmentPtr endOfRunFrag(new
00607 Fragment(static_cast<size_t>
00608 (ceil(sizeof(my_rank) /
00609 static_cast<double>(sizeof(Fragment::value_type))))));
00610
00611 TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
00612 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00613 *endOfRunFrag->dataBegin() = my_rank;
00614 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00615
00616 TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
00617 run_event_count_ = 0;
00618 run_incomplete_event_count_ = 0;
00619 return true;
00620 }
00621
00622 bool artdaq::SharedMemoryEventManager::endSubrun()
00623 {
00624 TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
00625 std::unique_ptr<artdaq::Fragment>
00626 endOfSubrunFrag(new
00627 Fragment(static_cast<size_t>
00628 (ceil(sizeof(my_rank) /
00629 static_cast<double>(sizeof(Fragment::value_type))))));
00630
00631 TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
00632 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00633 *endOfSubrunFrag->dataBegin() = my_rank;
00634
00635 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00636
00637 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
00638 subrun_event_count_ = 0;
00639 subrun_incomplete_event_count_ = 0;
00640
00641 return true;
00642 }
00643
00644 void artdaq::SharedMemoryEventManager::sendMetrics()
00645 {
00646 if (metricMan)
00647 {
00648 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00649 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00650 }
00651
00652 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00653 {
00654 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00655 return;
00656
00657 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00658 std::ostringstream oss;
00659 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00660 for (auto& ev : active_buffers_)
00661 {
00662 auto hdr = getEventHeader_(ev);
00663 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00664 }
00665 TLOG(TLVL_DEBUG) << oss.str();
00666 }
00667 }
00668
00669 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00670 {
00671 TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
00672 auto buffer = broadcasts_.GetBufferForWriting(false);
00673 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
00674 auto start_time = std::chrono::steady_clock::now();
00675 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00676 {
00677 usleep(10000);
00678 buffer = broadcasts_.GetBufferForWriting(false);
00679 }
00680 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
00681 if (buffer == -1)
00682 {
00683 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
00684 outFrag.swap(frag);
00685 return false;
00686 }
00687
00688 TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
00689 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00690 hdr->run_id = run_id_;
00691 hdr->subrun_id = subrun_id_;
00692 hdr->sequence_id = frag->sequenceID();
00693 hdr->is_complete = true;
00694 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00695
00696 TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
00697 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00698
00699 TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
00700 broadcasts_.MarkBufferFull(buffer, -1);
00701 outFrag.swap(frag);
00702 TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
00703 return true;
00704 }
00705
00706 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00707 {
00708 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00709 }
00710
00711 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00712 {
00713 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00714 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN";
00715 auto buffers = GetBuffersOwnedByManager();
00716 for (auto& buf : buffers)
00717 {
00718 auto hdr = getEventHeader_(buf);
00719 if (hdr->sequence_id == seqID)
00720 {
00721 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf;
00722 return buf;
00723 }
00724 }
00725
00726 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00727 if (released_incomplete_events_.count(seqID)) {
00728 TLOG(TLVL_ERROR) << "Event " << std::to_string(seqID) << " has already been marked \"Incomplete\" and sent to art!";
00729 return -2;
00730 }
00731 #endif
00732
00733 if (!create_new) return -1;
00734
00735 check_pending_buffers_(lk);
00736 int new_buffer = GetBufferForWriting(false);
00737
00738 if (new_buffer == -1)
00739 {
00740 new_buffer = GetBufferForWriting(overwrite_mode_);
00741 }
00742
00743 if (new_buffer == -1) return -1;
00744 TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
00745 auto hdr = getEventHeader_(new_buffer);
00746 hdr->is_complete = false;
00747 hdr->run_id = run_id_;
00748 hdr->subrun_id = subrun_id_;
00749 hdr->sequence_id = seqID;
00750 buffer_writes_pending_[new_buffer] = 0;
00751 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00752 #if ART_HEX_VERSION >= 0x21100
00753 SetMFIteration("Sequence ID " + std::to_string(seqID));
00754 #endif
00755
00756 active_buffers_.insert(new_buffer);
00757
00758 if (requests_) {
00759 if (timestamp != Fragment::InvalidTimestamp)
00760 {
00761 requests_->AddRequest(seqID, timestamp);
00762 }
00763 requests_->SendRequest();
00764 }
00765 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer;
00766 return new_buffer;
00767 }
00768
00769 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00770 {
00771 if (buffer == -1) return true;
00772 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
00773 {
00774 return true;
00775 }
00776 ResetReadPos(buffer);
00777 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00778 return MoreDataInBuffer(buffer);
00779 }
00780
00781 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
00782 {
00783 auto hdr = getEventHeader_(buffer);
00784 if (hdr->is_complete)
00785 {
00786 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << ".";
00787
00788 if (requests_) {
00789 requests_->RemoveRequest(hdr->sequence_id);
00790 requests_->SendRoutingToken(1);
00791 }
00792 {
00793 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00794 active_buffers_.erase(buffer);
00795 pending_buffers_.insert(buffer);
00796 }
00797 }
00798 check_pending_buffers_();
00799 }
00800
00801 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
00802 {
00803 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
00804 }
00805
00806 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
00807 {
00808 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
00809
00810 auto buffers = GetBuffersOwnedByManager();
00811 for (auto buf : buffers)
00812 {
00813 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
00814 {
00815 auto hdr = getEventHeader_(buf);
00816 if (active_buffers_.count(buf))
00817 {
00818 if (requests_) {
00819 requests_->RemoveRequest(hdr->sequence_id);
00820 requests_->SendRoutingToken(1);
00821 }
00822 active_buffers_.erase(buf);
00823 pending_buffers_.insert(buf);
00824 subrun_incomplete_event_count_++;
00825 run_incomplete_event_count_++;
00826 if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
00827 if (!released_incomplete_events_.count(hdr->sequence_id)) {
00828 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
00829 }
00830 else {
00831 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
00832 }
00833 TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
00834 }
00835
00836 }
00837 }
00838
00839 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
00840
00841
00842 if (WriteReadyCount(false) != 0)
00843 {
00844 for (auto buf : active_buffers_)
00845 {
00846 auto hdr = getEventHeader_(buf);
00847 TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE";
00848 if (hdr->sequence_id < lowestSeqId)
00849 {
00850 lowestSeqId = hdr->sequence_id;
00851 }
00852 }
00853 TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId);
00854 }
00855
00856 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
00857 sorted_buffers.sort([this](int a, int b) {return bufferComparator(a, b); });
00858
00859 auto counter = 0;
00860 double eventSize = 0;
00861 for (auto buf : sorted_buffers)
00862 {
00863 auto hdr = getEventHeader_(buf);
00864 if (hdr->sequence_id > lowestSeqId) break;
00865 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art.";
00866 MarkBufferFull(buf);
00867 subrun_event_count_++;
00868 run_event_count_++;
00869 counter++;
00870 eventSize += BufferDataSize(buf);
00871 pending_buffers_.erase(buf);
00872 }
00873 eventSize /= counter;
00874
00875 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
00876 if (metricMan)
00877 {
00878 auto full = ReadReadyCount();
00879 auto empty = WriteReadyCount(overwrite_mode_);
00880 auto total = size();
00881
00882 metricMan->sendMetric("Event Rate", counter, "Events/s", 1, MetricMode::Rate);
00883 metricMan->sendMetric("Events Released to art (run)", run_event_count_, "Events", 1, MetricMode::LastPoint);
00884 metricMan->sendMetric("Incomplete Events Released to art (run)", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
00885 metricMan->sendMetric("Events Released to art (subrun)", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
00886 metricMan->sendMetric("Incomplete Events Released to art (subrun)", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
00887 metricMan->sendMetric("Event Size", eventSize, "Bytes", 1, MetricMode::Average);
00888
00889 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
00890 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
00891 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00892 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00893 }
00894 TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
00895 }
00896
00897 void artdaq::SharedMemoryEventManager::send_init_frag_()
00898 {
00899 if (init_fragment_ != nullptr)
00900 {
00901 TLOG(TLVL_TRACE) << "Sending init Fragment to art...";
00902
00903 #if 0
00904 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
00905 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
00906 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
00907 ostream.close();
00908 #endif
00909
00910 broadcastFragment_(std::move(init_fragment_), init_fragment_);
00911 TLOG(TLVL_TRACE) << "Init Fragment sent";
00912 }
00913 else if (send_init_fragments_)
00914 {
00915 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
00916 }
00917 }
00918
00919 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
00920 {
00921 if (!init_fragment_ || init_fragment_ == nullptr)
00922 {
00923 init_fragment_.swap(frag);
00924 send_init_frag_();
00925 }
00926 }