00001
00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
00003
00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
00005 #include "artdaq-core/Core/StatisticsCollection.hh"
00006 #include "artdaq-core/Utilities/TraceLock.hh"
00007 #include <sys/wait.h>
00008
// Out-of-class definition of the static member mutex sequence_id_mutex_.
// Within this file it is locked by getBufferForSequenceID_() and by complete_buffer_()
// while they read or modify the active/pending buffer sets.
00009 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00010
00011 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00012 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00013 pset.get<size_t>("buffer_count"),
00014 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00015 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00016 !pset.get<bool>("broadcast_mode", false))
00017 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00018 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00019 , queue_size_(pset.get<size_t>("buffer_count"))
00020 , run_id_(0)
00021 , subrun_id_(0)
00022 , subrun_rollover_event_(Fragment::InvalidSequenceID)
00023 , last_released_event_(0)
00024 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00025 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00026 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00027 , running_(false)
00028 , buffer_writes_pending_()
00029 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00030 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00031 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00032 , run_event_count_(0)
00033 , run_incomplete_event_count_(0)
00034 , subrun_event_count_(0)
00035 , subrun_incomplete_event_count_(0)
00036 , art_processes_()
00037 , restart_art_(false)
00038 , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
00039 , current_art_pset_(art_pset)
00040 , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
00041 , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
00042 , requests_(nullptr)
00043 , data_pset_(pset)
00044 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00045 pset.get<size_t>("broadcast_buffer_count", 10),
00046 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00047 pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
00048 {
00049 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00050 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00051
00052 if (pset.get<bool>("use_art", true) == false)
00053 {
00054 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
00055 num_art_processes_ = 0;
00056 }
00057 else
00058 {
00059 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
00060 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
00061 }
00062 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00063
00064 if (overwrite_mode_ && num_art_processes_ > 0)
00065 {
00066 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
00067 }
00068 else if (overwrite_mode_)
00069 {
00070 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
00071 }
00072
00073 for (size_t ii = 0; ii < size(); ++ii)
00074 {
00075 buffer_writes_pending_[ii] = 0;
00076 }
00077
00078 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00079
00080 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
00081 SetRank(my_rank);
00082 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
00083
00084
00085 TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
00086 }
00087
00088 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00089 {
00090 TLOG(TLVL_TRACE) << "DESTRUCTOR";
00091 if (running_) endOfData();
00092 TLOG(TLVL_TRACE) << "Destructor END";
00093 }
00094
00095 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00096 {
00097 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
00098 << ", sequence_id=" << std::to_string(frag.sequence_id);
00099 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00100 TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer);
00101 if (buffer == -1) return false;
00102 if (buffer == -2)
00103 {
00104 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id);
00105 return true;
00106 }
00107
00108 auto hdr = getEventHeader_(buffer);
00109 if (update_run_ids_)
00110 {
00111 hdr->run_id = run_id_;
00112 hdr->subrun_id = subrun_id_;
00113 }
00114
00115 TLOG(TLVL_TRACE) << "AddFragment before Write calls";
00116 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00117
00118 TLOG(TLVL_TRACE) << "Checking for complete event";
00119 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00120 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00121 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00122 << ", fragmentCount=" << std::to_string(fragmentCount)
00123 << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
00124 << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]);
00125
00126 complete_buffer_(buffer);
00127 if (requests_) requests_->SendRequest(true);
00128
00129 TLOG(TLVL_TRACE) << "AddFragment END";
00130 return true;
00131 }
00132
00133 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00134 {
00135 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
00136 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00137 auto data = frag->headerAddress();
00138 auto start = std::chrono::steady_clock::now();
00139 bool sts = false;
00140 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00141 {
00142 sts = AddFragment(hdr, data);
00143 if (!sts) usleep(1000);
00144 }
00145 if (!sts)
00146 {
00147 outfrag = std::move(frag);
00148 }
00149 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
00150 return sts;
00151 }
00152
00153 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00154 {
00155 TLOG(14) << "WriteFragmentHeader BEGIN";
00156 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00157
00158 if (buffer < 0)
00159 {
00160 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00161 if (buffer == -2)
00162 {
00163 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because data taking has already passed this event.";
00164 }
00165 else
00166 {
00167 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because there is no room in the queue and reliable mode is off.";
00168 }
00169 dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
00170 return dropped_data_->dataBegin();
00171 }
00172
00173 if (metricMan)
00174 {
00175 metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
00176 }
00177
00178 buffer_writes_pending_[buffer]++;
00179 TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
00180 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00181
00182 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00183 if (frag.word_count - frag.num_words() > 0)
00184 {
00185 IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00186 }
00187
00188 TLOG(14) << "WriteFragmentHeader END";
00189 return pos;
00190
00191 }
00192
00193 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00194 {
00195 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
00196 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00197 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00198 if (buffer == -2) return;
00199 TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
00200
00201 auto hdr = getEventHeader_(buffer);
00202 if (update_run_ids_)
00203 {
00204 hdr->run_id = run_id_;
00205 hdr->subrun_id = subrun_id_;
00206 }
00207
00208 buffer_writes_pending_[buffer]--;
00209 if (buffer_writes_pending_[buffer] != 0)
00210 {
00211 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
00212 return;
00213 }
00214 auto frag_count = GetFragmentCount(frag.sequence_id);
00215 hdr->is_complete = frag_count == num_fragments_per_event_;
00216 #if ART_SUPPORTS_DUPLICATE_EVENTS
00217 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
00218 {
00219 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00220 }
00221 #endif
00222
00223 complete_buffer_(buffer);
00224 if (requests_) requests_->SendRequest(true);
00225 TLOG(TLVL_TRACE) << "DoneWritingFragment END";
00226 }
00227
00228 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00229 {
00230 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00231 }
00232
00233 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00234 {
00235 if (buffer == -1) return 0;
00236 ResetReadPos(buffer);
00237 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00238
00239 size_t count = 0;
00240
00241 while (MoreDataInBuffer(buffer))
00242 {
00243 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00244 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00245 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00246 TLOG(TLVL_TRACE) << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count";
00247 ++count;
00248 }
00249
00250 return count;
00251 }
00252
00253 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
00254 {
00255 do
00256 {
00257 auto start_time = std::chrono::steady_clock::now();
00258 send_init_frag_();
00259 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
00260
00261 char* filename = new char[config_file->getFileName().length() + 1];
00262 strcpy(filename, config_file->getFileName().c_str());
00263
00264 std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
00265
00266
00267 auto pid = fork();
00268 if (pid == 0)
00269 {
00270 execvp("art", &args[0]);
00271 delete[] filename;
00272 exit(1);
00273 }
00274 delete[] filename;
00275 pid_out = pid;
00276
00277 TLOG(TLVL_INFO) << "PID of new art process is " << pid;
00278 art_processes_.insert(pid);
00279 siginfo_t status;
00280 auto sts = waitid(P_PID, pid, &status, WEXITED);
00281 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
00282 art_processes_.erase(pid);
00283 if (sts < 0)
00284 {
00285 TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
00286 }
00287 else if (status.si_code == CLD_EXITED && status.si_status == 0)
00288 {
00289 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
00290 }
00291 else
00292 {
00293 auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
00294 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
00295
00296 auto exit_type = "exited with status code";
00297 switch (status.si_code)
00298 {
00299 case CLD_DUMPED:
00300 case CLD_KILLED:
00301 exit_type = "was killed with signal";
00302 break;
00303 case CLD_EXITED:
00304 default:
00305 break;
00306 }
00307
00308 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
00309 << "art process " << pid << " " << exit_type << " " << status.si_status
00310 << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
00311 << " after " << std::setprecision(2) << art_lifetime << " seconds, "
00312 << (restart_art_ ? "restarting" : "not restarting");
00313 }
00314 } while (restart_art_);
00315 }
00316
00317 void artdaq::SharedMemoryEventManager::StartArt()
00318 {
00319 restart_art_ = always_restart_art_;
00320 if (num_art_processes_ == 0) return;
00321 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00322 {
00323 StartArtProcess(current_art_pset_);
00324 }
00325 }
00326
00327 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00328 {
00329 static std::mutex start_art_mutex;
00330 TraceLock lk(start_art_mutex, 15, "StartArtLock");
00331 restart_art_ = always_restart_art_;
00332 auto initialCount = GetAttachedCount();
00333 auto startTime = std::chrono::steady_clock::now();
00334
00335 if (pset != current_art_pset_ || !current_art_config_file_)
00336 {
00337 current_art_pset_ = pset;
00338 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00339 }
00340 pid_t pid = -1;
00341 boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
00342 thread.detach();
00343
00344
00345 while (GetAttachedCount() - initialCount < 1 && TimeUtils::GetElapsedTime(startTime) < 5)
00346 {
00347 usleep(1000);
00348 }
00349 if (GetAttachedCount() - initialCount < 1 || pid <= 0)
00350 {
00351 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00352 << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")";
00353 return 0;
00354 }
00355 else
00356 {
00357 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00358 << TimeUtils::GetElapsedTime(startTime) << " seconds.";
00359
00360 return pid;
00361 }
00362
00363 }
00364
00365 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t> pids)
00366 {
00367 restart_art_ = false;
00368
00369
00370
00371 for (auto pid = pids.begin(); pid != pids.end();)
00372 {
00373 if (kill(*pid, 0) < 0)
00374 {
00375 pid = pids.erase(pid);
00376 }
00377 else
00378 {
00379 ++pid;
00380 }
00381 }
00382 if (pids.size() == 0)
00383 {
00384 TLOG(14) << "All art processes already exited, nothing to do.";
00385 usleep(1000);
00386 return;
00387 }
00388
00389 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
00390 for (auto pid : pids)
00391 {
00392 kill(pid, SIGQUIT);
00393 }
00394
00395 int graceful_wait_ms = 5000;
00396 int int_wait_ms = 1000;
00397
00398 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
00399 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00400 {
00401 usleep(1000);
00402
00403 for (auto pid = pids.begin(); pid != pids.end();)
00404 {
00405 if (kill(*pid, 0) < 0)
00406 {
00407 pid = pids.erase(pid);
00408 }
00409 else
00410 {
00411 ++pid;
00412 }
00413 }
00414 if (pids.size() == 0)
00415 {
00416 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00417 return;
00418 }
00419 }
00420
00421 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
00422 for (auto pid : pids)
00423 {
00424 kill(pid, SIGINT);
00425 }
00426
00427 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
00428 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00429 {
00430 usleep(1000);
00431
00432 for (auto pid = pids.begin(); pid != pids.end();)
00433 {
00434 if (kill(*pid, 0) < 0)
00435 {
00436 pid = pids.erase(pid);
00437 }
00438 else
00439 {
00440 ++pid;
00441 }
00442 }
00443
00444 if (pids.size() == 0)
00445 {
00446 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
00447 return;
00448 }
00449 }
00450
00451 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
00452 while (pids.size() > 0)
00453 {
00454 kill(*pids.begin(), SIGKILL);
00455 usleep(1000);
00456
00457 for (auto pid = pids.begin(); pid != pids.end();)
00458 {
00459 if (kill(*pid, 0) < 0)
00460 {
00461 pid = pids.erase(pid);
00462 }
00463 else
00464 {
00465 ++pid;
00466 }
00467 }
00468 }
00469 }
00470
00471 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00472 {
00473 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
00474 if (restart_art_ || !always_restart_art_)
00475 {
00476 endOfData();
00477 }
00478 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00479 {
00480 broadcasts_.MarkBufferEmpty(ii, true);
00481 }
00482 if (newRun == 0) newRun = run_id_ + 1;
00483
00484 if (art_pset != current_art_pset_ || !current_art_config_file_)
00485 {
00486 current_art_pset_ = art_pset;
00487 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00488 }
00489
00490 if (n_art_processes != -1)
00491 {
00492 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
00493 num_art_processes_ = n_art_processes;
00494 }
00495 startRun(newRun);
00496 TLOG(TLVL_DEBUG) << "ReconfigureArt END";
00497 }
00498
00499 bool artdaq::SharedMemoryEventManager::endOfData()
00500 {
00501 init_fragment_.reset(nullptr);
00502 TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData";
00503 restart_art_ = false;
00504
00505 size_t initialStoreSize = GetIncompleteEventCount();
00506 TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
00507 << " stale events from the SharedMemoryEventManager.";
00508 int counter = initialStoreSize;
00509 while (active_buffers_.size() > 0 && counter > 0)
00510 {
00511 complete_buffer_(*active_buffers_.begin());
00512 counter--;
00513 }
00514 TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00515 << " stale events in the SharedMemoryEventManager.";
00516
00517
00518 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
00519 auto start = std::chrono::steady_clock::now();
00520 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00521 auto end_of_data_wait_us = art_event_processing_time_us_ * size();
00522
00523
00524 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && art_processes_.size() > 0)
00525 {
00526 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00527 if (temp != lastReadCount)
00528 {
00529 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers...";
00530 lastReadCount = temp;
00531 start = std::chrono::steady_clock::now();
00532 }
00533 if (lastReadCount > 0) usleep(art_event_processing_time_us_);
00534 }
00535 TLOG(TLVL_TRACE) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: " << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << art_processes_.size();
00536
00537 TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment";
00538 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
00539 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00540 if (!success)
00541 {
00542 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
00543 for (size_t ii = 0; ii < size(); ++ii)
00544 {
00545 broadcasts_.MarkBufferEmpty(ii, true);
00546 }
00547 broadcastFragment_(std::move(outFrag), outFrag);
00548 }
00549 auto endOfDataProcessingStart = std::chrono::steady_clock::now();
00550
00551 if (art_processes_.size() > 0)
00552 {
00553 TLOG(TLVL_DEBUG) << "Allowing " << std::to_string(art_processes_.size()) << " art processes the chance to end gracefully";
00554 if (end_of_data_wait_us == 0)
00555 {
00556 TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
00557 end_of_data_wait_us = 100 * 1000000;
00558 }
00559
00560 auto sleep_count = (end_of_data_wait_us / 10000) + 1;
00561 for (size_t ii = 0; ii < sleep_count; ++ii)
00562 {
00563 usleep(10000);
00564 if (art_processes_.size() == 0) break;
00565 }
00566 }
00567
00568 while (art_processes_.size() > 0)
00569 {
00570 TLOG(TLVL_DEBUG) << "There are " << std::to_string(art_processes_.size()) << " art processes remaining. Proceeding to shutdown.";
00571 ShutdownArtProcesses(art_processes_);
00572 }
00573 TLOG(TLVL_INFO) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
00574
00575 ResetAttachedCount();
00576
00577 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers";
00578 for (size_t ii = 0; ii < size(); ++ii)
00579 {
00580 MarkBufferEmpty(ii, true);
00581 }
00582 released_incomplete_events_.clear();
00583
00584 TLOG(TLVL_TRACE) << "endOfData: Shutting down RequestReceiver";
00585 requests_.reset(nullptr);
00586
00587 TLOG(TLVL_TRACE) << "endOfData END";
00588 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
00589 running_ = false;
00590 return true;
00591 }
00592
00593 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00594 {
00595 running_ = true;
00596 init_fragment_.reset(nullptr);
00597 StartArt();
00598 run_id_ = runID;
00599 subrun_id_ = 1;
00600 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00601 last_released_event_ = 0;
00602 requests_.reset(new RequestSender(data_pset_));
00603 if (requests_) requests_->SendRoutingToken(queue_size_);
00604 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00605 << ", max queue size = "
00606 << queue_size_
00607 << ", queue size = "
00608 << GetLockedBufferCount();
00609 if (metricMan)
00610 {
00611 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00612 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00613 }
00614 }
00615
00616 void artdaq::SharedMemoryEventManager::startSubrun()
00617 {
00618 ++subrun_id_;
00619 subrun_rollover_event_ = Fragment::InvalidSequenceID;
00620 if (metricMan)
00621 {
00622 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00623 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00624 }
00625 }
00626
00627 bool artdaq::SharedMemoryEventManager::endRun()
00628 {
00629 TLOG(TLVL_INFO) << "Ending run " << run_id_;
00630 FragmentPtr endOfRunFrag(new
00631 Fragment(static_cast<size_t>
00632 (ceil(sizeof(my_rank) /
00633 static_cast<double>(sizeof(Fragment::value_type))))));
00634
00635 TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
00636 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00637 *endOfRunFrag->dataBegin() = my_rank;
00638 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00639
00640 TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
00641 run_event_count_ = 0;
00642 run_incomplete_event_count_ = 0;
00643 return true;
00644 }
00645
00646 bool artdaq::SharedMemoryEventManager::endSubrun()
00647 {
00648 TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
00649 std::unique_ptr<artdaq::Fragment>
00650 endOfSubrunFrag(new
00651 Fragment(static_cast<size_t>
00652 (ceil(sizeof(my_rank) /
00653 static_cast<double>(sizeof(Fragment::value_type))))));
00654
00655 TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
00656 endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
00657 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00658 *endOfSubrunFrag->dataBegin() = my_rank;
00659
00660 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00661
00662 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
00663 subrun_event_count_ = 0;
00664 subrun_incomplete_event_count_ = 0;
00665
00666 return true;
00667 }
00668
00669 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
00670 {
00671
00672 if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
00673
00674 if (boundary < last_released_event_)
00675 {
00676 auto logLevel = TLVL_ERROR;
00677 bool processAnyway = false;
00678 if (last_released_event_ - boundary < 100)
00679 {
00680 logLevel = TLVL_WARNING;
00681 processAnyway = true;
00682 }
00683 TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (delta = " << (last_released_event_ - boundary) << ").";
00684 if (!processAnyway) return;
00685 }
00686 TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
00687 subrun_rollover_event_ = boundary;
00688 }
00689
00690 void artdaq::SharedMemoryEventManager::sendMetrics()
00691 {
00692 if (metricMan)
00693 {
00694 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00695 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00696 }
00697
00698 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00699 {
00700 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00701 return;
00702
00703 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00704 std::ostringstream oss;
00705 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00706 for (auto& ev : active_buffers_)
00707 {
00708 auto hdr = getEventHeader_(ev);
00709 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00710 }
00711 TLOG(TLVL_DEBUG) << oss.str();
00712 }
00713 }
00714
00715 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00716 {
00717 TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
00718 auto buffer = broadcasts_.GetBufferForWriting(false);
00719 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
00720 auto start_time = std::chrono::steady_clock::now();
00721 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00722 {
00723 usleep(10000);
00724 buffer = broadcasts_.GetBufferForWriting(false);
00725 }
00726 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
00727 if (buffer == -1)
00728 {
00729 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
00730 outFrag.swap(frag);
00731 return false;
00732 }
00733
00734 TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
00735 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00736 hdr->run_id = run_id_;
00737 hdr->subrun_id = subrun_id_;
00738 hdr->sequence_id = frag->sequenceID();
00739 hdr->is_complete = true;
00740 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00741
00742 TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
00743 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00744
00745 TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
00746 broadcasts_.MarkBufferFull(buffer, -1);
00747 outFrag.swap(frag);
00748 TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
00749 return true;
00750 }
00751
00752 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00753 {
00754 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00755 }
00756
00757 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00758 {
00759 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00760 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN";
00761 auto buffers = GetBuffersOwnedByManager();
00762 for (auto& buf : buffers)
00763 {
00764 auto hdr = getEventHeader_(buf);
00765 if (hdr->sequence_id == seqID)
00766 {
00767 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf;
00768 return buf;
00769 }
00770 }
00771
00772 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00773 if (released_incomplete_events_.count(seqID))
00774 {
00775 TLOG(TLVL_ERROR) << "Event " << std::to_string(seqID) << " has already been marked \"Incomplete\" and sent to art!";
00776 return -2;
00777 }
00778 #endif
00779
00780 if (!create_new) return -1;
00781
00782 check_pending_buffers_(lk);
00783 int new_buffer = GetBufferForWriting(false);
00784
00785 if (new_buffer == -1)
00786 {
00787 new_buffer = GetBufferForWriting(overwrite_mode_);
00788 }
00789
00790 if (new_buffer == -1) return -1;
00791 TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
00792 auto hdr = getEventHeader_(new_buffer);
00793 hdr->is_complete = false;
00794 hdr->run_id = run_id_;
00795 hdr->subrun_id = subrun_id_;
00796 hdr->sequence_id = seqID;
00797 buffer_writes_pending_[new_buffer] = 0;
00798 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00799 SetMFIteration("Sequence ID " + std::to_string(seqID));
00800
00801 active_buffers_.insert(new_buffer);
00802
00803 if (requests_)
00804 {
00805 if (timestamp != Fragment::InvalidTimestamp)
00806 {
00807 requests_->AddRequest(seqID, timestamp);
00808 }
00809 requests_->SendRequest();
00810 }
00811 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer;
00812 return new_buffer;
00813 }
00814
00815 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00816 {
00817 if (buffer == -1) return true;
00818 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
00819 {
00820 return true;
00821 }
00822 ResetReadPos(buffer);
00823 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00824 return MoreDataInBuffer(buffer);
00825 }
00826
00827 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
00828 {
00829 auto hdr = getEventHeader_(buffer);
00830 if (hdr->is_complete)
00831 {
00832 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << ".";
00833
00834 if (requests_)
00835 {
00836 requests_->RemoveRequest(hdr->sequence_id);
00837 requests_->SendRoutingToken(1);
00838 }
00839 {
00840 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00841 active_buffers_.erase(buffer);
00842 pending_buffers_.insert(buffer);
00843 }
00844 }
00845 check_pending_buffers_();
00846 }
00847
00848 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
00849 {
00850 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
00851 }
00852
00853 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
00854 {
00855 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
00856
00857 auto buffers = GetBuffersOwnedByManager();
00858 for (auto buf : buffers)
00859 {
00860 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
00861 {
00862 auto hdr = getEventHeader_(buf);
00863 if (active_buffers_.count(buf))
00864 {
00865 if (requests_)
00866 {
00867 requests_->RemoveRequest(hdr->sequence_id);
00868 requests_->SendRoutingToken(1);
00869 }
00870 active_buffers_.erase(buf);
00871 pending_buffers_.insert(buf);
00872 subrun_incomplete_event_count_++;
00873 run_incomplete_event_count_++;
00874 if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
00875 if (!released_incomplete_events_.count(hdr->sequence_id))
00876 {
00877 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
00878 }
00879 else
00880 {
00881 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
00882 }
00883 TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
00884 }
00885
00886 }
00887 }
00888
00889 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
00890
00891
00892 if (WriteReadyCount(false) != 0)
00893 {
00894 for (auto buf : active_buffers_)
00895 {
00896 auto hdr = getEventHeader_(buf);
00897 TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE";
00898 if (hdr->sequence_id < lowestSeqId)
00899 {
00900 lowestSeqId = hdr->sequence_id;
00901 }
00902 }
00903 TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId);
00904 }
00905
00906 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
00907 sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
00908
00909 auto counter = 0;
00910 double eventSize = 0;
00911 for (auto buf : sorted_buffers)
00912 {
00913 auto hdr = getEventHeader_(buf);
00914 if (hdr->sequence_id > lowestSeqId) break;
00915
00916 if (hdr->sequence_id >= subrun_rollover_event_)
00917 {
00918 TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << ").";
00919 endSubrun();
00920 startSubrun();
00921 }
00922 if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
00923
00924 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art.";
00925 MarkBufferFull(buf);
00926 subrun_event_count_++;
00927 run_event_count_++;
00928 counter++;
00929 eventSize += BufferDataSize(buf);
00930 pending_buffers_.erase(buf);
00931 }
00932 eventSize /= counter;
00933
00934 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
00935 if (metricMan)
00936 {
00937 auto full = ReadReadyCount();
00938 auto empty = WriteReadyCount(overwrite_mode_);
00939 auto total = size();
00940
00941 metricMan->sendMetric("Event Rate", counter, "Events/s", 1, MetricMode::Rate);
00942 metricMan->sendMetric("Events Released to art (run)", run_event_count_, "Events", 1, MetricMode::LastPoint);
00943 metricMan->sendMetric("Incomplete Events Released to art (run)", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
00944 metricMan->sendMetric("Events Released to art (subrun)", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
00945 metricMan->sendMetric("Incomplete Events Released to art (subrun)", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
00946 metricMan->sendMetric("Event Size", eventSize, "Bytes", 1, MetricMode::Average);
00947
00948 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
00949 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
00950 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00951 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00952 }
00953 TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
00954 }
00955
00956 void artdaq::SharedMemoryEventManager::send_init_frag_()
00957 {
00958 if (init_fragment_ != nullptr)
00959 {
00960 TLOG(TLVL_TRACE) << "Sending init Fragment to art...";
00961
00962 #if 0
00963 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
00964 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
00965 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
00966 ostream.close();
00967 #endif
00968
00969 broadcastFragment_(std::move(init_fragment_), init_fragment_);
00970 TLOG(TLVL_TRACE) << "Init Fragment sent";
00971 }
00972 else if (send_init_fragments_)
00973 {
00974 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
00975 }
00976 }
00977
00978 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
00979 {
00980 if (!init_fragment_ || init_fragment_ == nullptr)
00981 {
00982 init_fragment_.swap(frag);
00983 send_init_frag_();
00984 }
00985 }
00986
// The macro invocation below is compiled only when MESSAGEFACILITY_HEX_VERSION is at least 0x20103.
// NOTE(review): presumably it publishes the allowed FHiCL configuration for
// artdaq::SharedMemoryEventManager - confirm against the macro's definition.
00987 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
00988 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
00989 #endif