00001
00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
00003
00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
00005 #include "artdaq-core/Core/StatisticsCollection.hh"
00006 #include "artdaq-core/Utilities/TraceLock.hh"
00007 #include <sys/wait.h>
00008 #include "SharedMemoryEventManager.hh"
00009
// Definition of the class-level (static) mutex. In this file it is locked at the top of
// getBufferForSequenceID_ and around the active/pending set update in complete_buffer_.
std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00011
00012 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00013 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00014 pset.get<size_t>("buffer_count"),
00015 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00016 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00017 !pset.get<bool>("broadcast_mode", false))
00018 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00019 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00020 , queue_size_(pset.get<size_t>("buffer_count"))
00021 , run_id_(0)
00022 , subrun_id_(0)
00023 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00024 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00025 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00026 , buffer_writes_pending_()
00027 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00028 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00029 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00030 , broadcast_count_(0)
00031 , subrun_event_count_(0)
00032 , art_processes_()
00033 , restart_art_(false)
00034 , current_art_pset_(art_pset)
00035 , requests_(pset)
00036 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00037 pset.get<size_t>("broadcast_buffer_count", 10),
00038 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00039 pset.get<int>("fragment_broadcast_timeout_ms", 3000) * 1000, false)
00040 {
00041 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00042 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00043
00044 if (pset.get<bool>("use_art", true) == false) {
00045 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false" << TLOG_ENDL;
00046 num_art_processes_ = 0;
00047 }
00048 else {
00049 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true" << TLOG_ENDL;
00050 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string() << TLOG_ENDL;
00051 }
00052 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00053
00054 if (overwrite_mode_ && num_art_processes_ > 0)
00055 {
00056 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!" << TLOG_ENDL;
00057 }
00058 else if (overwrite_mode_)
00059 {
00060 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup" << TLOG_ENDL;
00061 }
00062
00063 for (size_t ii = 0; ii < size(); ++ii)
00064 {
00065 buffer_writes_pending_[ii] = 0;
00066 }
00067
00068 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00069
00070 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank << TLOG_ENDL;
00071 SetRank(my_rank);
00072 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank() << TLOG_ENDL;
00073
00074
00075 TLOG(TLVL_TRACE) << "END CONSTRUCTOR" << TLOG_ENDL;
00076 }
00077
00078 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00079 {
00080 TLOG(TLVL_TRACE) << "DESTRUCTOR" << TLOG_ENDL;
00081 endOfData();
00082 TLOG(TLVL_TRACE) << "Destructor END" << TLOG_ENDL;
00083 }
00084
00085 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00086 {
00087 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
00088 << ", sequence_id=" << std::to_string(frag.sequence_id) << TLOG_ENDL;
00089 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00090 TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer) << TLOG_ENDL;
00091 if (buffer == -1) return false;
00092 if (buffer == -2)
00093 {
00094 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
00095 return true;
00096 }
00097
00098 auto hdr = getEventHeader_(buffer);
00099 if (update_run_ids_)
00100 {
00101 hdr->run_id = run_id_;
00102 hdr->subrun_id = subrun_id_;
00103 }
00104
00105 TLOG(TLVL_TRACE) << "AddFragment before Write calls" << TLOG_ENDL;
00106 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00107
00108 TLOG(TLVL_TRACE) << "Checking for complete event" << TLOG_ENDL;
00109 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00110 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00111 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00112 << ", fragmentCount=" << std::to_string(fragmentCount)
00113 << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
00114 << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]) << TLOG_ENDL;
00115
00116 complete_buffer_(buffer);
00117 requests_.SendRequest(true);
00118
00119 TLOG(TLVL_TRACE) << "AddFragment END" << TLOG_ENDL;
00120 return true;
00121 }
00122
00123 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00124 {
00125 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN" << TLOG_ENDL;
00126 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00127 auto data = frag->headerAddress();
00128 auto start = std::chrono::steady_clock::now();
00129 bool sts = false;
00130 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00131 {
00132 sts = AddFragment(hdr, data);
00133 if (!sts) usleep(1000);
00134 }
00135 if (!sts)
00136 {
00137 outfrag = std::move(frag);
00138 }
00139 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts << TLOG_ENDL;
00140 return sts;
00141 }
00142
00143 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00144 {
00145 TLOG(14) << "WriteFragmentHeader BEGIN" << TLOG_ENDL;
00146 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00147
00148 if (buffer < 0)
00149 {
00150 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00151 if (buffer == -2)
00152 {
00153 TLOG(TLVL_ERROR) << "Dropping fragment because data taking has already passed this event number: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
00154 }
00155 else
00156 {
00157 TLOG(TLVL_ERROR) << "Dropping fragment because there is no room in the queue and reliable mode is off: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
00158 }
00159 dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
00160 return dropped_data_->dataBegin();
00161 }
00162
00163 buffer_writes_pending_[buffer]++;
00164 TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
00165 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00166
00167 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00168 if (frag.word_count - frag.num_words() > 0) {
00169 IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00170 }
00171
00172 TLOG(14) << "WriteFragmentHeader END" << TLOG_ENDL;
00173 return pos;
00174
00175 }
00176
00177 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00178 {
00179 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN" << TLOG_ENDL;
00180 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00181 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00182 if (buffer == -2) return;
00183 TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
00184
00185 auto hdr = getEventHeader_(buffer);
00186 if (update_run_ids_)
00187 {
00188 hdr->run_id = run_id_;
00189 hdr->subrun_id = subrun_id_;
00190 }
00191
00192 buffer_writes_pending_[buffer]--;
00193 if (buffer_writes_pending_[buffer] != 0)
00194 {
00195 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps." << TLOG_ENDL;
00196 return;
00197 }
00198 auto frag_count = GetFragmentCount(frag.sequence_id);
00199 hdr->is_complete = frag_count == num_fragments_per_event_;
00200 #if ART_SUPPORTS_DUPLICATE_EVENTS
00201 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id)) {
00202 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00203 }
00204 #endif
00205
00206 complete_buffer_(buffer);
00207 requests_.SendRequest(true);
00208 TLOG(TLVL_TRACE) << "DoneWritingFragment END" << TLOG_ENDL;
00209 }
00210
00211 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00212 {
00213 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00214 }
00215
00216 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00217 {
00218 if (buffer == -1) return 0;
00219 ResetReadPos(buffer);
00220 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00221
00222 size_t count = 0;
00223
00224 while (MoreDataInBuffer(buffer))
00225 {
00226 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00227 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00228 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00229 TLOG_TRACE("GetFragmentCount") << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count" << TLOG_ENDL;
00230 ++count;
00231 }
00232
00233 return count;
00234 }
00235
00236 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
00237 {
00238 while (restart_art_)
00239 {
00240 send_init_frag_();
00241 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName() << TLOG_ENDL;
00242 std::vector<char*> args{ (char*)"art", (char*)"-c", &config_file->getFileName()[0], NULL };
00243
00244 auto pid = fork();
00245 if (pid == 0)
00246 {
00247 execvp("art", &args[0]);
00248 exit(1);
00249 }
00250 pid_out = pid;
00251
00252 TLOG(TLVL_INFO) << "PID of new art process is " << pid << TLOG_ENDL;
00253 art_processes_.insert(pid);
00254 int status;
00255 waitpid(pid, &status, 0);
00256 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list" << TLOG_ENDL;
00257 art_processes_.erase(pid);
00258 if (status == 0)
00259 {
00260 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting") << TLOG_ENDL;
00261 }
00262 else
00263 {
00264 TLOG(TLVL_WARNING) << "art process " << pid << " exited with status code 0x" << std::hex << status << " (" << std::dec << status << "), " << (restart_art_ ? "restarting" : "not restarting") << TLOG_ENDL;
00265 }
00266 }
00267 }
00268
00269 void artdaq::SharedMemoryEventManager::StartArt()
00270 {
00271 restart_art_ = true;
00272 if (num_art_processes_ == 0) return;
00273 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00274 {
00275 StartArtProcess(current_art_pset_);
00276 }
00277 }
00278
00279 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00280 {
00281 static std::mutex start_art_mutex;
00282 TraceLock lk(start_art_mutex, 15, "StartArtLock");
00283 restart_art_ = true;
00284 auto initialCount = GetAttachedCount();
00285 auto startTime = std::chrono::steady_clock::now();
00286
00287 if (pset != current_art_pset_)
00288 {
00289 current_art_pset_ = pset;
00290 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00291 }
00292 pid_t pid = -1;
00293 boost::thread thread([&] {RunArt(current_art_config_file_, pid); });
00294 thread.detach();
00295
00296
00297 while ((GetAttachedCount() - initialCount < 1 || pid <= 0)
00298 && TimeUtils::GetElapsedTime(startTime) < 5)
00299 {
00300 usleep(1000);
00301 }
00302 if (GetAttachedCount() - initialCount < 1 || pid <= 0)
00303 {
00304 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00305 << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")" << TLOG_ENDL;
00306 return 0;
00307 }
00308 else
00309 {
00310 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00311 << TimeUtils::GetElapsedTime(startTime) << " seconds." << TLOG_ENDL;
00312
00313 return pid;
00314 }
00315
00316 }
00317
00318 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t> pids)
00319 {
00320 restart_art_ = false;
00321 current_art_config_file_ = nullptr;
00322 current_art_pset_ = fhicl::ParameterSet();
00323
00324 for (auto pid : pids)
00325 {
00326 if (kill(pid, 0) >= 0)
00327 {
00328 pids.erase(pid);
00329 }
00330 }
00331 if (pids.size() == 0)
00332 {
00333 TLOG(14) << "All art processes already exited, nothing to do." << TLOG_ENDL;
00334 usleep(1000);
00335 return;
00336 }
00337
00338 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down" << TLOG_ENDL;
00339 for (auto pid : pids)
00340 {
00341 kill(pid, SIGQUIT);
00342 }
00343
00344 int graceful_wait_ms = 1000;
00345 int int_wait_ms = 100;
00346
00347 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully" << TLOG_ENDL;
00348 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00349 {
00350 usleep(1000);
00351
00352 for (auto pid : pids)
00353 {
00354 if (kill(pid, 0) < 0)
00355 {
00356 pids.erase(pid);
00357 }
00358 }
00359 if (pids.size() == 0)
00360 {
00361 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms." << TLOG_ENDL;
00362 return;
00363 }
00364 }
00365
00366 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down" << TLOG_ENDL;
00367 for (auto pid : pids)
00368 {
00369 kill(pid, SIGINT);
00370 }
00371
00372 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit" << TLOG_ENDL;
00373 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00374 {
00375 usleep(1000);
00376
00377 for (auto pid : pids)
00378 {
00379 if (kill(pid, 0) < 0)
00380 {
00381 pids.erase(pid);
00382 }
00383 }
00384
00385 if (pids.size() == 0)
00386 {
00387 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms." << TLOG_ENDL;
00388 return;
00389 }
00390 }
00391
00392 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice" << TLOG_ENDL;
00393 while (pids.size() > 0)
00394 {
00395 kill(*pids.begin(), SIGKILL);
00396 }
00397 }
00398
00399 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00400 {
00401 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN" << TLOG_ENDL;
00402 if (restart_art_)
00403 {
00404 endOfData();
00405 }
00406 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00407 {
00408 broadcasts_.MarkBufferEmpty(ii, true);
00409 }
00410 if (newRun == 0) newRun = run_id_ + 1;
00411 current_art_pset_ = art_pset;
00412 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00413
00414 if (n_art_processes != -1)
00415 {
00416 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes << TLOG_ENDL;
00417 num_art_processes_ = n_art_processes;
00418 }
00419 startRun(newRun);
00420 TLOG(TLVL_DEBUG) << "ReconfigureArt END" << TLOG_ENDL;
00421 }
00422
00423 bool artdaq::SharedMemoryEventManager::endOfData()
00424 {
00425 init_fragment_.reset(nullptr);
00426 TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData" << TLOG_ENDL;
00427 restart_art_ = false;
00428
00429 size_t initialStoreSize = GetIncompleteEventCount();
00430 TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
00431 << " stale events from the SharedMemoryEventManager." << TLOG_ENDL;
00432 int counter = initialStoreSize;
00433 while (active_buffers_.size() > 0 && counter > 0)
00434 {
00435 complete_buffer_(*active_buffers_.begin());
00436 counter--;
00437 }
00438 TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00439 << " stale events in the SharedMemoryEventManager." << TLOG_ENDL;
00440
00441
00442 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers..." << TLOG_ENDL;
00443 auto start = std::chrono::steady_clock::now();
00444 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00445
00446
00447 while (lastReadCount > 0 && TimeUtils::GetElapsedTime(start) < 1)
00448 {
00449 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00450 if (temp != lastReadCount)
00451 {
00452 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers..." << TLOG_ENDL;
00453 lastReadCount = temp;
00454 start = std::chrono::steady_clock::now();
00455 }
00456 if (lastReadCount > 0) usleep(1000);
00457 }
00458
00459 TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment" << TLOG_ENDL;
00460 FragmentPtr outFrag = std::move(Fragment::eodFrag(GetBufferCount()));
00461 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00462 if (!success)
00463 {
00464 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment" << TLOG_ENDL;
00465 for (size_t ii = 0; ii < size(); ++ii)
00466 {
00467 broadcasts_.MarkBufferEmpty(ii, true);
00468 }
00469 broadcastFragment_(std::move(outFrag), outFrag);
00470 }
00471
00472 TLOG(TLVL_DEBUG) << "Waiting for all art processes to exit, there are " << std::to_string(art_processes_.size()) << " remaining." << TLOG_ENDL;
00473 while (art_processes_.size() > 0)
00474 {
00475 ShutdownArtProcesses(art_processes_);
00476 }
00477 ResetAttachedCount();
00478
00479 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers" << TLOG_ENDL;
00480 for (size_t ii = 0; ii < size(); ++ii)
00481 {
00482 MarkBufferEmpty(ii, true);
00483 }
00484 released_incomplete_events_.clear();
00485
00486 TLOG(TLVL_TRACE) << "endOfData END" << TLOG_ENDL;
00487 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " events processed in this run." << TLOG_ENDL;
00488 return true;
00489 }
00490
00491 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00492 {
00493 init_fragment_.reset(nullptr);
00494 StartArt();
00495 run_id_ = runID;
00496 subrun_id_ = 1;
00497 requests_.SendRoutingToken(queue_size_);
00498 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00499 << ", max queue size = "
00500 << queue_size_
00501 << ", queue size = "
00502 << GetLockedBufferCount() << TLOG_ENDL;
00503 if (metricMan)
00504 {
00505 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00506 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00507 }
00508 }
00509
00510 void artdaq::SharedMemoryEventManager::startSubrun()
00511 {
00512 ++subrun_id_;
00513 if (metricMan)
00514 {
00515 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00516 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00517 }
00518 }
00519
00520 bool artdaq::SharedMemoryEventManager::endRun()
00521 {
00522 FragmentPtr endOfRunFrag(new
00523 Fragment(static_cast<size_t>
00524 (ceil(sizeof(my_rank) /
00525 static_cast<double>(sizeof(Fragment::value_type))))));
00526
00527 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00528 *endOfRunFrag->dataBegin() = my_rank;
00529 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00530
00531 return true;
00532 }
00533
00534 bool artdaq::SharedMemoryEventManager::endSubrun()
00535 {
00536 std::unique_ptr<artdaq::Fragment>
00537 endOfSubrunFrag(new
00538 Fragment(static_cast<size_t>
00539 (ceil(sizeof(my_rank) /
00540 static_cast<double>(sizeof(Fragment::value_type))))));
00541
00542 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00543 *endOfSubrunFrag->dataBegin() = my_rank;
00544
00545 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00546
00547 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun." << TLOG_ENDL;
00548 subrun_event_count_ = 0;
00549
00550 return true;
00551 }
00552
00553 void artdaq::SharedMemoryEventManager::sendMetrics()
00554 {
00555 if (metricMan)
00556 {
00557 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00558 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00559 }
00560
00561 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00562 {
00563 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00564 return;
00565
00566 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00567 std::ostringstream oss;
00568 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00569 for (auto& ev : active_buffers_)
00570 {
00571 auto hdr = getEventHeader_(ev);
00572 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00573 }
00574 TLOG(TLVL_DEBUG) << oss.str() << TLOG_ENDL;
00575 }
00576 }
00577
00578 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00579 {
00580 auto buffer = broadcasts_.GetBufferForWriting(false);
00581 auto start_time = std::chrono::steady_clock::now();
00582 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00583 {
00584 usleep(10000);
00585 buffer = broadcasts_.GetBufferForWriting(false);
00586 }
00587 if (buffer == -1)
00588 {
00589 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!" << TLOG_ENDL;
00590 outFrag.swap(frag);
00591 return false;
00592 }
00593
00594 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00595 hdr->run_id = run_id_;
00596 hdr->subrun_id = subrun_id_;
00597 hdr->sequence_id = frag->sequenceID();
00598 hdr->is_complete = true;
00599 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00600
00601 TLOG(TLVL_TRACE) << "broadcastFragment_ before Write calls" << TLOG_ENDL;
00602 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00603
00604 broadcasts_.MarkBufferFull(buffer, -1);
00605 outFrag.swap(frag);
00606 return true;
00607 }
00608
00609 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00610 {
00611 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00612 }
00613
00614 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00615 {
00616 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00617 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN" << TLOG_ENDL;
00618 auto buffers = GetBuffersOwnedByManager();
00619 for (auto& buf : buffers)
00620 {
00621 auto hdr = getEventHeader_(buf);
00622 if (hdr->sequence_id == seqID)
00623 {
00624 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf << TLOG_ENDL;
00625 return buf;
00626 }
00627 }
00628
00629 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00630 if (released_incomplete_events_.count(seqID)) {
00631 TLOG(TLVL_ERROR) << "Buffer has already been marked \"Incomplete\" and sent to art!" << TLOG_ENDL;
00632 return -2;
00633 }
00634 #endif
00635
00636 if (!create_new) return -1;
00637
00638 check_pending_buffers_(lk);
00639 int new_buffer = GetBufferForWriting(false);
00640
00641 if (new_buffer == -1)
00642 {
00643 new_buffer = GetBufferForWriting(overwrite_mode_);
00644 }
00645
00646 if (new_buffer == -1) return -1;
00647 TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
00648 auto hdr = getEventHeader_(new_buffer);
00649 hdr->is_complete = false;
00650 hdr->run_id = run_id_;
00651 hdr->subrun_id = subrun_id_;
00652 hdr->sequence_id = seqID;
00653 buffer_writes_pending_[new_buffer] = 0;
00654 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00655
00656 active_buffers_.insert(new_buffer);
00657
00658 if (timestamp != Fragment::InvalidTimestamp)
00659 {
00660 requests_.AddRequest(seqID, timestamp);
00661 }
00662 requests_.SendRequest();
00663 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer << TLOG_ENDL;
00664 return new_buffer;
00665 }
00666
00667 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00668 {
00669 if (buffer == -1) return true;
00670 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
00671 {
00672 return true;
00673 }
00674 ResetReadPos(buffer);
00675 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00676 return MoreDataInBuffer(buffer);
00677 }
00678
00679 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
00680 {
00681 auto hdr = getEventHeader_(buffer);
00682 if (hdr->is_complete)
00683 {
00684 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << "." << TLOG_ENDL;
00685
00686 requests_.RemoveRequest(hdr->sequence_id);
00687 requests_.SendRoutingToken(1);
00688 {
00689 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00690 active_buffers_.erase(buffer);
00691 pending_buffers_.insert(buffer);
00692 }
00693 }
00694 check_pending_buffers_();
00695 }
00696
00697 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
00698 {
00699 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
00700 }
00701
00702 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
00703 {
00704 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock() << TLOG_ENDL;
00705
00706 auto buffers = GetBuffersOwnedByManager();
00707 for (auto buf : buffers)
00708 {
00709 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
00710 {
00711 auto hdr = getEventHeader_(buf);
00712 if (active_buffers_.count(buf))
00713 {
00714 TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event to art." << TLOG_ENDL;
00715 requests_.RemoveRequest(hdr->sequence_id);
00716 requests_.SendRoutingToken(1);
00717 active_buffers_.erase(buf);
00718 pending_buffers_.insert(buf);
00719 if (!released_incomplete_events_.count(hdr->sequence_id)) {
00720 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
00721 }
00722 else {
00723 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
00724 }
00725 }
00726
00727 }
00728 }
00729
00730 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
00731
00732
00733 if (WriteReadyCount(false) != 0)
00734 {
00735 for (auto buf : active_buffers_)
00736 {
00737 auto hdr = getEventHeader_(buf);
00738 TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE" << TLOG_ENDL;
00739 if (hdr->sequence_id < lowestSeqId)
00740 {
00741 lowestSeqId = hdr->sequence_id;
00742 }
00743 }
00744 TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId) << TLOG_ENDL;
00745 }
00746
00747 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
00748 sorted_buffers.sort([this](int a, int b) {return bufferComparator(a, b); });
00749 for (auto buf : sorted_buffers)
00750 {
00751 auto hdr = getEventHeader_(buf);
00752 if (hdr->sequence_id > lowestSeqId) break;
00753 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art." << TLOG_ENDL;
00754 MarkBufferFull(buf);
00755 subrun_event_count_++;
00756 pending_buffers_.erase(buf);
00757 }
00758
00759 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics" << TLOG_ENDL;
00760 if (metricMan)
00761 {
00762 auto full = ReadReadyCount();
00763 auto empty = WriteReadyCount(overwrite_mode_);
00764 auto total = size();
00765 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
00766 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
00767 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00768 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00769 }
00770 TLOG(TLVL_TRACE) << "check_pending_buffers_ END" << TLOG_ENDL;
00771 }
00772
00773 void artdaq::SharedMemoryEventManager::send_init_frag_()
00774 {
00775 if (init_fragment_ != nullptr)
00776 {
00777 TLOG(TLVL_TRACE) << "Sending init Fragment to art..." << TLOG_ENDL;
00778
00779 #if 0
00780 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
00781 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
00782 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
00783 ostream.close();
00784 #endif
00785
00786 broadcastFragment_(std::move(init_fragment_), init_fragment_);
00787 TLOG(TLVL_TRACE) << "Init Fragment sent" << TLOG_ENDL;
00788 }
00789 else if (send_init_fragments_)
00790 {
00791 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!" << TLOG_ENDL;
00792 }
00793 }
00794
00795 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
00796 {
00797 if (!init_fragment_ || init_fragment_ == nullptr)
00798 {
00799 init_fragment_.swap(frag);
00800 send_init_frag_();
00801 }
00802 }