00001
00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
00003
00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
00005 #include "artdaq-core/Core/StatisticsCollection.hh"
00006 #include "artdaq-core/Utilities/TraceLock.hh"
00007 #include <sys/wait.h>
00008
// Out-of-class definition of the class-static mutex. In this file it is locked
// by getBufferForSequenceID_() and by complete_buffer_() while they touch the
// active/pending buffer sets.
std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
00010
00011 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
00012 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
00013 pset.get<size_t>("buffer_count"),
00014 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
00015 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
00016 !pset.get<bool>("broadcast_mode", false))
00017 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
00018 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
00019 , queue_size_(pset.get<size_t>("buffer_count"))
00020 , run_id_(0)
00021 , subrun_id_(0)
00022 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
00023 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
00024 , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
00025 , buffer_writes_pending_()
00026 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
00027 , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
00028 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
00029 , broadcast_count_(0)
00030 , subrun_event_count_(0)
00031 , art_processes_()
00032 , restart_art_(false)
00033 , current_art_pset_(art_pset)
00034 , requests_(pset)
00035 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
00036 pset.get<size_t>("broadcast_buffer_count", 10),
00037 pset.get<size_t>("broadcast_buffer_size", 0x100000),
00038 pset.get<int>("fragment_broadcast_timeout_ms", 3000) * 1000, false)
00039 {
00040 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00041 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
00042
00043 if (pset.get<bool>("use_art", true) == false) {
00044 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false" << TLOG_ENDL;
00045 num_art_processes_ = 0;
00046 }
00047 else {
00048 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true" << TLOG_ENDL;
00049 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string() << TLOG_ENDL;
00050 }
00051 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00052
00053 if (overwrite_mode_ && num_art_processes_ > 0)
00054 {
00055 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!" << TLOG_ENDL;
00056 }
00057 else if (overwrite_mode_)
00058 {
00059 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup" << TLOG_ENDL;
00060 }
00061
00062 for (size_t ii = 0; ii < size(); ++ii)
00063 {
00064 buffer_writes_pending_[ii] = 0;
00065 }
00066
00067 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
00068
00069 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank << TLOG_ENDL;
00070 SetRank(my_rank);
00071 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank() << TLOG_ENDL;
00072
00073
00074 TLOG(TLVL_TRACE) << "END CONSTRUCTOR" << TLOG_ENDL;
00075 }
00076
00077 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
00078 {
00079 TLOG(TLVL_TRACE) << "DESTRUCTOR" << TLOG_ENDL;
00080 endOfData();
00081 TLOG(TLVL_TRACE) << "Destructor END" << TLOG_ENDL;
00082 }
00083
00084 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
00085 {
00086 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
00087 << ", sequence_id=" << std::to_string(frag.sequence_id) << TLOG_ENDL;
00088 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00089 TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer) << TLOG_ENDL;
00090 if (buffer == -1) return false;
00091 if (buffer == -2)
00092 {
00093 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
00094 return true;
00095 }
00096
00097 auto hdr = getEventHeader_(buffer);
00098 if (update_run_ids_)
00099 {
00100 hdr->run_id = run_id_;
00101 hdr->subrun_id = subrun_id_;
00102 }
00103
00104 TLOG(TLVL_TRACE) << "AddFragment before Write calls" << TLOG_ENDL;
00105 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
00106
00107 TLOG(TLVL_TRACE) << "Checking for complete event" << TLOG_ENDL;
00108 auto fragmentCount = GetFragmentCount(frag.sequence_id);
00109 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
00110 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
00111 << ", fragmentCount=" << std::to_string(fragmentCount)
00112 << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
00113 << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]) << TLOG_ENDL;
00114
00115 complete_buffer_(buffer);
00116 requests_.SendRequest(true);
00117
00118 TLOG(TLVL_TRACE) << "AddFragment END" << TLOG_ENDL;
00119 return true;
00120 }
00121
00122 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
00123 {
00124 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN" << TLOG_ENDL;
00125 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
00126 auto data = frag->headerAddress();
00127 auto start = std::chrono::steady_clock::now();
00128 bool sts = false;
00129 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
00130 {
00131 sts = AddFragment(hdr, data);
00132 if (!sts) usleep(1000);
00133 }
00134 if (!sts)
00135 {
00136 outfrag = std::move(frag);
00137 }
00138 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts << TLOG_ENDL;
00139 return sts;
00140 }
00141
00142 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
00143 {
00144 TLOG(14) << "WriteFragmentHeader BEGIN" << TLOG_ENDL;
00145 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
00146
00147 if (buffer < 0)
00148 {
00149 if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
00150 if (buffer == -2)
00151 {
00152 TLOG(TLVL_ERROR) << "Dropping fragment because data taking has already passed this event number: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
00153 }
00154 else
00155 {
00156 TLOG(TLVL_ERROR) << "Dropping fragment because there is no room in the queue and reliable mode is off: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
00157 }
00158 dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
00159 return dropped_data_->dataBegin();
00160 }
00161
00162 buffer_writes_pending_[buffer]++;
00163 TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
00164 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
00165
00166 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
00167 if (frag.word_count - frag.num_words() > 0) {
00168 IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
00169 }
00170
00171 TLOG(14) << "WriteFragmentHeader END" << TLOG_ENDL;
00172 return pos;
00173
00174 }
00175
00176 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
00177 {
00178 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN" << TLOG_ENDL;
00179 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
00180 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
00181 if (buffer == -2) return;
00182 TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
00183
00184 auto hdr = getEventHeader_(buffer);
00185 if (update_run_ids_)
00186 {
00187 hdr->run_id = run_id_;
00188 hdr->subrun_id = subrun_id_;
00189 }
00190
00191 buffer_writes_pending_[buffer]--;
00192 if (buffer_writes_pending_[buffer] != 0)
00193 {
00194 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps." << TLOG_ENDL;
00195 return;
00196 }
00197 auto frag_count = GetFragmentCount(frag.sequence_id);
00198 hdr->is_complete = frag_count == num_fragments_per_event_;
00199 #if ART_SUPPORTS_DUPLICATE_EVENTS
00200 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id)) {
00201 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
00202 }
00203 #endif
00204
00205 complete_buffer_(buffer);
00206 requests_.SendRequest(true);
00207 TLOG(TLVL_TRACE) << "DoneWritingFragment END" << TLOG_ENDL;
00208 }
00209
00210 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
00211 {
00212 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
00213 }
00214
00215 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
00216 {
00217 if (buffer == -1) return 0;
00218 ResetReadPos(buffer);
00219 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00220
00221 size_t count = 0;
00222
00223 while (MoreDataInBuffer(buffer))
00224 {
00225 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
00226 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
00227 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
00228 TLOG_TRACE("GetFragmentCount") << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count" << TLOG_ENDL;
00229 ++count;
00230 }
00231
00232 return count;
00233 }
00234
00235 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
00236 {
00237 while (restart_art_)
00238 {
00239 send_init_frag_();
00240 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName() << TLOG_ENDL;
00241 std::vector<char*> args{ (char*)"art", (char*)"-c", &config_file->getFileName()[0], NULL };
00242
00243 auto pid = fork();
00244 if (pid == 0)
00245 {
00246 execvp("art", &args[0]);
00247 exit(1);
00248 }
00249 pid_out = pid;
00250
00251 TLOG(TLVL_INFO) << "PID of new art process is " << pid << TLOG_ENDL;
00252 art_processes_.insert(pid);
00253 int status;
00254 waitpid(pid, &status, 0);
00255 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list" << TLOG_ENDL;
00256 art_processes_.erase(pid);
00257 if (status == 0)
00258 {
00259 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting") << TLOG_ENDL;
00260 }
00261 else
00262 {
00263 TLOG(TLVL_WARNING) << "art process " << pid << " exited with status code 0x" << std::hex << status << " (" << std::dec << status << "), " << (restart_art_ ? "restarting" : "not restarting") << TLOG_ENDL;
00264 }
00265 }
00266 }
00267
00268 void artdaq::SharedMemoryEventManager::StartArt()
00269 {
00270 restart_art_ = true;
00271 if (num_art_processes_ == 0) return;
00272 for (size_t ii = 0; ii < num_art_processes_; ++ii)
00273 {
00274 StartArtProcess(current_art_pset_);
00275 }
00276 }
00277
00278 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
00279 {
00280 static std::mutex start_art_mutex;
00281 TraceLock lk(start_art_mutex, 15, "StartArtLock");
00282 restart_art_ = true;
00283 auto initialCount = GetAttachedCount();
00284 auto startTime = std::chrono::steady_clock::now();
00285
00286 if (pset != current_art_pset_)
00287 {
00288 current_art_pset_ = pset;
00289 current_art_config_file_ = std::make_shared<art_config_file>(pset);
00290 }
00291 pid_t pid = -1;
00292 boost::thread thread([&] {RunArt(current_art_config_file_, pid); });
00293 thread.detach();
00294
00295
00296 while ((GetAttachedCount() - initialCount < 1 || pid <= 0)
00297 && TimeUtils::GetElapsedTime(startTime) < 5)
00298 {
00299 usleep(1000);
00300 }
00301 if (GetAttachedCount() - initialCount < 1 || pid <= 0)
00302 {
00303 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
00304 << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")" << TLOG_ENDL;
00305 return 0;
00306 }
00307 else
00308 {
00309 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
00310 << TimeUtils::GetElapsedTime(startTime) << " seconds." << TLOG_ENDL;
00311
00312 return pid;
00313 }
00314
00315 }
00316
00317 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t> pids)
00318 {
00319 restart_art_ = false;
00320 current_art_config_file_ = nullptr;
00321 current_art_pset_ = fhicl::ParameterSet();
00322
00323 for (auto pid : pids)
00324 {
00325 if (kill(pid, 0) >= 0)
00326 {
00327 pids.erase(pid);
00328 }
00329 }
00330 if (pids.size() == 0)
00331 {
00332 TLOG(14) << "All art processes already exited, nothing to do." << TLOG_ENDL;
00333 usleep(1000);
00334 return;
00335 }
00336
00337 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down" << TLOG_ENDL;
00338 for (auto pid : pids)
00339 {
00340 kill(pid, SIGQUIT);
00341 }
00342
00343 int graceful_wait_ms = 1000;
00344 int int_wait_ms = 100;
00345
00346 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully" << TLOG_ENDL;
00347 for (int ii = 0; ii < graceful_wait_ms; ++ii)
00348 {
00349 usleep(1000);
00350
00351 for (auto pid : pids)
00352 {
00353 if (kill(pid, 0) < 0)
00354 {
00355 pids.erase(pid);
00356 }
00357 }
00358 if (pids.size() == 0)
00359 {
00360 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms." << TLOG_ENDL;
00361 return;
00362 }
00363 }
00364
00365 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down" << TLOG_ENDL;
00366 for (auto pid : pids)
00367 {
00368 kill(pid, SIGINT);
00369 }
00370
00371 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit" << TLOG_ENDL;
00372 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
00373 {
00374 usleep(1000);
00375
00376 for (auto pid : pids)
00377 {
00378 if (kill(pid, 0) < 0)
00379 {
00380 pids.erase(pid);
00381 }
00382 }
00383
00384 if (pids.size() == 0)
00385 {
00386 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms." << TLOG_ENDL;
00387 return;
00388 }
00389 }
00390
00391 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice" << TLOG_ENDL;
00392 while (pids.size() > 0)
00393 {
00394 kill(*pids.begin(), SIGKILL);
00395 }
00396 }
00397
00398 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
00399 {
00400 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN" << TLOG_ENDL;
00401 if (restart_art_)
00402 {
00403 endOfData();
00404 }
00405 for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
00406 {
00407 broadcasts_.MarkBufferEmpty(ii, true);
00408 }
00409 if (newRun == 0) newRun = run_id_ + 1;
00410 current_art_pset_ = art_pset;
00411 current_art_config_file_ = std::make_shared<art_config_file>(art_pset);
00412
00413 if (n_art_processes != -1)
00414 {
00415 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes << TLOG_ENDL;
00416 num_art_processes_ = n_art_processes;
00417 }
00418 startRun(newRun);
00419 TLOG(TLVL_DEBUG) << "ReconfigureArt END" << TLOG_ENDL;
00420 }
00421
00422 bool artdaq::SharedMemoryEventManager::endOfData()
00423 {
00424 init_fragment_.reset(nullptr);
00425 TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData" << TLOG_ENDL;
00426 restart_art_ = false;
00427
00428 size_t initialStoreSize = GetIncompleteEventCount();
00429 TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
00430 << " stale events from the SharedMemoryEventManager." << TLOG_ENDL;
00431 int counter = initialStoreSize;
00432 while (active_buffers_.size() > 0 && counter > 0)
00433 {
00434 complete_buffer_(*active_buffers_.begin());
00435 counter--;
00436 }
00437 TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
00438 << " stale events in the SharedMemoryEventManager." << TLOG_ENDL;
00439
00440
00441 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers..." << TLOG_ENDL;
00442 auto start = std::chrono::steady_clock::now();
00443 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00444
00445
00446 while (lastReadCount > 0 && TimeUtils::GetElapsedTime(start) < 1)
00447 {
00448 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
00449 if (temp != lastReadCount)
00450 {
00451 TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers..." << TLOG_ENDL;
00452 lastReadCount = temp;
00453 start = std::chrono::steady_clock::now();
00454 }
00455 if (lastReadCount > 0) usleep(1000);
00456 }
00457
00458 TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment" << TLOG_ENDL;
00459 FragmentPtr outFrag = std::move(Fragment::eodFrag(GetBufferCount()));
00460 bool success = broadcastFragment_(std::move(outFrag), outFrag);
00461 if (!success)
00462 {
00463 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment" << TLOG_ENDL;
00464 for (size_t ii = 0; ii < size(); ++ii)
00465 {
00466 broadcasts_.MarkBufferEmpty(ii, true);
00467 }
00468 broadcastFragment_(std::move(outFrag), outFrag);
00469 }
00470
00471 TLOG(TLVL_DEBUG) << "Waiting for all art processes to exit, there are " << std::to_string(art_processes_.size()) << " remaining." << TLOG_ENDL;
00472 while (art_processes_.size() > 0)
00473 {
00474 ShutdownArtProcesses(art_processes_);
00475 }
00476 ResetAttachedCount();
00477
00478 TLOG(TLVL_TRACE) << "endOfData: Clearing buffers" << TLOG_ENDL;
00479 for (size_t ii = 0; ii < size(); ++ii)
00480 {
00481 MarkBufferEmpty(ii, true);
00482 }
00483 released_incomplete_events_.clear();
00484
00485 TLOG(TLVL_TRACE) << "endOfData END" << TLOG_ENDL;
00486 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " events processed in this run." << TLOG_ENDL;
00487 return true;
00488 }
00489
00490 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
00491 {
00492 init_fragment_.reset(nullptr);
00493 StartArt();
00494 run_id_ = runID;
00495 subrun_id_ = 1;
00496 requests_.SendRoutingToken(queue_size_);
00497 TLOG(TLVL_DEBUG) << "Starting run " << run_id_
00498 << ", max queue size = "
00499 << queue_size_
00500 << ", queue size = "
00501 << GetLockedBufferCount() << TLOG_ENDL;
00502 if (metricMan)
00503 {
00504 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00505 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00506 }
00507 }
00508
00509 void artdaq::SharedMemoryEventManager::startSubrun()
00510 {
00511 ++subrun_id_;
00512 if (metricMan)
00513 {
00514 double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
00515 metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
00516 }
00517 }
00518
00519 bool artdaq::SharedMemoryEventManager::endRun()
00520 {
00521 FragmentPtr endOfRunFrag(new
00522 Fragment(static_cast<size_t>
00523 (ceil(sizeof(my_rank) /
00524 static_cast<double>(sizeof(Fragment::value_type))))));
00525
00526 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
00527 *endOfRunFrag->dataBegin() = my_rank;
00528 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
00529
00530 return true;
00531 }
00532
00533 bool artdaq::SharedMemoryEventManager::endSubrun()
00534 {
00535 std::unique_ptr<artdaq::Fragment>
00536 endOfSubrunFrag(new
00537 Fragment(static_cast<size_t>
00538 (ceil(sizeof(my_rank) /
00539 static_cast<double>(sizeof(Fragment::value_type))))));
00540
00541 endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
00542 *endOfSubrunFrag->dataBegin() = my_rank;
00543
00544 broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
00545
00546 TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun." << TLOG_ENDL;
00547 subrun_event_count_ = 0;
00548
00549 return true;
00550 }
00551
00552 void artdaq::SharedMemoryEventManager::sendMetrics()
00553 {
00554 if (metricMan)
00555 {
00556 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
00557 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
00558 }
00559
00560 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
00561 {
00562 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
00563 return;
00564
00565 last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
00566 std::ostringstream oss;
00567 oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
00568 for (auto& ev : active_buffers_)
00569 {
00570 auto hdr = getEventHeader_(ev);
00571 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
00572 }
00573 TLOG(TLVL_DEBUG) << oss.str() << TLOG_ENDL;
00574 }
00575 }
00576
00577 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
00578 {
00579 auto buffer = broadcasts_.GetBufferForWriting(false);
00580 auto start_time = std::chrono::steady_clock::now();
00581 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
00582 {
00583 usleep(10000);
00584 buffer = broadcasts_.GetBufferForWriting(false);
00585 }
00586 if (buffer == -1)
00587 {
00588 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!" << TLOG_ENDL;
00589 outFrag.swap(frag);
00590 return false;
00591 }
00592
00593 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
00594 hdr->run_id = run_id_;
00595 hdr->subrun_id = subrun_id_;
00596 hdr->sequence_id = frag->sequenceID();
00597 hdr->is_complete = true;
00598 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
00599
00600 TLOG(TLVL_TRACE) << "broadcastFragment_ before Write calls" << TLOG_ENDL;
00601 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
00602
00603 broadcasts_.MarkBufferFull(buffer, -1);
00604 outFrag.swap(frag);
00605 return true;
00606 }
00607
00608 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
00609 {
00610 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
00611 }
00612
00613 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
00614 {
00615 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00616 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN" << TLOG_ENDL;
00617 auto buffers = GetBuffersOwnedByManager();
00618 for (auto& buf : buffers)
00619 {
00620 auto hdr = getEventHeader_(buf);
00621 if (hdr->sequence_id == seqID)
00622 {
00623 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf << TLOG_ENDL;
00624 return buf;
00625 }
00626 }
00627
00628 #if !ART_SUPPORTS_DUPLICATE_EVENTS
00629 if (released_incomplete_events_.count(seqID)) {
00630 TLOG(TLVL_ERROR) << "Buffer has already been marked \"Incomplete\" and sent to art!" << TLOG_ENDL;
00631 return -2;
00632 }
00633 #endif
00634
00635 if (!create_new) return -1;
00636
00637 check_pending_buffers_(lk);
00638 int new_buffer = GetBufferForWriting(false);
00639
00640 if (new_buffer == -1)
00641 {
00642 new_buffer = GetBufferForWriting(overwrite_mode_);
00643 }
00644
00645 if (new_buffer == -1) return -1;
00646 TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
00647 auto hdr = getEventHeader_(new_buffer);
00648 hdr->is_complete = false;
00649 hdr->run_id = run_id_;
00650 hdr->subrun_id = subrun_id_;
00651 hdr->sequence_id = seqID;
00652 buffer_writes_pending_[new_buffer] = 0;
00653 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
00654
00655 active_buffers_.insert(new_buffer);
00656
00657 if (timestamp != Fragment::InvalidTimestamp)
00658 {
00659 requests_.AddRequest(seqID, timestamp);
00660 }
00661 requests_.SendRequest();
00662 TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer << TLOG_ENDL;
00663 return new_buffer;
00664 }
00665
00666 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
00667 {
00668 if (buffer == -1) return true;
00669 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
00670 {
00671 return true;
00672 }
00673 ResetReadPos(buffer);
00674 IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
00675 return MoreDataInBuffer(buffer);
00676 }
00677
00678 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
00679 {
00680 auto hdr = getEventHeader_(buffer);
00681 if (hdr->is_complete)
00682 {
00683 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << "." << TLOG_ENDL;
00684
00685 requests_.RemoveRequest(hdr->sequence_id);
00686 requests_.SendRoutingToken(1);
00687 {
00688 std::unique_lock<std::mutex> lk(sequence_id_mutex_);
00689 active_buffers_.erase(buffer);
00690 pending_buffers_.insert(buffer);
00691 }
00692 }
00693 check_pending_buffers_();
00694 }
00695
00696 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
00697 {
00698 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
00699 }
00700
00701 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
00702 {
00703 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock() << TLOG_ENDL;
00704
00705 auto buffers = GetBuffersOwnedByManager();
00706 for (auto buf : buffers)
00707 {
00708 if (ResetBuffer(buf) && !pending_buffers_.count(buf))
00709 {
00710 auto hdr = getEventHeader_(buf);
00711 if (active_buffers_.count(buf))
00712 {
00713 TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event to art." << TLOG_ENDL;
00714 requests_.RemoveRequest(hdr->sequence_id);
00715 requests_.SendRoutingToken(1);
00716 active_buffers_.erase(buf);
00717 pending_buffers_.insert(buf);
00718 if (!released_incomplete_events_.count(hdr->sequence_id)) {
00719 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
00720 }
00721 else {
00722 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
00723 }
00724 }
00725
00726 }
00727 }
00728
00729 Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
00730
00731
00732 if (WriteReadyCount(false) != 0)
00733 {
00734 for (auto buf : active_buffers_)
00735 {
00736 auto hdr = getEventHeader_(buf);
00737 TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE" << TLOG_ENDL;
00738 if (hdr->sequence_id < lowestSeqId)
00739 {
00740 lowestSeqId = hdr->sequence_id;
00741 }
00742 }
00743 TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId) << TLOG_ENDL;
00744 }
00745
00746 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
00747 sorted_buffers.sort([this](int a, int b) {return bufferComparator(a, b); });
00748 for (auto buf : sorted_buffers)
00749 {
00750 auto hdr = getEventHeader_(buf);
00751 if (hdr->sequence_id > lowestSeqId) break;
00752 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art." << TLOG_ENDL;
00753 MarkBufferFull(buf);
00754 subrun_event_count_++;
00755 pending_buffers_.erase(buf);
00756 }
00757
00758 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics" << TLOG_ENDL;
00759 if (metricMan)
00760 {
00761 auto full = ReadReadyCount();
00762 auto empty = WriteReadyCount(overwrite_mode_);
00763 auto total = size();
00764 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
00765 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
00766 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00767 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
00768 }
00769 TLOG(TLVL_TRACE) << "check_pending_buffers_ END" << TLOG_ENDL;
00770 }
00771
00772 void artdaq::SharedMemoryEventManager::send_init_frag_()
00773 {
00774 if (init_fragment_ != nullptr)
00775 {
00776 TLOG(TLVL_TRACE) << "Sending init Fragment to art..." << TLOG_ENDL;
00777
00778 #if 0
00779 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
00780 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
00781 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
00782 ostream.close();
00783 #endif
00784
00785 broadcastFragment_(std::move(init_fragment_), init_fragment_);
00786 TLOG(TLVL_TRACE) << "Init Fragment sent" << TLOG_ENDL;
00787 }
00788 else if (send_init_fragments_)
00789 {
00790 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!" << TLOG_ENDL;
00791 }
00792 }
00793
00794 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
00795 {
00796 if (!init_fragment_ || init_fragment_ == nullptr)
00797 {
00798 init_fragment_.swap(frag);
00799 send_init_frag_();
00800 }
00801 }