$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_00
$projectbrief
|
$projectbrief
|
$searchbox |
00001 00002 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str() 00003 00004 #include "artdaq/DAQrate/SharedMemoryEventManager.hh" 00005 #include <sys/wait.h> 00006 #include "artdaq-core/Core/StatisticsCollection.hh" 00007 #include "artdaq-core/Utilities/TraceLock.hh" 00008 00009 #define TLVL_BUFFER 40 00010 #define TLVL_BUFLCK 41 00011 00012 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_; 00013 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_; 00014 00015 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset) 00016 : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()), 00017 pset.get<size_t>("buffer_count"), 00018 pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"), 00019 pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000), 00020 !pset.get<bool>("broadcast_mode", false)) 00021 , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1)) 00022 , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event")) 00023 , queue_size_(pset.get<size_t>("buffer_count")) 00024 , run_id_(0) 00025 , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100)) 00026 , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true)) 00027 , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true)) 00028 , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false)) 00029 , send_init_fragments_(pset.get<bool>("send_init_fragments", true)) 00030 , running_(false) 00031 , buffer_writes_pending_() 00032 , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1)) 00033 , last_incomplete_event_report_time_(std::chrono::steady_clock::now()) 00034 , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now()) 00035 , last_backpressure_report_time_(std::chrono::steady_clock::now()) 00036 , last_fragment_header_write_time_(std::chrono::steady_clock::now()) 00037 , metric_data_() 00038 , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000)) 00039 , run_event_count_(0) 00040 , run_incomplete_event_count_(0) 00041 , subrun_event_count_(0) 00042 , subrun_incomplete_event_count_(0) 00043 , oversize_fragment_count_(0) 00044 , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1)) 00045 , art_processes_() 00046 , restart_art_(false) 00047 , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true)) 00048 , manual_art_(pset.get<bool>("manual_art", false)) 00049 , current_art_pset_(art_pset) 00050 , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0)) 00051 , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000)) 00052 , requests_(nullptr) 00053 , data_pset_(pset) 00054 , dropped_data_() 00055 , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()), 00056 pset.get<size_t>("broadcast_buffer_count", 10), 00057 pset.get<size_t>("broadcast_buffer_size", 0x100000), 00058 pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false) 00059 { 00060 subrun_event_map_[0] = 1; 00061 SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader)); 00062 broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader)); 00063 00064 if (pset.get<bool>("use_art", true) == false) 00065 { 00066 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false"; 00067 num_art_processes_ = 0; 00068 } 00069 else 00070 { 00071 TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true"; 00072 TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string(); 00073 } 00074 current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/); 00075 00076 if (overwrite_mode_ && num_art_processes_ > 0) 00077 { 00078 TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!"; 00079 } 00080 else if (overwrite_mode_) 00081 { 00082 TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup"; 00083 } 00084 00085 for (size_t ii = 0; ii < size(); ++ii) 00086 { 00087 buffer_writes_pending_[ii] = 0; 00088 } 00089 00090 if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!"; 00091 00092 TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank; 00093 SetRank(my_rank); 00094 TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank(); 00095 00096 TLOG(TLVL_TRACE) << "END CONSTRUCTOR"; 00097 } 00098 00099 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager() 00100 { 00101 TLOG(TLVL_TRACE) << "DESTRUCTOR"; 00102 if (running_) endOfData(); 00103 TLOG(TLVL_TRACE) << "Destructor END"; 00104 } 00105 00106 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr) 00107 { 00108 TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count 00109 << ", sequence_id=" << frag.sequence_id; 00110 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp); 00111 TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id; 00112 if (buffer == -1) return false; 00113 if (buffer == -2) 00114 { 00115 TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id; 00116 return true; 00117 } 00118 00119 auto hdr = getEventHeader_(buffer); 00120 if (update_run_ids_) 00121 { 00122 hdr->run_id = run_id_; 00123 } 00124 hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id); 00125 00126 TLOG(TLVL_TRACE) << "AddFragment before Write calls"; 00127 Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType)); 00128 00129 TLOG(TLVL_TRACE) << "Checking for complete event"; 00130 auto fragmentCount = GetFragmentCount(frag.sequence_id); 00131 hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0; 00132 TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete 00133 << ", fragmentCount=" << fragmentCount 00134 << ", num_fragments_per_event=" << num_fragments_per_event_ 00135 << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer]; 00136 00137 complete_buffer_(buffer); 00138 if (requests_) requests_->SendRequest(true); 00139 00140 TLOG(TLVL_TRACE) << "AddFragment END"; 00141 return true; 00142 } 00143 00144 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag) 00145 { 00146 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN"; 00147 auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress()); 00148 auto data = frag->headerAddress(); 00149 auto start = std::chrono::steady_clock::now(); 00150 bool sts = false; 00151 while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec) 00152 { 00153 sts = AddFragment(hdr, data); 00154 if (!sts) usleep(1000); 00155 } 00156 if (!sts) 00157 { 00158 outfrag = std::move(frag); 00159 } 00160 TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts; 00161 return sts; 00162 } 00163 00164 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable) 00165 { 00166 TLOG(14) << "WriteFragmentHeader BEGIN"; 00167 auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp); 00168 00169 if (buffer < 0) 00170 { 00171 if (buffer == -1 && !dropIfNoBuffersAvailable) 00172 { 00173 std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_); 00174 if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0) 00175 { 00176 TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!"; 00177 last_backpressure_report_time_ = std::chrono::steady_clock::now(); 00178 } 00179 return nullptr; 00180 } 00181 if (buffer == -2) 00182 { 00183 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event."; 00184 } 00185 else 00186 { 00187 TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off."; 00188 } 00189 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words())); 00190 00191 TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes(); 00192 return dropped_data_[frag.fragment_id]->dataBegin(); 00193 } 00194 00195 last_backpressure_report_time_ = std::chrono::steady_clock::now(); 00196 last_fragment_header_write_time_ = std::chrono::steady_clock::now(); 00197 // Increment this as soon as we know we want to use the buffer 00198 buffer_writes_pending_[buffer]++; 00199 00200 if (metricMan) 00201 { 00202 metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate); 00203 } 00204 00205 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer; 00206 00207 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]); 00208 00209 TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer; 00210 00211 //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader"); 00212 auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); 00213 Write(buffer, &frag, frag.num_words() * sizeof(RawDataType)); 00214 00215 auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); 00216 if (frag.word_count - frag.num_words() > 0) 00217 { 00218 auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType)); 00219 00220 if (!sts) 00221 { 00222 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words(); 00223 reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType; 00224 TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)"; 00225 dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words())); 00226 00227 oversize_fragment_count_++; 00228 00229 if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_) 00230 { 00231 throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!"); 00232 } 00233 00234 TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin(); 00235 return dropped_data_[frag.fragment_id]->dataBegin(); 00236 } 00237 } 00238 TLOG(14) << "WriteFragmentHeader END"; 00239 return pos; 00240 } 00241 00242 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag) 00243 { 00244 TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN"; 00245 auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp); 00246 if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!"); 00247 if (buffer == -2) { return; } 00248 00249 { 00250 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer; 00251 00252 std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]); 00253 00254 TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer; 00255 00256 //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment"); 00257 00258 TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")"; 00259 auto hdr = getEventHeader_(buffer); 00260 if (update_run_ids_) 00261 { 00262 hdr->run_id = run_id_; 00263 } 00264 hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id); 00265 00266 TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time"; 00267 TouchBuffer(buffer); 00268 00269 buffer_writes_pending_[buffer]--; 00270 if (buffer_writes_pending_[buffer] != 0) 00271 { 00272 TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps."; 00273 return; 00274 } 00275 TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps."; 00276 auto frag_count = GetFragmentCount(frag.sequence_id); 00277 hdr->is_complete = frag_count == num_fragments_per_event_; 00278 TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_; 00279 #if ART_SUPPORTS_DUPLICATE_EVENTS 00280 if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id)) 00281 { 00282 hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0; 00283 } 00284 #endif 00285 } 00286 00287 complete_buffer_(buffer); 00288 if (requests_) requests_->SendRequest(true); 00289 TLOG(TLVL_TRACE) << "DoneWritingFragment END"; 00290 } 00291 00292 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type) 00293 { 00294 return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type); 00295 } 00296 00297 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type) 00298 { 00299 if (buffer == -1) return 0; 00300 ResetReadPos(buffer); 00301 IncrementReadPos(buffer, sizeof(detail::RawEventHeader)); 00302 00303 size_t count = 0; 00304 00305 while (MoreDataInBuffer(buffer)) 00306 { 00307 auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer)); 00308 IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType)); 00309 if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue; 00310 TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count"; 00311 ++count; 00312 } 00313 00314 return count; 00315 } 00316 00317 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out) 00318 { 00319 do 00320 { 00321 auto start_time = std::chrono::steady_clock::now(); 00322 send_init_frag_(); 00323 TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName(); 00324 00325 pid_t pid = 0; 00326 00327 if (!manual_art_) 00328 { 00329 char* filename = new char[config_file->getFileName().length() + 1]; 00330 strcpy(filename, config_file->getFileName().c_str()); 00331 00332 std::vector<char*> args{(char*)"art", (char*)"-c", filename, NULL}; 00333 pid = fork(); 00334 if (pid == 0) 00335 { /* child */ 00336 // 23-May-2018, KAB: added the setting of the partition number env var 00337 // in the environment of the child art process so that Globals.hh 00338 // will pick it up there and provide it to the artdaq classes that 00339 // are used in data transfers, etc. within the art process. 00340 std::string envVarKey = "ARTDAQ_PARTITION_NUMBER"; 00341 std::string envVarValue = std::to_string(GetPartitionNumber()); 00342 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0) 00343 { 00344 TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey 00345 << "\" in the environment of a child art process. " 00346 << "This may result in incorrect TCP port number " 00347 << "assignments or other issues, and data may " 00348 << "not flow through the system correctly."; 00349 } 00350 envVarKey = "ARTDAQ_APPLICATION_NAME"; 00351 envVarValue = app_name; 00352 if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0) 00353 { 00354 TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey 00355 << "\" in the environment of a child art process. "; 00356 } 00357 00358 execvp("art", &args[0]); 00359 delete[] filename; 00360 exit(1); 00361 } 00362 delete[] filename; 00363 } 00364 else 00365 { 00366 //Using cin/cout here to ensure console is active (artdaqDriver) 00367 std::cout << "Please run the following command in a separate terminal:" << std::endl 00368 << "art -c " << config_file->getFileName() << std::endl 00369 << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl 00370 << "Finally, return to this window and enter the pid: " << std::endl; 00371 std::cin >> pid; 00372 } 00373 *pid_out = pid; 00374 00375 TLOG(TLVL_INFO) << "PID of new art process is " << pid; 00376 { 00377 std::unique_lock<std::mutex> lk(art_process_mutex_); 00378 art_processes_.insert(pid); 00379 } 00380 siginfo_t status; 00381 auto sts = waitid(P_PID, pid, &status, WEXITED); 00382 TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list"; 00383 { 00384 std::unique_lock<std::mutex> lk(art_process_mutex_); 00385 art_processes_.erase(pid); 00386 } 00387 if (sts < 0) 00388 { 00389 TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ")."; 00390 } 00391 else if (status.si_code == CLD_EXITED && status.si_status == 0) 00392 { 00393 TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting"); 00394 } 00395 else 00396 { 00397 auto art_lifetime = TimeUtils::GetElapsedTime(start_time); 00398 if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false; 00399 00400 auto exit_type = "exited with status code"; 00401 switch (status.si_code) 00402 { 00403 case CLD_DUMPED: 00404 case CLD_KILLED: 00405 exit_type = "was killed with signal"; 00406 break; 00407 case CLD_EXITED: 00408 default: 00409 break; 00410 } 00411 00412 TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR)) 00413 << "art process " << pid << " " << exit_type << " " << status.si_status 00414 << (status.si_code == CLD_DUMPED ? " (core dumped)" : "") 00415 << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, " 00416 << (restart_art_ ? "restarting" : "not restarting"); 00417 } 00418 } while (restart_art_); 00419 } 00420 00421 void artdaq::SharedMemoryEventManager::StartArt() 00422 { 00423 restart_art_ = always_restart_art_; 00424 if (num_art_processes_ == 0) return; 00425 for (size_t ii = 0; ii < num_art_processes_; ++ii) 00426 { 00427 StartArtProcess(current_art_pset_); 00428 } 00429 } 00430 00431 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset) 00432 { 00433 static std::mutex start_art_mutex; 00434 std::unique_lock<std::mutex> lk(start_art_mutex); 00435 //TraceLock lk(start_art_mutex, 15, "StartArtLock"); 00436 restart_art_ = always_restart_art_; 00437 auto initialCount = GetAttachedCount(); 00438 auto startTime = std::chrono::steady_clock::now(); 00439 00440 if (pset != current_art_pset_ || !current_art_config_file_) 00441 { 00442 current_art_pset_ = pset; 00443 current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/); 00444 } 00445 std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1)); 00446 boost::thread thread([&] { RunArt(current_art_config_file_, pid); }); 00447 thread.detach(); 00448 00449 auto currentCount = GetAttachedCount() - initialCount; 00450 while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_)) 00451 { 00452 usleep(10000); 00453 currentCount = GetAttachedCount() - initialCount; 00454 } 00455 if ((currentCount < 1 || *pid <= 0) && manual_art_) 00456 { 00457 TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid; 00458 return 0; 00459 } 00460 else if (currentCount < 1 || *pid <= 0) 00461 { 00462 TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!" 00463 << " (pid=" << *pid << ", attachedCount=" << currentCount << ")"; 00464 return 0; 00465 } 00466 else 00467 { 00468 TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took " 00469 << TimeUtils::GetElapsedTime(startTime) << " seconds."; 00470 00471 return *pid; 00472 } 00473 } 00474 00475 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids) 00476 { 00477 restart_art_ = false; 00478 //current_art_config_file_ = nullptr; 00479 //current_art_pset_ = fhicl::ParameterSet(); 00480 00481 auto check_pids = [&](bool print) { 00482 std::unique_lock<std::mutex> lk(art_process_mutex_); 00483 for (auto pid = pids.begin(); pid != pids.end();) 00484 { 00485 // 08-May-2018, KAB: protect against killing invalid PIDS 00486 00487 if (*pid <= 0) 00488 { 00489 TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid 00490 << ") from the shutdown list."; 00491 pid = pids.erase(pid); 00492 } 00493 else if (kill(*pid, 0) < 0) 00494 { 00495 pid = pids.erase(pid); 00496 } 00497 else 00498 { 00499 if (print) std::cout << *pid << " "; 00500 ++pid; 00501 } 00502 } 00503 }; 00504 auto count_pids = [&]() { 00505 std::unique_lock<std::mutex> lk(art_process_mutex_); 00506 return pids.size(); 00507 }; 00508 check_pids(false); 00509 if (count_pids() == 0) 00510 { 00511 TLOG(14) << "All art processes already exited, nothing to do."; 00512 usleep(1000); 00513 return; 00514 } 00515 00516 if (!manual_art_) 00517 { 00518 { 00519 TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down"; 00520 std::unique_lock<std::mutex> lk(art_process_mutex_); 00521 for (auto pid : pids) 00522 { 00523 TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid; 00524 kill(pid, SIGQUIT); 00525 } 00526 } 00527 00528 int graceful_wait_ms = 5000; 00529 int int_wait_ms = 1000; 00530 00531 TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully"; 00532 for (int ii = 0; ii < graceful_wait_ms; ++ii) 00533 { 00534 usleep(1000); 00535 00536 check_pids(false); 00537 if (count_pids() == 0) 00538 { 00539 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms."; 00540 return; 00541 } 00542 } 00543 00544 { 00545 TLOG(TLVL_TRACE) << "Insisting that the art processes shut down"; 00546 std::unique_lock<std::mutex> lk(art_process_mutex_); 00547 for (auto pid : pids) 00548 { 00549 kill(pid, SIGINT); 00550 } 00551 } 00552 00553 TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit"; 00554 for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii) 00555 { 00556 usleep(1000); 00557 00558 check_pids(false); 00559 00560 if (count_pids() == 0) 00561 { 00562 TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms."; 00563 return; 00564 } 00565 } 00566 00567 TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice"; 00568 while (count_pids() > 0) 00569 { 00570 { 00571 std::unique_lock<std::mutex> lk(art_process_mutex_); 00572 kill(*pids.begin(), SIGKILL); 00573 usleep(1000); 00574 } 00575 check_pids(false); 00576 } 00577 } 00578 else 00579 { 00580 std::cout << "Please shut down all art processes, then hit return/enter" << std::endl; 00581 while (count_pids() > 0) 00582 { 00583 std::cout << "The following PIDs are running: "; 00584 check_pids(true); 00585 std::cout << std::endl; 00586 std::string ignored; 00587 std::cin >> ignored; 00588 } 00589 } 00590 } 00591 00592 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes) 00593 { 00594 TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN"; 00595 if (restart_art_ || !always_restart_art_) // Art is running 00596 { 00597 endOfData(); 00598 } 00599 for (size_t ii = 0; ii < broadcasts_.size(); ++ii) 00600 { 00601 broadcasts_.MarkBufferEmpty(ii, true); 00602 } 00603 if (newRun == 0) newRun = run_id_ + 1; 00604 00605 if (art_pset != current_art_pset_ || !current_art_config_file_) 00606 { 00607 current_art_pset_ = art_pset; 00608 current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/); 00609 } 00610 00611 if (n_art_processes != -1) 00612 { 00613 TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes; 00614 num_art_processes_ = n_art_processes; 00615 } 00616 startRun(newRun); 00617 TLOG(TLVL_DEBUG) << "ReconfigureArt END"; 00618 } 00619 00620 bool artdaq::SharedMemoryEventManager::endOfData() 00621 { 00622 running_ = false; 00623 init_fragment_.reset(nullptr); 00624 TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData"; 00625 restart_art_ = false; 00626 00627 size_t initialStoreSize = GetIncompleteEventCount(); 00628 TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize 00629 << " stale events from the SharedMemoryEventManager."; 00630 int counter = initialStoreSize; 00631 while (active_buffers_.size() > 0 && counter > 0) 00632 { 00633 complete_buffer_(*active_buffers_.begin()); 00634 counter--; 00635 } 00636 TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount() 00637 << " stale events in the SharedMemoryEventManager."; 00638 00639 TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers..."; 00640 auto start = std::chrono::steady_clock::now(); 00641 auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_)); 00642 auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size(); 00643 00644 auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_; 00645 00646 // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left. 00647 while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0) 00648 { 00649 auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_)); 00650 if (temp != lastReadCount) 00651 { 00652 TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers..."; 00653 lastReadCount = temp; 00654 start = std::chrono::steady_clock::now(); 00655 } 00656 if (lastReadCount > 0) 00657 { 00658 TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us); 00659 usleep(outstanding_buffer_wait_time); 00660 } 00661 } 00662 00663 TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: " 00664 << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_(); 00665 00666 TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment"; 00667 FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount()); 00668 bool success = broadcastFragment_(std::move(outFrag), outFrag); 00669 if (!success) 00670 { 00671 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment"; 00672 for (size_t ii = 0; ii < broadcasts_.size(); ++ii) 00673 { 00674 broadcasts_.MarkBufferEmpty(ii, true); 00675 } 00676 broadcastFragment_(std::move(outFrag), outFrag); 00677 } 00678 auto endOfDataProcessingStart = std::chrono::steady_clock::now(); 00679 00680 if (get_art_process_count_() > 0) 00681 { 00682 TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully"; 00683 if (end_of_data_wait_us == 0) 00684 { 00685 TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully."; 00686 end_of_data_wait_us = 100 * 1000000; 00687 } 00688 00689 auto sleep_count = (end_of_data_wait_us / 10000) + 1; 00690 for (size_t ii = 0; ii < sleep_count; ++ii) 00691 { 00692 usleep(10000); 00693 if (get_art_process_count_() == 0) break; 00694 } 00695 } 00696 00697 while (get_art_process_count_() > 0) 00698 { 00699 TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown."; 00700 00701 ShutdownArtProcesses(art_processes_); 00702 } 00703 TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment"; 00704 00705 ResetAttachedCount(); 00706 00707 TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers"; 00708 for (size_t ii = 0; ii < size(); ++ii) 00709 { 00710 MarkBufferEmpty(ii, true); 00711 } 00712 // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again... 00713 // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers"; 00714 // for (size_t ii = 0; ii < broadcasts_.size(); ++ii) 00715 // { 00716 // broadcasts_.MarkBufferEmpty(ii, true); 00717 // } 00718 released_incomplete_events_.clear(); 00719 00720 TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender"; 00721 requests_.reset(nullptr); 00722 00723 TLOG(TLVL_DEBUG) << "endOfData END"; 00724 TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed."; 00725 return true; 00726 } 00727 00728 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID) 00729 { 00730 running_ = true; 00731 init_fragment_.reset(nullptr); 00732 TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers"; 00733 for (size_t ii = 0; ii < broadcasts_.size(); ++ii) 00734 { 00735 broadcasts_.MarkBufferEmpty(ii, true); 00736 } 00737 StartArt(); 00738 run_id_ = runID; 00739 { 00740 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_); 00741 subrun_event_map_.clear(); 00742 subrun_event_map_[0] = 1; 00743 } 00744 run_event_count_ = 0; 00745 run_incomplete_event_count_ = 0; 00746 requests_.reset(new RequestSender(data_pset_)); 00747 if (requests_) 00748 { 00749 requests_->SetRunNumber(static_cast<uint32_t>(run_id_)); 00750 requests_->SendRoutingToken(queue_size_, run_id_); 00751 } 00752 TLOG(TLVL_DEBUG) << "Starting run " << run_id_ 00753 << ", max queue size = " 00754 << queue_size_ 00755 << ", queue size = " 00756 << GetLockedBufferCount(); 00757 if (metricMan) 00758 { 00759 metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint); 00760 } 00761 } 00762 00763 bool artdaq::SharedMemoryEventManager::endRun() 00764 { 00765 TLOG(TLVL_INFO) << "Ending run " << run_id_; 00766 FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) / 00767 static_cast<double>(sizeof(Fragment::value_type)))))); 00768 00769 TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment"; 00770 endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType); 00771 *endOfRunFrag->dataBegin() = my_rank; 00772 broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag); 00773 00774 TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run."; 00775 run_event_count_ = 0; 00776 run_incomplete_event_count_ = 0; 00777 oversize_fragment_count_ = 0; 00778 { 00779 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_); 00780 subrun_event_map_.clear(); 00781 subrun_event_map_[0] = 1; 00782 } 00783 return true; 00784 } 00785 00786 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary, subrun_id_t subrun) 00787 { 00788 // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored 00789 if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return; 00790 00791 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_); 00792 00793 TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary; 00794 subrun_event_map_[boundary] = subrun; 00795 while (subrun_event_map_.size() > max_subrun_event_map_length_) 00796 { 00797 subrun_event_map_.erase(subrun_event_map_.begin()); 00798 } 00799 } 00800 00801 void artdaq::SharedMemoryEventManager::rolloverSubrun() 00802 { 00803 Fragment::sequence_id_t seqID = 0; 00804 subrun_id_t subrun = 0; 00805 { 00806 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_); 00807 for (auto& it : subrun_event_map_) 00808 { 00809 if (it.first >= seqID) seqID = it.first + 1; 00810 if (it.second >= subrun) subrun = it.second + 1; 00811 } 00812 } 00813 rolloverSubrun(seqID, subrun); 00814 } 00815 00816 void artdaq::SharedMemoryEventManager::sendMetrics() 00817 { 00818 if (metricMan) 00819 { 00820 metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint); 00821 metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint); 00822 } 00823 00824 if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount()) 00825 { 00826 if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_)) 00827 return; 00828 00829 last_incomplete_event_report_time_ = std::chrono::steady_clock::now(); 00830 std::ostringstream oss; 00831 oss << "Incomplete Events (" << num_fragments_per_event_ << "): "; 00832 for (auto& ev : active_buffers_) 00833 { 00834 auto hdr = getEventHeader_(ev); 00835 oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), "; 00836 } 00837 TLOG(TLVL_DEBUG) << oss.str(); 00838 } 00839 } 00840 00841 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag) 00842 { 00843 TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B."; 00844 auto buffer = broadcasts_.GetBufferForWriting(false); 00845 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer; 00846 auto start_time = std::chrono::steady_clock::now(); 00847 while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_)) 00848 { 00849 usleep(10000); 00850 buffer = broadcasts_.GetBufferForWriting(false); 00851 } 00852 TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s."; 00853 if (buffer == -1) 00854 { 00855 TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!"; 00856 outFrag.swap(frag); 00857 return false; 00858 } 00859 00860 TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader"; 00861 auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer)); 00862 hdr->run_id = run_id_; 00863 hdr->subrun_id = GetSubrunForSequenceID(frag->sequenceID()); 00864 hdr->sequence_id = frag->sequenceID(); 00865 hdr->is_complete = true; 00866 broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader)); 00867 00868 TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls"; 00869 broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType)); 00870 00871 TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full"; 00872 broadcasts_.MarkBufferFull(buffer, -1); 00873 outFrag.swap(frag); 00874 TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete"; 00875 return true; 00876 } 00877 00878 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer) 00879 { 00880 return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer)); 00881 } 00882 00883 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID) 00884 { 00885 std::unique_lock<std::mutex> lk(subrun_event_map_mutex_); 00886 00887 TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size(); 00888 auto it = subrun_event_map_.begin(); 00889 subrun_id_t subrun = 1; 00890 00891 while (it->first <= seqID && it != subrun_event_map_.end()) 00892 { 00893 TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")"; 00894 subrun = it->second; 00895 ++it; 00896 } 00897 00898 TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID; 00899 return subrun; 00900 } 00901 00902 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp) 00903 { 00904 TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN"; 00905 std::unique_lock<std::mutex> lk(sequence_id_mutex_); 00906 00907 TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID; 00908 00909 auto buffers = GetBuffersOwnedByManager(); 00910 for (auto& buf : buffers) 00911 { 00912 auto hdr = getEventHeader_(buf); 00913 if (hdr->sequence_id == seqID) 00914 { 00915 TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf; 00916 return buf; 00917 } 00918 } 00919 00920 #if !ART_SUPPORTS_DUPLICATE_EVENTS 00921 if (released_incomplete_events_.count(seqID)) 00922 { 00923 TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!"; 00924 return -2; 00925 } 00926 #endif 00927 00928 if (!create_new) return -1; 00929 00930 check_pending_buffers_(lk); 00931 int new_buffer = GetBufferForWriting(false); 00932 00933 if (new_buffer == -1) 00934 { 00935 new_buffer = GetBufferForWriting(overwrite_mode_); 00936 } 00937 00938 if (new_buffer == -1) return -1; 00939 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer; 00940 std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]); 00941 TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer; 00942 //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID"); 00943 auto hdr = getEventHeader_(new_buffer); 00944 hdr->is_complete = false; 00945 hdr->run_id = run_id_; 00946 hdr->subrun_id = GetSubrunForSequenceID(seqID); 00947 hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp); 00948 hdr->sequence_id = seqID; 00949 buffer_writes_pending_[new_buffer] = 0; 00950 IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader)); 00951 SetMFIteration("Sequence ID " + std::to_string(seqID)); 00952 00953 TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active."; 00954 active_buffers_.insert(new_buffer); 00955 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=(" 00956 << size() << "," 00957 << ReadReadyCount() << "," 00958 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << "," 00959 << WriteReadyCount(false) << "," 00960 << pending_buffers_.size() << "," 00961 << active_buffers_.size() << ")"; 00962 00963 if (requests_) 00964 { 00965 if (timestamp != Fragment::InvalidTimestamp) 00966 { 00967 requests_->AddRequest(seqID, timestamp); 00968 } 00969 // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we 00970 // don't double-send requests, but still get the benefit of calling SendRequest 'often'. 00971 else 00972 { 00973 requests_->SendRequest(); 00974 } 00975 } 00976 TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer; 00977 return new_buffer; 00978 } 00979 00980 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer) 00981 { 00982 if (buffer == -1) return true; 00983 if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing)) 00984 { 00985 return true; 00986 } 00987 ResetReadPos(buffer); 00988 IncrementReadPos(buffer, sizeof(detail::RawEventHeader)); 00989 return MoreDataInBuffer(buffer); 00990 } 00991 00992 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer) 00993 { 00994 auto hdr = getEventHeader_(buffer); 00995 if (hdr->is_complete) 00996 { 00997 TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << "."; 00998 00999 { 01000 TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending."; 01001 01002 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id; 01003 std::unique_lock<std::mutex> lk(sequence_id_mutex_); 01004 TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id; 01005 active_buffers_.erase(buffer); 01006 pending_buffers_.insert(buffer); 01007 01008 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=(" 01009 << size() << "," 01010 << ReadReadyCount() << "," 01011 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << "," 01012 << WriteReadyCount(false) << "," 01013 << pending_buffers_.size() << "," 01014 << active_buffers_.size() << ")"; 01015 } 01016 if (requests_) 01017 { 01018 requests_->RemoveRequest(hdr->sequence_id); 01019 } 01020 } 01021 CheckPendingBuffers(); 01022 } 01023 01024 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB) 01025 { 01026 return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id; 01027 } 01028 01029 void artdaq::SharedMemoryEventManager::CheckPendingBuffers() 01030 { 01031 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_"; 01032 std::unique_lock<std::mutex> lk(sequence_id_mutex_); 01033 TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_"; 01034 check_pending_buffers_(lk); 01035 } 01036 01037 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock) 01038 { 01039 TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock(); 01040 01041 auto buffers = GetBuffersOwnedByManager(); 01042 for (auto buf : buffers) 01043 { 01044 if (ResetBuffer(buf) && !pending_buffers_.count(buf)) 01045 { 01046 TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load(); 01047 auto hdr = getEventHeader_(buf); 01048 if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_)) 01049 { 01050 if (requests_) 01051 { 01052 requests_->RemoveRequest(hdr->sequence_id); 01053 } 01054 TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending"; 01055 active_buffers_.erase(buf); 01056 pending_buffers_.insert(buf); 01057 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=(" 01058 << size() << "," 01059 << ReadReadyCount() << "," 01060 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << "," 01061 << WriteReadyCount(false) << "," 01062 << pending_buffers_.size() << "," 01063 << active_buffers_.size() << ")"; 01064 01065 run_incomplete_event_count_++; 01066 if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate); 01067 if (!released_incomplete_events_.count(hdr->sequence_id)) 01068 { 01069 released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf); 01070 } 01071 else 01072 { 01073 released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf); 01074 } 01075 TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art."; 01076 } 01077 } 01078 } 01079 01080 std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end()); 01081 sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); }); 01082 01083 auto counter = 0; 01084 double eventSize = 0; 01085 for (auto buf : sorted_buffers) 01086 { 01087 auto hdr = getEventHeader_(buf); 01088 01089 TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, " 01090 << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize(); 01091 01092 TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full"; 01093 MarkBufferFull(buf); 01094 run_event_count_++; 01095 counter++; 01096 eventSize += BufferDataSize(buf); 01097 pending_buffers_.erase(buf); 01098 TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=(" 01099 << size() << "," 01100 << ReadReadyCount() << "," 01101 << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << "," 01102 << WriteReadyCount(false) << "," 01103 << pending_buffers_.size() << "," 01104 << active_buffers_.size() << ")"; 01105 } 01106 01107 if (requests_) 01108 { 01109 TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_; 01110 auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_; 01111 auto available_buffers = WriteReadyCount(overwrite_mode_); 01112 01113 TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers 01114 << ", tokens_to_send: " << available_buffers - outstanding_tokens; 01115 01116 if (available_buffers > outstanding_tokens) 01117 { 01118 auto tokens_to_send = available_buffers - outstanding_tokens; 01119 01120 while (tokens_to_send > 0) 01121 { 01122 TLOG(35) << "check_pending_buffers_: Sending a Routing Token"; 01123 requests_->SendRoutingToken(1, run_id_); 01124 tokens_to_send--; 01125 } 01126 } 01127 } 01128 01129 metric_data_.event_count += counter; 01130 metric_data_.event_size += eventSize; 01131 01132 if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates 01133 { 01134 TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics"; 01135 metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate); 01136 if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average); 01137 metric_data_ = MetricData(); 01138 01139 metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint); 01140 metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint); 01141 if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint); 01142 01143 auto bufferReport = GetBufferReport(); 01144 int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Full; }); 01145 int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Empty; }); 01146 int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Writing; }); 01147 int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Reading; }); 01148 auto total = size(); 01149 TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total; 01150 01151 metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint); 01152 metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint); 01153 metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint); 01154 metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint); 01155 if (total > 0) 01156 { 01157 metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint); 01158 metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint); 01159 } 01160 01161 last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now(); 01162 } 01163 TLOG(TLVL_TRACE) << "check_pending_buffers_ END"; 01164 } 01165 01166 void artdaq::SharedMemoryEventManager::send_init_frag_() 01167 { 01168 if (init_fragment_ != nullptr) 01169 { 01170 TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses..."; 01171 01172 #if 0 01173 std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin"; 01174 std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary); 01175 ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes()); 01176 ostream.close(); 01177 #endif 01178 01179 broadcastFragment_(std::move(init_fragment_), init_fragment_); 01180 TLOG(TLVL_TRACE) << "Init Fragment sent"; 01181 } 01182 else if (send_init_fragments_) 01183 { 01184 TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!"; 01185 } 01186 } 01187 01188 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag) 01189 { 01190 if (!init_fragment_ || init_fragment_ == nullptr) 01191 { 01192 init_fragment_.swap(frag); 01193 send_init_frag_(); 01194 } 01195 } 01196 01197 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset) 01198 { 01199 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN"; 01200 if (art_pset != current_art_pset_ || !current_art_config_file_) 01201 { 01202 current_art_pset_ = art_pset; 01203 current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/); 01204 } 01205 TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END"; 01206 } 01207 01208 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103 01209 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager) 01210 #endif