artdaq  v3_00_02
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
7 #include <sys/wait.h>
8 #include "SharedMemoryEventManager.hh"
9 
10 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
11 
12 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
13  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
14  pset.get<size_t>("buffer_count"),
15  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
16  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
17  !pset.get<bool>("broadcast_mode", false))
18  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
19  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
20  , queue_size_(pset.get<size_t>("buffer_count"))
21  , run_id_(0)
22  , subrun_id_(0)
23  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
24  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
25  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
26  , buffer_writes_pending_()
27  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
28  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
29  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
30  , broadcast_count_(0)
31  , subrun_event_count_(0)
32  , art_processes_()
33  , restart_art_(false)
34  , current_art_pset_(art_pset)
35  , requests_(pset)
36  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
37  pset.get<size_t>("broadcast_buffer_count", 10),
38  pset.get<size_t>("broadcast_buffer_size", 0x100000),
39  pset.get<int>("fragment_broadcast_timeout_ms", 3000) * 1000, false)
40 {
41  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
42  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
43 
44  if (pset.get<bool>("use_art", true) == false) {
45  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false" << TLOG_ENDL;
46  num_art_processes_ = 0;
47  }
48  else {
49  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true" << TLOG_ENDL;
50  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string() << TLOG_ENDL;
51  }
52  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
53 
54  if (overwrite_mode_ && num_art_processes_ > 0)
55  {
56  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!" << TLOG_ENDL;
57  }
58  else if (overwrite_mode_)
59  {
60  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup" << TLOG_ENDL;
61  }
62 
63  for (size_t ii = 0; ii < size(); ++ii)
64  {
65  buffer_writes_pending_[ii] = 0;
66  }
67 
68  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
69 
70  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank << TLOG_ENDL;
71  SetRank(my_rank);
72  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank() << TLOG_ENDL;
73 
74 
75  TLOG(TLVL_TRACE) << "END CONSTRUCTOR" << TLOG_ENDL;
76 }
77 
79 {
80  TLOG(TLVL_TRACE) << "DESTRUCTOR" << TLOG_ENDL;
81  endOfData();
82  TLOG(TLVL_TRACE) << "Destructor END" << TLOG_ENDL;
83 }
84 
85 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
86 {
87  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
88  << ", sequence_id=" << std::to_string(frag.sequence_id) << TLOG_ENDL;
89  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
90  TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer) << TLOG_ENDL;
91  if (buffer == -1) return false;
92  if (buffer == -2)
93  {
94  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
95  return true;
96  }
97 
98  auto hdr = getEventHeader_(buffer);
99  if (update_run_ids_)
100  {
101  hdr->run_id = run_id_;
102  hdr->subrun_id = subrun_id_;
103  }
104 
105  TLOG(TLVL_TRACE) << "AddFragment before Write calls" << TLOG_ENDL;
106  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
107 
108  TLOG(TLVL_TRACE) << "Checking for complete event" << TLOG_ENDL;
109  auto fragmentCount = GetFragmentCount(frag.sequence_id);
110  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
111  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
112  << ", fragmentCount=" << std::to_string(fragmentCount)
113  << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
114  << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]) << TLOG_ENDL;
115 
116  complete_buffer_(buffer);
117  requests_.SendRequest(true);
118 
119  TLOG(TLVL_TRACE) << "AddFragment END" << TLOG_ENDL;
120  return true;
121 }
122 
123 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
124 {
125  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN" << TLOG_ENDL;
126  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
127  auto data = frag->headerAddress();
128  auto start = std::chrono::steady_clock::now();
129  bool sts = false;
130  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
131  {
132  sts = AddFragment(hdr, data);
133  if (!sts) usleep(1000);
134  }
135  if (!sts)
136  {
137  outfrag = std::move(frag);
138  }
139  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts << TLOG_ENDL;
140  return sts;
141 }
142 
143 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
144 {
145  TLOG(14) << "WriteFragmentHeader BEGIN" << TLOG_ENDL;
146  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
147 
148  if (buffer < 0)
149  {
150  if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
151  if (buffer == -2)
152  {
153  TLOG(TLVL_ERROR) << "Dropping fragment because data taking has already passed this event number: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
154  }
155  else
156  {
157  TLOG(TLVL_ERROR) << "Dropping fragment because there is no room in the queue and reliable mode is off: " << std::to_string(frag.sequence_id) << TLOG_ENDL;
158  }
159  dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
160  return dropped_data_->dataBegin();
161  }
162 
163  buffer_writes_pending_[buffer]++;
164  TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
165  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
166 
167  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
168  if (frag.word_count - frag.num_words() > 0) {
169  IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
170  }
171 
172  TLOG(14) << "WriteFragmentHeader END" << TLOG_ENDL;
173  return pos;
174 
175 }
176 
177 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
178 {
179  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN" << TLOG_ENDL;
180  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
181  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
182  if (buffer == -2) return;
183  TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
184 
185  auto hdr = getEventHeader_(buffer);
186  if (update_run_ids_)
187  {
188  hdr->run_id = run_id_;
189  hdr->subrun_id = subrun_id_;
190  }
191 
192  buffer_writes_pending_[buffer]--;
193  if (buffer_writes_pending_[buffer] != 0)
194  {
195  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps." << TLOG_ENDL;
196  return;
197  }
198  auto frag_count = GetFragmentCount(frag.sequence_id);
199  hdr->is_complete = frag_count == num_fragments_per_event_;
200 #if ART_SUPPORTS_DUPLICATE_EVENTS
201  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id)) {
202  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
203  }
204 #endif
205 
206  complete_buffer_(buffer);
207  requests_.SendRequest(true);
208  TLOG(TLVL_TRACE) << "DoneWritingFragment END" << TLOG_ENDL;
209 }
210 
211 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
212 {
213  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
214 }
215 
216 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
217 {
218  if (buffer == -1) return 0;
219  ResetReadPos(buffer);
220  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
221 
222  size_t count = 0;
223 
224  while (MoreDataInBuffer(buffer))
225  {
226  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
227  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
228  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
229  TLOG_TRACE("GetFragmentCount") << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count" << TLOG_ENDL;
230  ++count;
231  }
232 
233  return count;
234 }
235 
236 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
237 {
238  while (restart_art_)
239  {
240  send_init_frag_();
241  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName() << TLOG_ENDL;
242  std::vector<char*> args{ (char*)"art", (char*)"-c", &config_file->getFileName()[0], NULL };
243 
244  auto pid = fork();
245  if (pid == 0)
246  { /* child */
247  execvp("art", &args[0]);
248  exit(1);
249  }
250  pid_out = pid;
251 
252  TLOG(TLVL_INFO) << "PID of new art process is " << pid << TLOG_ENDL;
253  art_processes_.insert(pid);
254  int status;
255  waitpid(pid, &status, 0);
256  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list" << TLOG_ENDL;
257  art_processes_.erase(pid);
258  if (status == 0)
259  {
260  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting") << TLOG_ENDL;
261  }
262  else
263  {
264  TLOG(TLVL_WARNING) << "art process " << pid << " exited with status code 0x" << std::hex << status << " (" << std::dec << status << "), " << (restart_art_ ? "restarting" : "not restarting") << TLOG_ENDL;
265  }
266  }
267 }
268 
270 {
271  restart_art_ = true;
272  if (num_art_processes_ == 0) return;
273  for (size_t ii = 0; ii < num_art_processes_; ++ii)
274  {
275  StartArtProcess(current_art_pset_);
276  }
277 }
278 
280 {
281  static std::mutex start_art_mutex;
282  TraceLock lk(start_art_mutex, 15, "StartArtLock");
283  restart_art_ = true;
284  auto initialCount = GetAttachedCount();
285  auto startTime = std::chrono::steady_clock::now();
286 
287  if (pset != current_art_pset_)
288  {
289  current_art_pset_ = pset;
290  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
291  }
292  pid_t pid = -1;
293  boost::thread thread([&] {RunArt(current_art_config_file_, pid); });
294  thread.detach();
295 
296 
297  while ((GetAttachedCount() - initialCount < 1 || pid <= 0)
298  && TimeUtils::GetElapsedTime(startTime) < 5)
299  {
300  usleep(1000);
301  }
302  if (GetAttachedCount() - initialCount < 1 || pid <= 0)
303  {
304  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
305  << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")" << TLOG_ENDL;
306  return 0;
307  }
308  else
309  {
310  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
311  << TimeUtils::GetElapsedTime(startTime) << " seconds." << TLOG_ENDL;
312 
313  return pid;
314  }
315 
316 }
317 
319 {
320  restart_art_ = false;
321  current_art_config_file_ = nullptr;
322  current_art_pset_ = fhicl::ParameterSet();
323 
324  for (auto pid : pids)
325  {
326  if (kill(pid, 0) >= 0)
327  {
328  pids.erase(pid);
329  }
330  }
331  if (pids.size() == 0)
332  {
333  TLOG(14) << "All art processes already exited, nothing to do." << TLOG_ENDL;
334  usleep(1000);
335  return;
336  }
337 
338  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down" << TLOG_ENDL;
339  for (auto pid : pids)
340  {
341  kill(pid, SIGQUIT);
342  }
343 
344  int graceful_wait_ms = 1000;
345  int int_wait_ms = 100;
346 
347  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully" << TLOG_ENDL;
348  for (int ii = 0; ii < graceful_wait_ms; ++ii)
349  {
350  usleep(1000);
351 
352  for (auto pid : pids)
353  {
354  if (kill(pid, 0) < 0)
355  {
356  pids.erase(pid);
357  }
358  }
359  if (pids.size() == 0)
360  {
361  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms." << TLOG_ENDL;
362  return;
363  }
364  }
365 
366  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down" << TLOG_ENDL;
367  for (auto pid : pids)
368  {
369  kill(pid, SIGINT);
370  }
371 
372  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit" << TLOG_ENDL;
373  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
374  {
375  usleep(1000);
376 
377  for (auto pid : pids)
378  {
379  if (kill(pid, 0) < 0)
380  {
381  pids.erase(pid);
382  }
383  }
384 
385  if (pids.size() == 0)
386  {
387  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms." << TLOG_ENDL;
388  return;
389  }
390  }
391 
392  TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice" << TLOG_ENDL;
393  while (pids.size() > 0)
394  {
395  kill(*pids.begin(), SIGKILL);
396  }
397 }
398 
399 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
400 {
401  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN" << TLOG_ENDL;
402  if (restart_art_) // Art is running
403  {
404  endOfData();
405  }
406  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
407  {
408  broadcasts_.MarkBufferEmpty(ii, true);
409  }
410  if (newRun == 0) newRun = run_id_ + 1;
411  current_art_pset_ = art_pset;
412  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
413 
414  if (n_art_processes != -1)
415  {
416  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes << TLOG_ENDL;
417  num_art_processes_ = n_art_processes;
418  }
419  startRun(newRun);
420  TLOG(TLVL_DEBUG) << "ReconfigureArt END" << TLOG_ENDL;
421 }
422 
424 {
425  init_fragment_.reset(nullptr);
426  TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData" << TLOG_ENDL;
427  restart_art_ = false;
428 
429  size_t initialStoreSize = GetIncompleteEventCount();
430  TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
431  << " stale events from the SharedMemoryEventManager." << TLOG_ENDL;
432  int counter = initialStoreSize;
433  while (active_buffers_.size() > 0 && counter > 0)
434  {
435  complete_buffer_(*active_buffers_.begin());
436  counter--;
437  }
438  TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
439  << " stale events in the SharedMemoryEventManager." << TLOG_ENDL;
440 
441 
442  TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers..." << TLOG_ENDL;
443  auto start = std::chrono::steady_clock::now();
444  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
445 
446  // We will wait until no buffer has been read for 1 second.
447  while (lastReadCount > 0 && TimeUtils::GetElapsedTime(start) < 1)
448  {
449  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
450  if (temp != lastReadCount)
451  {
452  TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers..." << TLOG_ENDL;
453  lastReadCount = temp;
454  start = std::chrono::steady_clock::now();
455  }
456  if (lastReadCount > 0) usleep(1000);
457  }
458 
459  TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment" << TLOG_ENDL;
460  FragmentPtr outFrag = std::move(Fragment::eodFrag(GetBufferCount()));
461  bool success = broadcastFragment_(std::move(outFrag), outFrag);
462  if (!success)
463  {
464  TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment" << TLOG_ENDL;
465  for (size_t ii = 0; ii < size(); ++ii)
466  {
467  broadcasts_.MarkBufferEmpty(ii, true);
468  }
469  broadcastFragment_(std::move(outFrag), outFrag);
470  }
471 
472  TLOG(TLVL_DEBUG) << "Waiting for all art processes to exit, there are " << std::to_string(art_processes_.size()) << " remaining." << TLOG_ENDL;
473  while (art_processes_.size() > 0)
474  {
475  ShutdownArtProcesses(art_processes_);
476  }
477  ResetAttachedCount();
478 
479  TLOG(TLVL_TRACE) << "endOfData: Clearing buffers" << TLOG_ENDL;
480  for (size_t ii = 0; ii < size(); ++ii)
481  {
482  MarkBufferEmpty(ii, true);
483  }
484  released_incomplete_events_.clear();
485 
486  TLOG(TLVL_TRACE) << "endOfData END" << TLOG_ENDL;
487  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " events processed in this run." << TLOG_ENDL;
488  return true;
489 }
490 
492 {
493  init_fragment_.reset(nullptr);
494  StartArt();
495  run_id_ = runID;
496  subrun_id_ = 1;
497  requests_.SendRoutingToken(queue_size_);
498  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
499  << ", max queue size = "
500  << queue_size_
501  << ", queue size = "
502  << GetLockedBufferCount() << TLOG_ENDL;
503  if (metricMan)
504  {
505  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
506  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
507  }
508 }
509 
511 {
512  ++subrun_id_;
513  if (metricMan)
514  {
515  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
516  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
517  }
518 }
519 
521 {
522  FragmentPtr endOfRunFrag(new
523  Fragment(static_cast<size_t>
524  (ceil(sizeof(my_rank) /
525  static_cast<double>(sizeof(Fragment::value_type))))));
526 
527  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
528  *endOfRunFrag->dataBegin() = my_rank;
529  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
530 
531  return true;
532 }
533 
535 {
536  std::unique_ptr<artdaq::Fragment>
537  endOfSubrunFrag(new
538  Fragment(static_cast<size_t>
539  (ceil(sizeof(my_rank) /
540  static_cast<double>(sizeof(Fragment::value_type))))));
541 
542  endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
543  *endOfSubrunFrag->dataBegin() = my_rank;
544 
545  broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
546 
547  TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun." << TLOG_ENDL;
548  subrun_event_count_ = 0;
549 
550  return true;
551 }
552 
554 {
555  if (metricMan)
556  {
557  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
558  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
559  }
560 
561  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
562  {
563  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
564  return;
565 
566  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
567  std::ostringstream oss;
568  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
569  for (auto& ev : active_buffers_)
570  {
571  auto hdr = getEventHeader_(ev);
572  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
573  }
574  TLOG(TLVL_DEBUG) << oss.str() << TLOG_ENDL;
575  }
576 }
577 
578 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
579 {
580  auto buffer = broadcasts_.GetBufferForWriting(false);
581  auto start_time = std::chrono::steady_clock::now();
582  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
583  {
584  usleep(10000);
585  buffer = broadcasts_.GetBufferForWriting(false);
586  }
587  if (buffer == -1)
588  {
589  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!" << TLOG_ENDL;
590  outFrag.swap(frag);
591  return false;
592  }
593 
594  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
595  hdr->run_id = run_id_;
596  hdr->subrun_id = subrun_id_;
597  hdr->sequence_id = frag->sequenceID();
598  hdr->is_complete = true;
599  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
600 
601  TLOG(TLVL_TRACE) << "broadcastFragment_ before Write calls" << TLOG_ENDL;
602  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
603 
604  broadcasts_.MarkBufferFull(buffer, -1);
605  outFrag.swap(frag);
606  return true;
607 }
608 
609 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
610 {
611  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
612 }
613 
614 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
615 {
616  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
617  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN" << TLOG_ENDL;
618  auto buffers = GetBuffersOwnedByManager();
619  for (auto& buf : buffers)
620  {
621  auto hdr = getEventHeader_(buf);
622  if (hdr->sequence_id == seqID)
623  {
624  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf << TLOG_ENDL;
625  return buf;
626  }
627  }
628 
629 #if !ART_SUPPORTS_DUPLICATE_EVENTS
630  if (released_incomplete_events_.count(seqID)) {
631  TLOG(TLVL_ERROR) << "Buffer has already been marked \"Incomplete\" and sent to art!" << TLOG_ENDL;
632  return -2;
633  }
634 #endif
635 
636  if (!create_new) return -1;
637 
638  check_pending_buffers_(lk);
639  int new_buffer = GetBufferForWriting(false);
640 
641  if (new_buffer == -1)
642  {
643  new_buffer = GetBufferForWriting(overwrite_mode_);
644  }
645 
646  if (new_buffer == -1) return -1;
647  TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
648  auto hdr = getEventHeader_(new_buffer);
649  hdr->is_complete = false;
650  hdr->run_id = run_id_;
651  hdr->subrun_id = subrun_id_;
652  hdr->sequence_id = seqID;
653  buffer_writes_pending_[new_buffer] = 0;
654  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
655 
656  active_buffers_.insert(new_buffer);
657 
658  if (timestamp != Fragment::InvalidTimestamp)
659  {
660  requests_.AddRequest(seqID, timestamp);
661  }
662  requests_.SendRequest();
663  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer << TLOG_ENDL;
664  return new_buffer;
665 }
666 
667 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
668 {
669  if (buffer == -1) return true;
670  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
671  {
672  return true;
673  }
674  ResetReadPos(buffer);
675  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
676  return MoreDataInBuffer(buffer);
677 }
678 
679 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
680 {
681  auto hdr = getEventHeader_(buffer);
682  if (hdr->is_complete)
683  {
684  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << "." << TLOG_ENDL;
685 
686  requests_.RemoveRequest(hdr->sequence_id);
687  requests_.SendRoutingToken(1);
688  {
689  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
690  active_buffers_.erase(buffer);
691  pending_buffers_.insert(buffer);
692  }
693  }
694  check_pending_buffers_();
695 }
696 
697 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
698 {
699  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
700 }
701 
702 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
703 {
704  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock() << TLOG_ENDL;
705 
706  auto buffers = GetBuffersOwnedByManager();
707  for (auto buf : buffers)
708  {
709  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
710  {
711  auto hdr = getEventHeader_(buf);
712  if (active_buffers_.count(buf))
713  {
714  TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event to art." << TLOG_ENDL;
715  requests_.RemoveRequest(hdr->sequence_id);
716  requests_.SendRoutingToken(1);
717  active_buffers_.erase(buf);
718  pending_buffers_.insert(buf);
719  if (!released_incomplete_events_.count(hdr->sequence_id)) {
720  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
721  }
722  else {
723  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
724  }
725  }
726 
727  }
728  }
729 
730  Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
731 
732  // Only use "weak ordering" when buffers are available for writing
733  if (WriteReadyCount(false) != 0)
734  {
735  for (auto buf : active_buffers_)
736  {
737  auto hdr = getEventHeader_(buf);
738  TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE" << TLOG_ENDL;
739  if (hdr->sequence_id < lowestSeqId)
740  {
741  lowestSeqId = hdr->sequence_id;
742  }
743  }
744  TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId) << TLOG_ENDL;
745  }
746 
747  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
748  sorted_buffers.sort([this](int a, int b) {return bufferComparator(a, b); });
749  for (auto buf : sorted_buffers)
750  {
751  auto hdr = getEventHeader_(buf);
752  if (hdr->sequence_id > lowestSeqId) break;
753  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art." << TLOG_ENDL;
754  MarkBufferFull(buf);
755  subrun_event_count_++;
756  pending_buffers_.erase(buf);
757  }
758 
759  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics" << TLOG_ENDL;
760  if (metricMan)
761  {
762  auto full = ReadReadyCount();
763  auto empty = WriteReadyCount(overwrite_mode_);
764  auto total = size();
765  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
766  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
767  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
768  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
769  }
770  TLOG(TLVL_TRACE) << "check_pending_buffers_ END" << TLOG_ENDL;
771 }
772 
773 void artdaq::SharedMemoryEventManager::send_init_frag_()
774 {
775  if (init_fragment_ != nullptr)
776  {
777  TLOG(TLVL_TRACE) << "Sending init Fragment to art..." << TLOG_ENDL;
778 
779 #if 0
780  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
781  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
782  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
783  ostream.close();
784 #endif
785 
786  broadcastFragment_(std::move(init_fragment_), init_fragment_);
787  TLOG(TLVL_TRACE) << "Init Fragment sent" << TLOG_ENDL;
788  }
789  else if (send_init_fragments_)
790  {
791  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!" << TLOG_ENDL;
792  }
793 }
794 
796 {
797  if (!init_fragment_ || init_fragment_ == nullptr)
798  {
799  init_fragment_.swap(frag);
800  send_init_frag_();
801  }
802 }
void RunArt(std::shared_ptr< art_config_file > config_file, pid_t &pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void StartArt()
Start all the art processes.
void ShutdownArtProcesses(std::set< pid_t > pids)
Shutdown a set of art processes.
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
void startSubrun()
Start a new Subrun, incrementing the subrun number.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endSubrun()
Send an EndOfSubRunFragment to the art thread.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer completeness once no other writers are pending on that buffer.
bool endOfData()
Indicate that the end of input has been reached to the art processes.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.