artdaq  v3_02_00
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
7 #include <sys/wait.h>
8 
9 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
10 
11 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
12  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
13  pset.get<size_t>("buffer_count"),
14  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
15  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
16  !pset.get<bool>("broadcast_mode", false))
17  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
18  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
19  , queue_size_(pset.get<size_t>("buffer_count"))
20  , run_id_(0)
21  , subrun_id_(0)
22  , subrun_rollover_event_(Fragment::InvalidSequenceID)
23  , last_released_event_(0)
24  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
25  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
26  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
27  , running_(false)
28  , buffer_writes_pending_()
29  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
30  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
31  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
32  , run_event_count_(0)
33  , run_incomplete_event_count_(0)
34  , subrun_event_count_(0)
35  , subrun_incomplete_event_count_(0)
36  , art_processes_()
37  , restart_art_(false)
38  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
39  , current_art_pset_(art_pset)
40  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
41  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
42  , requests_(nullptr)
43  , data_pset_(pset)
44  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
45  pset.get<size_t>("broadcast_buffer_count", 10),
46  pset.get<size_t>("broadcast_buffer_size", 0x100000),
47  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
48 {
49  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
50  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
51 
52  if (pset.get<bool>("use_art", true) == false)
53  {
54  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
55  num_art_processes_ = 0;
56  }
57  else
58  {
59  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
60  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
61  }
62  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
63 
64  if (overwrite_mode_ && num_art_processes_ > 0)
65  {
66  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
67  }
68  else if (overwrite_mode_)
69  {
70  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
71  }
72 
73  for (size_t ii = 0; ii < size(); ++ii)
74  {
75  buffer_writes_pending_[ii] = 0;
76  }
77 
78  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
79 
80  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
81  SetRank(my_rank);
82  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
83 
84 
85  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
86 }
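// A minimal, illustrative FHiCL block covering the main parameters this constructor reads
// (values are examples only; where a pset.get() call above supplies a default, that default
// applies when the key is omitted):
//
//   buffer_count: 20                            # required
//   expected_fragments_per_event: 5             # required
//   max_event_size_bytes: 0x200000              # or give max_fragment_size_bytes instead
//   stale_buffer_timeout_usec: 5000000
//   use_art: true
//   art_analyzer_count: 1
//   restart_crashed_art_processes: true
//   expected_art_event_processing_time_us: 100000
//   fragment_broadcast_timeout_ms: 3000
//   broadcast_buffer_count: 10
//   broadcast_buffer_size: 0x100000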
87 
88 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
89 {
90  TLOG(TLVL_TRACE) << "DESTRUCTOR";
91  if (running_) endOfData();
92  TLOG(TLVL_TRACE) << "Destructor END";
93 }
94 
95 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
96 {
97  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
98  << ", sequence_id=" << std::to_string(frag.sequence_id);
99  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
100  TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer);
101  if (buffer == -1) return false;
102  if (buffer == -2)
103  {
104  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id);
105  return true;
106  }
107 
108  auto hdr = getEventHeader_(buffer);
109  if (update_run_ids_)
110  {
111  hdr->run_id = run_id_;
112  hdr->subrun_id = subrun_id_;
113  }
114 
115  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
116  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
117 
118  TLOG(TLVL_TRACE) << "Checking for complete event";
119  auto fragmentCount = GetFragmentCount(frag.sequence_id);
120  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
121  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
122  << ", fragmentCount=" << std::to_string(fragmentCount)
123  << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
124  << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]);
125 
126  complete_buffer_(buffer);
127  if (requests_) requests_->SendRequest(true);
128 
129  TLOG(TLVL_TRACE) << "AddFragment END";
130  return true;
131 }
132 
133 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
134 {
135  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
136  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
137  auto data = frag->headerAddress();
138  auto start = std::chrono::steady_clock::now();
139  bool sts = false;
140  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
141  {
142  sts = AddFragment(hdr, data);
143  if (!sts) usleep(1000);
144  }
145  if (!sts)
146  {
147  outfrag = std::move(frag);
148  }
149  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
150  return sts;
151 }
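// Caller-side sketch (hypothetical caller, illustrative only): this overload retries once per
// millisecond until timeout_usec elapses; on failure the unconsumed Fragment is handed back
// through outfrag.
//
//   artdaq::FragmentPtr rejected;
//   if (!event_manager.AddFragment(std::move(frag), 1000000 /* 1 s */, rejected))
//   {
//       // 'rejected' now owns the Fragment; the caller decides whether to retry or drop it.
//   }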
152 
153 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
154 {
155  TLOG(14) << "WriteFragmentHeader BEGIN";
156  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
157 
158  if (buffer < 0)
159  {
160  if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
161  if (buffer == -2)
162  {
163  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because data taking has already passed this event.";
164  }
165  else
166  {
167  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because there is no room in the queue and reliable mode is off.";
168  }
169  dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
170  return dropped_data_->dataBegin();
171  }
172 
173  if (metricMan)
174  {
175  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
176  }
177 
178  buffer_writes_pending_[buffer]++;
179  TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
180  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
181 
182  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
183  if (frag.word_count - frag.num_words() > 0)
184  {
185  IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
186  }
187 
188  TLOG(14) << "WriteFragmentHeader END";
189  return pos;
190 
191 }
192 
193 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
194 {
195  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
196  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
197  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
198  if (buffer == -2) return;
199  TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
200 
201  auto hdr = getEventHeader_(buffer);
202  if (update_run_ids_)
203  {
204  hdr->run_id = run_id_;
205  hdr->subrun_id = subrun_id_;
206  }
207 
208  buffer_writes_pending_[buffer]--;
209  if (buffer_writes_pending_[buffer] != 0)
210  {
211  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
212  return;
213  }
214  auto frag_count = GetFragmentCount(frag.sequence_id);
215  hdr->is_complete = frag_count == num_fragments_per_event_;
216 #if ART_SUPPORTS_DUPLICATE_EVENTS
217  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
218  {
219  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
220  }
221 #endif
222 
223  complete_buffer_(buffer);
224  if (requests_) requests_->SendRequest(true);
225  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
226 }
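// Producer-side sketch (illustrative; 'event_manager', 'hdr' and 'payload' are assumed, with 'hdr'
// a fully filled RawFragmentHeader): WriteFragmentHeader reserves space in the event buffer and
// returns the payload destination, and DoneWritingFragment finishes the bookkeeping for that write.
//
//   artdaq::RawDataType* dest = event_manager.WriteFragmentHeader(hdr);
//   if (dest != nullptr)
//   {
//       memcpy(dest, payload, (hdr.word_count - hdr.num_words()) * sizeof(artdaq::RawDataType));
//       event_manager.DoneWritingFragment(hdr);
//   }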
227 
228 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
229 {
230  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
231 }
232 
233 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
234 {
235  if (buffer == -1) return 0;
236  ResetReadPos(buffer);
237  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
238 
239  size_t count = 0;
240 
241  while (MoreDataInBuffer(buffer))
242  {
243  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
244  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
245  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
246  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count";
247  ++count;
248  }
249 
250  return count;
251 }
252 
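// RunArt forks and exec's a single art process on the generated FHiCL file, then blocks in waitid()
// until that process exits. Abnormal exits are logged with their signal or status code, and the
// process is restarted while restart_art_ is set, unless it died within minimum_art_lifetime_s_.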
253 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
254 {
255  do
256  {
257  auto start_time = std::chrono::steady_clock::now();
258  send_init_frag_();
259  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
260 
261  char* filename = new char[config_file->getFileName().length() + 1];
262  strcpy(filename, config_file->getFileName().c_str());
263 
264  std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
265 
266 
267  auto pid = fork();
268  if (pid == 0)
269  { /* child */
270  execvp("art", &args[0]);
271  delete[] filename;
272  exit(1);
273  }
274  delete[] filename;
275  pid_out = pid;
276 
277  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
278  art_processes_.insert(pid);
279  siginfo_t status;
280  auto sts = waitid(P_PID, pid, &status, WEXITED);
281  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
282  art_processes_.erase(pid);
283  if (sts < 0)
284  {
285  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
286  }
287  else if (status.si_code == CLD_EXITED && status.si_status == 0)
288  {
289  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
290  }
291  else
292  {
293  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
294  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
295 
296  auto exit_type = "exited with status code";
297  switch (status.si_code)
298  {
299  case CLD_DUMPED:
300  case CLD_KILLED:
301  exit_type = "was killed with signal";
302  break;
303  case CLD_EXITED:
304  default:
305  break;
306  }
307 
308  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
309  << "art process " << pid << " " << exit_type << " " << status.si_status
310  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
311  << " after " << std::setprecision(2) << art_lifetime << " seconds, "
312  << (restart_art_ ? "restarting" : "not restarting");
313  }
314  } while (restart_art_);
315 }
316 
317 void artdaq::SharedMemoryEventManager::StartArt()
318 {
319  restart_art_ = always_restart_art_;
320  if (num_art_processes_ == 0) return;
321  for (size_t ii = 0; ii < num_art_processes_; ++ii)
322  {
323  StartArtProcess(current_art_pset_);
324  }
325 }
326 
327 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
328 {
329  static std::mutex start_art_mutex;
330  TraceLock lk(start_art_mutex, 15, "StartArtLock");
331  restart_art_ = always_restart_art_;
332  auto initialCount = GetAttachedCount();
333  auto startTime = std::chrono::steady_clock::now();
334 
335  if (pset != current_art_pset_ || !current_art_config_file_)
336  {
337  current_art_pset_ = pset;
338  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
339  }
340  pid_t pid = -1;
341  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
342  thread.detach();
343 
344 
345  while (GetAttachedCount() - initialCount < 1 && TimeUtils::GetElapsedTime(startTime) < 5)
346  {
347  usleep(1000);
348  }
349  if (GetAttachedCount() - initialCount < 1 || pid <= 0)
350  {
351  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
352  << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")";
353  return 0;
354  }
355  else
356  {
357  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
358  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
359 
360  return pid;
361  }
362 
363 }
364 
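// Shutdown escalation for the art processes in 'pids': PIDs that are already gone (kill(pid, 0)
// failing) are pruned, then SIGQUIT is sent and the survivors are polled for up to graceful_wait_ms,
// then SIGINT for up to int_wait_ms, and finally any remaining processes receive SIGKILL.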
365 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t> pids)
366 {
367  restart_art_ = false;
368  //current_art_config_file_ = nullptr;
369  //current_art_pset_ = fhicl::ParameterSet();
370 
371  for (auto pid = pids.begin(); pid != pids.end();)
372  {
373  if (kill(*pid, 0) < 0)
374  {
375  pid = pids.erase(pid);
376  }
377  else
378  {
379  ++pid;
380  }
381  }
382  if (pids.size() == 0)
383  {
384  TLOG(14) << "All art processes already exited, nothing to do.";
385  usleep(1000);
386  return;
387  }
388 
389  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
390  for (auto pid : pids)
391  {
392  kill(pid, SIGQUIT);
393  }
394 
395  int graceful_wait_ms = 5000;
396  int int_wait_ms = 1000;
397 
398  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
399  for (int ii = 0; ii < graceful_wait_ms; ++ii)
400  {
401  usleep(1000);
402 
403  for (auto pid = pids.begin(); pid != pids.end();)
404  {
405  if (kill(*pid, 0) < 0)
406  {
407  pid = pids.erase(pid);
408  }
409  else
410  {
411  ++pid;
412  }
413  }
414  if (pids.size() == 0)
415  {
416  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
417  return;
418  }
419  }
420 
421  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
422  for (auto pid : pids)
423  {
424  kill(pid, SIGINT);
425  }
426 
427  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
428  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
429  {
430  usleep(1000);
431 
432  for (auto pid = pids.begin(); pid != pids.end();)
433  {
434  if (kill(*pid, 0) < 0)
435  {
436  pid = pids.erase(pid);
437  }
438  else
439  {
440  ++pid;
441  }
442  }
443 
444  if (pids.size() == 0)
445  {
446  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
447  return;
448  }
449  }
450 
451  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
452  while (pids.size() > 0)
453  {
454  kill(*pids.begin(), SIGKILL);
455  usleep(1000);
456 
457  for (auto pid = pids.begin(); pid != pids.end();)
458  {
459  if (kill(*pid, 0) < 0)
460  {
461  pid = pids.erase(pid);
462  }
463  else
464  {
465  ++pid;
466  }
467  }
468  }
469 }
470 
471 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
472 {
473  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
474  if (restart_art_ || !always_restart_art_) // Art is running
475  {
476  endOfData();
477  }
478  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
479  {
480  broadcasts_.MarkBufferEmpty(ii, true);
481  }
482  if (newRun == 0) newRun = run_id_ + 1;
483 
484  if (art_pset != current_art_pset_ || !current_art_config_file_)
485  {
486  current_art_pset_ = art_pset;
487  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
488  }
489 
490  if (n_art_processes != -1)
491  {
492  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
493  num_art_processes_ = n_art_processes;
494  }
495  startRun(newRun);
496  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
497 }
498 
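// End-of-data sequence: flush stale incomplete events, wait for art to drain the outstanding
// shared-memory buffers (up to expected_art_event_processing_time_us * buffer_count), broadcast an
// EndOfData Fragment, give the art processes time to exit on their own, force-shut them down if
// needed, clear all buffers, and reset the RequestSender.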
499 bool artdaq::SharedMemoryEventManager::endOfData()
500 {
501  init_fragment_.reset(nullptr);
502  TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData";
503  restart_art_ = false;
504 
505  size_t initialStoreSize = GetIncompleteEventCount();
506  TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
507  << " stale events from the SharedMemoryEventManager.";
508  int counter = initialStoreSize;
509  while (active_buffers_.size() > 0 && counter > 0)
510  {
511  complete_buffer_(*active_buffers_.begin());
512  counter--;
513  }
514  TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
515  << " stale events in the SharedMemoryEventManager.";
516 
517 
518  TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
519  auto start = std::chrono::steady_clock::now();
520  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
521  auto end_of_data_wait_us = art_event_processing_time_us_ * size();
522 
523  // Wait until no buffer has been read for the full end-of-data wait period, or until no art processes remain.
524  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && art_processes_.size() > 0)
525  {
526  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
527  if (temp != lastReadCount)
528  {
529  TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers...";
530  lastReadCount = temp;
531  start = std::chrono::steady_clock::now();
532  }
533  if (lastReadCount > 0) usleep(art_event_processing_time_us_);
534  }
535  TLOG(TLVL_TRACE) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: " << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << art_processes_.size();
536 
537  TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment";
538  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
539  bool success = broadcastFragment_(std::move(outFrag), outFrag);
540  if (!success)
541  {
542  TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
543  for (size_t ii = 0; ii < size(); ++ii)
544  {
545  broadcasts_.MarkBufferEmpty(ii, true);
546  }
547  broadcastFragment_(std::move(outFrag), outFrag);
548  }
549  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
550 
551  if (art_processes_.size() > 0)
552  {
553  TLOG(TLVL_DEBUG) << "Allowing " << std::to_string(art_processes_.size()) << " art processes the chance to end gracefully";
554  if (end_of_data_wait_us == 0)
555  {
556  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
557  end_of_data_wait_us = 100 * 1000000;
558  }
559 
560  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
561  for (size_t ii = 0; ii < sleep_count; ++ii)
562  {
563  usleep(10000);
564  if (art_processes_.size() == 0) break;
565  }
566  }
567 
568  while (art_processes_.size() > 0)
569  {
570  TLOG(TLVL_DEBUG) << "There are " << std::to_string(art_processes_.size()) << " art processes remaining. Proceeding to shutdown.";
571  ShutdownArtProcesses(art_processes_);
572  }
573  TLOG(TLVL_INFO) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
574 
575  ResetAttachedCount();
576 
577  TLOG(TLVL_TRACE) << "endOfData: Clearing buffers";
578  for (size_t ii = 0; ii < size(); ++ii)
579  {
580  MarkBufferEmpty(ii, true);
581  }
582  released_incomplete_events_.clear();
583 
584  TLOG(TLVL_TRACE) << "endOfData: Shutting down RequestReceiver";
585  requests_.reset(nullptr);
586 
587  TLOG(TLVL_TRACE) << "endOfData END";
588  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
589  running_ = false;
590  return true;
591 }
592 
593 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
594 {
595  running_ = true;
596  init_fragment_.reset(nullptr);
597  StartArt();
598  run_id_ = runID;
599  subrun_id_ = 1;
600  subrun_rollover_event_ = Fragment::InvalidSequenceID;
601  last_released_event_ = 0;
602  requests_.reset(new RequestSender(data_pset_));
603  if (requests_) requests_->SendRoutingToken(queue_size_);
604  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
605  << ", max queue size = "
606  << queue_size_
607  << ", queue size = "
608  << GetLockedBufferCount();
609  if (metricMan)
610  {
611  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
612  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
613  }
614 }
615 
616 void artdaq::SharedMemoryEventManager::startSubrun()
617 {
618  ++subrun_id_;
619  subrun_rollover_event_ = Fragment::InvalidSequenceID;
620  if (metricMan)
621  {
622  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
623  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
624  }
625 }
626 
627 bool artdaq::SharedMemoryEventManager::endRun()
628 {
629  TLOG(TLVL_INFO) << "Ending run " << run_id_;
630  FragmentPtr endOfRunFrag(new
631  Fragment(static_cast<size_t>
632  (ceil(sizeof(my_rank) /
633  static_cast<double>(sizeof(Fragment::value_type))))));
634 
635  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
636  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
637  *endOfRunFrag->dataBegin() = my_rank;
638  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
639 
640  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
641  run_event_count_ = 0;
642  run_incomplete_event_count_ = 0;
643  return true;
644 }
645 
646 bool artdaq::SharedMemoryEventManager::endSubrun()
647 {
648  TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
649  std::unique_ptr<artdaq::Fragment>
650  endOfSubrunFrag(new
651  Fragment(static_cast<size_t>
652  (ceil(sizeof(my_rank) /
653  static_cast<double>(sizeof(Fragment::value_type))))));
654 
655  TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
656  endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
657  endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
658  *endOfSubrunFrag->dataBegin() = my_rank;
659 
660  broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
661 
662  TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
663  subrun_event_count_ = 0;
664  subrun_incomplete_event_count_ = 0;
665 
666  return true;
667 }
668 
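// rolloverSubrun schedules a subrun boundary: sequence ID 0 and InvalidSequenceID are ignored,
// boundaries slightly in the past (fewer than 100 events) are accepted with a warning, and older
// ones are rejected. The rollover itself happens in check_pending_buffers_ when the boundary event
// is released.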
669 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
670 {
671  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
672  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
673 
674  if (boundary < last_released_event_)
675  {
676  auto logLevel = TLVL_ERROR;
677  bool processAnyway = false;
678  if (last_released_event_ - boundary < 100)
679  {
680  logLevel = TLVL_WARNING;
681  processAnyway = true;
682  }
683  TLOG(logLevel) << "Subrun rollover requested for an event that is in the past (delta = " << (last_released_event_ - boundary) << ").";
684  if (!processAnyway) return;
685  }
686  TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
687  subrun_rollover_event_ = boundary;
688 }
689 
690 void artdaq::SharedMemoryEventManager::sendMetrics()
691 {
692  if (metricMan)
693  {
694  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
695  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
696  }
697 
698  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
699  {
700  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
701  return;
702 
703  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
704  std::ostringstream oss;
705  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
706  for (auto& ev : active_buffers_)
707  {
708  auto hdr = getEventHeader_(ev);
709  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
710  }
711  TLOG(TLVL_DEBUG) << oss.str();
712  }
713 }
714 
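// broadcastFragment_ copies a Fragment into the separate broadcast shared-memory segment
// (broadcasts_), waiting up to broadcast_timeout_ms_ for a free buffer, then marks that buffer full
// with destination rank -1 so attached art processes can read it. The Fragment is handed back to
// the caller via outFrag whether or not the broadcast succeeded.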
715 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
716 {
717  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
718  auto buffer = broadcasts_.GetBufferForWriting(false);
719  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
720  auto start_time = std::chrono::steady_clock::now();
721  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
722  {
723  usleep(10000);
724  buffer = broadcasts_.GetBufferForWriting(false);
725  }
726  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
727  if (buffer == -1)
728  {
729  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
730  outFrag.swap(frag);
731  return false;
732  }
733 
734  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
735  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
736  hdr->run_id = run_id_;
737  hdr->subrun_id = subrun_id_;
738  hdr->sequence_id = frag->sequenceID();
739  hdr->is_complete = true;
740  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
741 
742  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
743  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
744 
745  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
746  broadcasts_.MarkBufferFull(buffer, -1);
747  outFrag.swap(frag);
748  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
749  return true;
750 }
751 
752 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
753 {
754  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
755 }
756 
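// Buffer lookup for a sequence ID: returns the buffer already assigned to seqID if one exists;
// otherwise -2 if the event was already released to art as incomplete, -1 if no buffer is available
// (or create_new is false), or a freshly initialized buffer. For new buffers with a valid timestamp,
// a data request is registered with the RequestSender, if one is configured.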
757 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
758 {
759  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
760  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN";
761  auto buffers = GetBuffersOwnedByManager();
762  for (auto& buf : buffers)
763  {
764  auto hdr = getEventHeader_(buf);
765  if (hdr->sequence_id == seqID)
766  {
767  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf;
768  return buf;
769  }
770  }
771 
772 #if !ART_SUPPORTS_DUPLICATE_EVENTS
773  if (released_incomplete_events_.count(seqID))
774  {
775  TLOG(TLVL_ERROR) << "Event " << std::to_string(seqID) << " has already been marked \"Incomplete\" and sent to art!";
776  return -2;
777  }
778 #endif
779 
780  if (!create_new) return -1;
781 
782  check_pending_buffers_(lk);
783  int new_buffer = GetBufferForWriting(false);
784 
785  if (new_buffer == -1)
786  {
787  new_buffer = GetBufferForWriting(overwrite_mode_);
788  }
789 
790  if (new_buffer == -1) return -1;
791  TraceLock buffer_lk(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID"); // named guard so the lock is held for the rest of this scope
792  auto hdr = getEventHeader_(new_buffer);
793  hdr->is_complete = false;
794  hdr->run_id = run_id_;
795  hdr->subrun_id = subrun_id_;
796  hdr->sequence_id = seqID;
797  buffer_writes_pending_[new_buffer] = 0;
798  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
799  SetMFIteration("Sequence ID " + std::to_string(seqID));
800 
801  active_buffers_.insert(new_buffer);
802 
803  if (requests_)
804  {
805  if (timestamp != Fragment::InvalidTimestamp)
806  {
807  requests_->AddRequest(seqID, timestamp);
808  }
809  requests_->SendRequest();
810  }
811  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer;
812  return new_buffer;
813 }
814 
815 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
816 {
817  if (buffer == -1) return true;
818  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
819  {
820  return true;
821  }
822  ResetReadPos(buffer);
823  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
824  return MoreDataInBuffer(buffer);
825 }
826 
827 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
828 {
829  auto hdr = getEventHeader_(buffer);
830  if (hdr->is_complete)
831  {
832  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << ".";
833 
834  if (requests_)
835  {
836  requests_->RemoveRequest(hdr->sequence_id);
837  requests_->SendRoutingToken(1);
838  }
839  {
840  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
841  active_buffers_.erase(buffer);
842  pending_buffers_.insert(buffer);
843  }
844  }
845  check_pending_buffers_();
846 }
847 
848 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
849 {
850  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
851 }
852 
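// check_pending_buffers_ does the release bookkeeping: stale active buffers are moved to the
// pending set (recording how many Fragments each is missing in released_incomplete_events_),
// pending buffers are released to art in sequence-ID order up to the lowest sequence ID still being
// assembled (or all of them when no write buffers are free), subrun rollovers are applied, and
// shared-memory and event metrics are reported.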
853 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
854 {
855  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
856 
857  auto buffers = GetBuffersOwnedByManager();
858  for (auto buf : buffers)
859  {
860  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
861  {
862  auto hdr = getEventHeader_(buf);
863  if (active_buffers_.count(buf))
864  {
865  if (requests_)
866  {
867  requests_->RemoveRequest(hdr->sequence_id);
868  requests_->SendRoutingToken(1);
869  }
870  active_buffers_.erase(buf);
871  pending_buffers_.insert(buf);
872  subrun_incomplete_event_count_++;
873  run_incomplete_event_count_++;
874  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
875  if (!released_incomplete_events_.count(hdr->sequence_id))
876  {
877  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
878  }
879  else
880  {
881  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
882  }
883  TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
884  }
885 
886  }
887  }
888 
889  Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
890 
891  // Only use "weak ordering" when buffers are available for writing
892  if (WriteReadyCount(false) != 0)
893  {
894  for (auto buf : active_buffers_)
895  {
896  auto hdr = getEventHeader_(buf);
897  TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE";
898  if (hdr->sequence_id < lowestSeqId)
899  {
900  lowestSeqId = hdr->sequence_id;
901  }
902  }
903  TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId);
904  }
905 
906  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
907  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
908 
909  auto counter = 0;
910  double eventSize = 0;
911  for (auto buf : sorted_buffers)
912  {
913  auto hdr = getEventHeader_(buf);
914  if (hdr->sequence_id > lowestSeqId) break;
915 
916  if (hdr->sequence_id >= subrun_rollover_event_)
917  {
918  TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << ").";
919  endSubrun();
920  startSubrun();
921  }
922  if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
923 
924  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art.";
925  MarkBufferFull(buf);
926  subrun_event_count_++;
927  run_event_count_++;
928  counter++;
929  eventSize += BufferDataSize(buf);
930  pending_buffers_.erase(buf);
931  }
932  if (counter > 0) eventSize /= counter; // guard against dividing by zero when no events were released
933 
934  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
935  if (metricMan)
936  {
937  auto full = ReadReadyCount();
938  auto empty = WriteReadyCount(overwrite_mode_);
939  auto total = size();
940 
941  metricMan->sendMetric("Event Rate", counter, "Events/s", 1, MetricMode::Rate);
942  metricMan->sendMetric("Events Released to art (run)", run_event_count_, "Events", 1, MetricMode::LastPoint);
943  metricMan->sendMetric("Incomplete Events Released to art (run)", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
944  metricMan->sendMetric("Events Released to art (subrun)", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
945  metricMan->sendMetric("Incomplete Events Released to art (subrun)", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
946  metricMan->sendMetric("Event Size", eventSize, "Bytes", 1, MetricMode::Average);
947 
948  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
949  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
950  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
951  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
952  }
953  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
954 }
955 
956 void artdaq::SharedMemoryEventManager::send_init_frag_()
957 {
958  if (init_fragment_ != nullptr)
959  {
960  TLOG(TLVL_TRACE) << "Sending init Fragment to art...";
961 
962 #if 0
963  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
964  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
965  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
966  ostream.close();
967 #endif
968 
969  broadcastFragment_(std::move(init_fragment_), init_fragment_);
970  TLOG(TLVL_TRACE) << "Init Fragment sent";
971  }
972  else if (send_init_fragments_)
973  {
974  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
975  }
976 }
977 
978 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
979 {
980  if (!init_fragment_)
981  {
982  init_fragment_.swap(frag);
983  send_init_frag_();
984  }
985 }
986 
987 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
988 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
989 #endif