artdaq  v3_01_00
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
7 #include <sys/wait.h>
8 
// Out-of-class definition of the static mutex shared by all SharedMemoryEventManager instances.
// In this file it is locked by getBufferForSequenceID_() and by complete_buffer_().
9 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
10 
11 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
12  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
13  pset.get<size_t>("buffer_count"),
14  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
15  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
16  !pset.get<bool>("broadcast_mode", false))
17  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
18  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
19  , queue_size_(pset.get<size_t>("buffer_count"))
20  , run_id_(0)
21  , subrun_id_(0)
22  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
23  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
24  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
25  , running_(false)
26  , buffer_writes_pending_()
27  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
28  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
29  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
30  , run_event_count_(0)
31  , run_incomplete_event_count_(0)
32  , subrun_event_count_(0)
33  , subrun_incomplete_event_count_(0)
34  , art_processes_()
35  , restart_art_(false)
36  , current_art_pset_(art_pset)
37  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
38  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
39  , requests_(nullptr)
40  , data_pset_(pset)
41  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
42  pset.get<size_t>("broadcast_buffer_count", 10),
43  pset.get<size_t>("broadcast_buffer_size", 0x100000),
44  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
45 {
46  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
47  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
48 
49  if (pset.get<bool>("use_art", true) == false) {
50  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
51  num_art_processes_ = 0;
52  }
53  else {
54  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
55  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
56  }
57  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
58 
59  if (overwrite_mode_ && num_art_processes_ > 0)
60  {
61  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
62  }
63  else if (overwrite_mode_)
64  {
65  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
66  }
67 
68  for (size_t ii = 0; ii < size(); ++ii)
69  {
70  buffer_writes_pending_[ii] = 0;
71  }
72 
73  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
74 
75  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
76  SetRank(my_rank);
77  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
78 
79 
80  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
81 }
82 
84 {
85  TLOG(TLVL_TRACE) << "DESTRUCTOR";
86  if (running_) endOfData();
87  TLOG(TLVL_TRACE) << "Destructor END";
88 }
89 
90 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
91 {
92  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << std::to_string(frag.word_count)
93  << ", sequence_id=" << std::to_string(frag.sequence_id);
94  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
95  TLOG(TLVL_TRACE) << "Using buffer " << std::to_string(buffer);
96  if (buffer == -1) return false;
97  if (buffer == -2)
98  {
99  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << std::to_string(frag.sequence_id);
100  return true;
101  }
102 
103  auto hdr = getEventHeader_(buffer);
104  if (update_run_ids_)
105  {
106  hdr->run_id = run_id_;
107  hdr->subrun_id = subrun_id_;
108  }
109 
110  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
111  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
112 
113  TLOG(TLVL_TRACE) << "Checking for complete event";
114  auto fragmentCount = GetFragmentCount(frag.sequence_id);
115  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
116  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
117  << ", fragmentCount=" << std::to_string(fragmentCount)
118  << ", num_fragments_per_event=" << std::to_string(num_fragments_per_event_)
119  << ", buffer_writes_pending_[buffer]=" << std::to_string(buffer_writes_pending_[buffer]);
120 
121  complete_buffer_(buffer);
122  if (requests_) requests_->SendRequest(true);
123 
124  TLOG(TLVL_TRACE) << "AddFragment END";
125  return true;
126 }
127 
128 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
129 {
130  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
131  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
132  auto data = frag->headerAddress();
133  auto start = std::chrono::steady_clock::now();
134  bool sts = false;
135  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
136  {
137  sts = AddFragment(hdr, data);
138  if (!sts) usleep(1000);
139  }
140  if (!sts)
141  {
142  outfrag = std::move(frag);
143  }
144  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
145  return sts;
146 }
147 
148 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
149 {
150  TLOG(14) << "WriteFragmentHeader BEGIN";
151  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
152 
153  if (buffer < 0)
154  {
155  if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
156  if (buffer == -2)
157  {
158  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because data taking has already passed this event.";
159  }
160  else
161  {
162  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << std::to_string(frag.sequence_id) << " and fragment id " << std::to_string(frag.fragment_id) << " because there is no room in the queue and reliable mode is off.";
163  }
164  dropped_data_.reset(new Fragment(frag.word_count - frag.num_words()));
165  return dropped_data_->dataBegin();
166  }
167 
168  if (metricMan)
169  {
170  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
171  }
172 
173  buffer_writes_pending_[buffer]++;
174  TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
175  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
176 
177  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
178  if (frag.word_count - frag.num_words() > 0) {
179  IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
180  }
181 
182  TLOG(14) << "WriteFragmentHeader END";
183  return pos;
184 
185 }
186 
187 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
188 {
189  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
190  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
191  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
192  if (buffer == -2) return;
193  TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
194 
195  auto hdr = getEventHeader_(buffer);
196  if (update_run_ids_)
197  {
198  hdr->run_id = run_id_;
199  hdr->subrun_id = subrun_id_;
200  }
201 
202  buffer_writes_pending_[buffer]--;
203  if (buffer_writes_pending_[buffer] != 0)
204  {
205  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
206  return;
207  }
208  auto frag_count = GetFragmentCount(frag.sequence_id);
209  hdr->is_complete = frag_count == num_fragments_per_event_;
210 #if ART_SUPPORTS_DUPLICATE_EVENTS
211  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id)) {
212  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
213  }
214 #endif
215 
216  complete_buffer_(buffer);
217  if (requests_) requests_->SendRequest(true);
218  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
219 }
220 
221 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
222 {
223  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
224 }
225 
226 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
227 {
228  if (buffer == -1) return 0;
229  ResetReadPos(buffer);
230  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
231 
232  size_t count = 0;
233 
234  while (MoreDataInBuffer(buffer))
235  {
236  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
237  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
238  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
239  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << std::to_string(fragHdr->word_count) << " to Fragment count";
240  ++count;
241  }
242 
243  return count;
244 }
245 
246 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, pid_t& pid_out)
247 {
248  while (restart_art_)
249  {
250  auto start_time = std::chrono::steady_clock::now();
251  send_init_frag_();
252  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
253  std::vector<char*> args{ (char*)"art", (char*)"-c", &config_file->getFileName()[0], NULL };
254 
255  auto pid = fork();
256  if (pid == 0)
257  { /* child */
258  execvp("art", &args[0]);
259  exit(1);
260  }
261  pid_out = pid;
262 
263  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
264  art_processes_.insert(pid);
265  siginfo_t status;
266  auto sts = waitid(P_PID, pid, &status, WEXITED);
267  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
268  art_processes_.erase(pid);
269  if (sts < 0) {
270  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
271  }
272  else if (status.si_code == CLD_EXITED && status.si_status == 0)
273  {
274  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
275  }
276  else
277  {
278  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
279  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
280 
281  auto exit_type = "exited with status code";
282  switch (status.si_code) {
283  case CLD_DUMPED:
284  case CLD_KILLED:
285  exit_type = "was killed with signal";
286  break;
287  case CLD_EXITED:
288  default:
289  break;
290  }
291 
292  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
293  << "art process " << pid << " " << exit_type << " " << status.si_status
294  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
295  << " after " << std::setprecision(2) << art_lifetime << " seconds, "
296  << (restart_art_ ? "restarting" : "not restarting");
297  }
298  }
299 }
300 
302 {
303  restart_art_ = true;
304  if (num_art_processes_ == 0) return;
305  for (size_t ii = 0; ii < num_art_processes_; ++ii)
306  {
307  StartArtProcess(current_art_pset_);
308  }
309 }
310 
312 {
313  static std::mutex start_art_mutex;
314  TraceLock lk(start_art_mutex, 15, "StartArtLock");
315  restart_art_ = true;
316  auto initialCount = GetAttachedCount();
317  auto startTime = std::chrono::steady_clock::now();
318 
319  if (pset != current_art_pset_ || !current_art_config_file_)
320  {
321  current_art_pset_ = pset;
322  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
323  }
324  pid_t pid = -1;
325  boost::thread thread([&] {RunArt(current_art_config_file_, pid); });
326  thread.detach();
327 
328 
329  while ((GetAttachedCount() - initialCount < 1 || pid <= 0)
330  && TimeUtils::GetElapsedTime(startTime) < 5)
331  {
332  usleep(1000);
333  }
334  if (GetAttachedCount() - initialCount < 1 || pid <= 0)
335  {
336  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
337  << " (pid=" << pid << ", attachedCount=" << std::to_string(GetAttachedCount() - initialCount) << ")";
338  return 0;
339  }
340  else
341  {
342  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
343  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
344 
345  return pid;
346  }
347 
348 }
349 
351 {
352  restart_art_ = false;
353  //current_art_config_file_ = nullptr;
354  //current_art_pset_ = fhicl::ParameterSet();
355 
356  for (auto pid = pids.begin(); pid != pids.end();)
357  {
358  if (kill(*pid, 0) < 0)
359  {
360  pid = pids.erase(pid);
361  }
362  else {
363  ++pid;
364  }
365  }
366  if (pids.size() == 0)
367  {
368  TLOG(14) << "All art processes already exited, nothing to do.";
369  usleep(1000);
370  return;
371  }
372 
373  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
374  for (auto pid : pids)
375  {
376  kill(pid, SIGQUIT);
377  }
378 
379  int graceful_wait_ms = 5000;
380  int int_wait_ms = 1000;
381 
382  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
383  for (int ii = 0; ii < graceful_wait_ms; ++ii)
384  {
385  usleep(1000);
386 
387  for (auto pid = pids.begin(); pid != pids.end();)
388  {
389  if (kill(*pid, 0) < 0)
390  {
391  pid = pids.erase(pid);
392  }
393  else {
394  ++pid;
395  }
396  }
397  if (pids.size() == 0)
398  {
399  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
400  return;
401  }
402  }
403 
404  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
405  for (auto pid : pids)
406  {
407  kill(pid, SIGINT);
408  }
409 
410  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
411  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
412  {
413  usleep(1000);
414 
415  for (auto pid = pids.begin(); pid != pids.end();)
416  {
417  if (kill(*pid, 0) < 0)
418  {
419  pid = pids.erase(pid);
420  }
421  else {
422  ++pid;
423  }
424  }
425 
426  if (pids.size() == 0)
427  {
428  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
429  return;
430  }
431  }
432 
433  TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
434  while (pids.size() > 0)
435  {
436  kill(*pids.begin(), SIGKILL);
437  usleep(1000);
438 
439  for (auto pid = pids.begin(); pid != pids.end();)
440  {
441  if (kill(*pid, 0) < 0)
442  {
443  pid = pids.erase(pid);
444  }
445  else {
446  ++pid;
447  }
448  }
449  }
450 }
451 
452 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
453 {
454  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
455  if (restart_art_) // Art is running
456  {
457  endOfData();
458  }
459  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
460  {
461  broadcasts_.MarkBufferEmpty(ii, true);
462  }
463  if (newRun == 0) newRun = run_id_ + 1;
464 
465  if (art_pset != current_art_pset_ || !current_art_config_file_) {
466  current_art_pset_ = art_pset;
467  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
468  }
469 
470  if (n_art_processes != -1)
471  {
472  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
473  num_art_processes_ = n_art_processes;
474  }
475  startRun(newRun);
476  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
477 }
478 
480 {
481  init_fragment_.reset(nullptr);
482  TLOG(TLVL_TRACE) << "SharedMemoryEventManager::endOfData";
483  restart_art_ = false;
484 
485  size_t initialStoreSize = GetIncompleteEventCount();
486  TLOG(TLVL_TRACE) << "endOfData: Flushing " << initialStoreSize
487  << " stale events from the SharedMemoryEventManager.";
488  int counter = initialStoreSize;
489  while (active_buffers_.size() > 0 && counter > 0)
490  {
491  complete_buffer_(*active_buffers_.begin());
492  counter--;
493  }
494  TLOG(TLVL_TRACE) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
495  << " stale events in the SharedMemoryEventManager.";
496 
497 
498  TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
499  auto start = std::chrono::steady_clock::now();
500  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
501  auto end_of_data_wait_us = art_event_processing_time_us_ * size();
502 
503  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
504  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && art_processes_.size() > 0)
505  {
506  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
507  if (temp != lastReadCount)
508  {
509  TLOG(TLVL_TRACE) << "Waiting for " << std::to_string(temp) << " outstanding buffers...";
510  lastReadCount = temp;
511  start = std::chrono::steady_clock::now();
512  }
513  if (lastReadCount > 0) usleep(art_event_processing_time_us_);
514  }
515  TLOG(TLVL_TRACE) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: " << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << art_processes_.size();
516 
517  TLOG(TLVL_TRACE) << "endOfData: Broadcasting EndOfData Fragment";
518  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
519  bool success = broadcastFragment_(std::move(outFrag), outFrag);
520  if (!success)
521  {
522  TLOG(TLVL_TRACE) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
523  for (size_t ii = 0; ii < size(); ++ii)
524  {
525  broadcasts_.MarkBufferEmpty(ii, true);
526  }
527  broadcastFragment_(std::move(outFrag), outFrag);
528  }
529  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
530 
531  if (art_processes_.size() > 0)
532  {
533  TLOG(TLVL_DEBUG) << "Allowing " << std::to_string(art_processes_.size()) << " art processes the chance to end gracefully";
534  if (end_of_data_wait_us == 0)
535  {
536  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
537  end_of_data_wait_us = 100 * 1000000;
538  }
539 
540  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
541  for (size_t ii = 0; ii < sleep_count; ++ii) {
542  usleep(10000);
543  if (art_processes_.size() == 0) break;
544  }
545  }
546 
547  while (art_processes_.size() > 0)
548  {
549  TLOG(TLVL_DEBUG) << "There are " << std::to_string(art_processes_.size()) << " art processes remaining. Proceeding to shutdown.";
550  ShutdownArtProcesses(art_processes_);
551  }
552  TLOG(TLVL_INFO) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
553 
554  ResetAttachedCount();
555 
556  TLOG(TLVL_TRACE) << "endOfData: Clearing buffers";
557  for (size_t ii = 0; ii < size(); ++ii)
558  {
559  MarkBufferEmpty(ii, true);
560  }
561  released_incomplete_events_.clear();
562 
563  TLOG(TLVL_TRACE) << "endOfData: Shutting down RequestReceiver";
564  requests_.reset(nullptr);
565 
566  TLOG(TLVL_TRACE) << "endOfData END";
567  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
568  running_ = false;
569  return true;
570 }
571 
573 {
574  running_ = true;
575  init_fragment_.reset(nullptr);
576  StartArt();
577  run_id_ = runID;
578  subrun_id_ = 1;
579  requests_.reset(new RequestSender(data_pset_));
580  if (requests_) requests_->SendRoutingToken(queue_size_);
581  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
582  << ", max queue size = "
583  << queue_size_
584  << ", queue size = "
585  << GetLockedBufferCount();
586  if (metricMan)
587  {
588  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
589  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
590  }
591 }
592 
594 {
595  ++subrun_id_;
596  if (metricMan)
597  {
598  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
599  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
600  }
601 }
602 
604 {
605  TLOG(TLVL_INFO) << "Ending run " << run_id_;
606  FragmentPtr endOfRunFrag(new
607  Fragment(static_cast<size_t>
608  (ceil(sizeof(my_rank) /
609  static_cast<double>(sizeof(Fragment::value_type))))));
610 
611  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
612  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
613  *endOfRunFrag->dataBegin() = my_rank;
614  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
615 
616  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
617  run_event_count_ = 0;
618  run_incomplete_event_count_ = 0;
619  return true;
620 }
621 
623 {
624  TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
625  std::unique_ptr<artdaq::Fragment>
626  endOfSubrunFrag(new
627  Fragment(static_cast<size_t>
628  (ceil(sizeof(my_rank) /
629  static_cast<double>(sizeof(Fragment::value_type))))));
630 
631  TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
632  endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
633  *endOfSubrunFrag->dataBegin() = my_rank;
634 
635  broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
636 
637  TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
638  subrun_event_count_ = 0;
639  subrun_incomplete_event_count_ = 0;
640 
641  return true;
642 }
643 
645 {
646  if (metricMan)
647  {
648  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
649  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
650  }
651 
652  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
653  {
654  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
655  return;
656 
657  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
658  std::ostringstream oss;
659  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
660  for (auto& ev : active_buffers_)
661  {
662  auto hdr = getEventHeader_(ev);
663  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
664  }
665  TLOG(TLVL_DEBUG) << oss.str();
666  }
667 }
668 
669 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
670 {
671  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
672  auto buffer = broadcasts_.GetBufferForWriting(false);
673  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
674  auto start_time = std::chrono::steady_clock::now();
675  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
676  {
677  usleep(10000);
678  buffer = broadcasts_.GetBufferForWriting(false);
679  }
680  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
681  if (buffer == -1)
682  {
683  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
684  outFrag.swap(frag);
685  return false;
686  }
687 
688  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
689  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
690  hdr->run_id = run_id_;
691  hdr->subrun_id = subrun_id_;
692  hdr->sequence_id = frag->sequenceID();
693  hdr->is_complete = true;
694  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
695 
696  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
697  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
698 
699  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
700  broadcasts_.MarkBufferFull(buffer, -1);
701  outFrag.swap(frag);
702  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
703  return true;
704 }
705 
706 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
707 {
708  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
709 }
710 
711 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
712 {
713  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
714  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " BEGIN";
715  auto buffers = GetBuffersOwnedByManager();
716  for (auto& buf : buffers)
717  {
718  auto hdr = getEventHeader_(buf);
719  if (hdr->sequence_id == seqID)
720  {
721  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning " << buf;
722  return buf;
723  }
724  }
725 
726 #if !ART_SUPPORTS_DUPLICATE_EVENTS
727  if (released_incomplete_events_.count(seqID)) {
728  TLOG(TLVL_ERROR) << "Event " << std::to_string(seqID) << " has already been marked \"Incomplete\" and sent to art!";
729  return -2;
730  }
731 #endif
732 
733  if (!create_new) return -1;
734 
735  check_pending_buffers_(lk);
736  int new_buffer = GetBufferForWriting(false);
737 
738  if (new_buffer == -1)
739  {
740  new_buffer = GetBufferForWriting(overwrite_mode_);
741  }
742 
743  if (new_buffer == -1) return -1;
744  TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
745  auto hdr = getEventHeader_(new_buffer);
746  hdr->is_complete = false;
747  hdr->run_id = run_id_;
748  hdr->subrun_id = subrun_id_;
749  hdr->sequence_id = seqID;
750  buffer_writes_pending_[new_buffer] = 0;
751  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
752 #if ART_HEX_VERSION >= 0x21100
753  SetMFIteration("Sequence ID " + std::to_string(seqID));
754 #endif
755 
756  active_buffers_.insert(new_buffer);
757 
758  if (requests_) {
759  if (timestamp != Fragment::InvalidTimestamp)
760  {
761  requests_->AddRequest(seqID, timestamp);
762  }
763  requests_->SendRequest();
764  }
765  TLOG(14) << "getBufferForSequenceID " << std::to_string(seqID) << " returning newly initialized buffer " << new_buffer;
766  return new_buffer;
767 }
768 
769 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
770 {
771  if (buffer == -1) return true;
772  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
773  {
774  return true;
775  }
776  ResetReadPos(buffer);
777  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
778  return MoreDataInBuffer(buffer);
779 }
780 
781 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
782 {
783  auto hdr = getEventHeader_(buffer);
784  if (hdr->is_complete)
785  {
786  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << std::to_string(hdr->sequence_id) << ".";
787 
788  if (requests_) {
789  requests_->RemoveRequest(hdr->sequence_id);
790  requests_->SendRoutingToken(1);
791  }
792  {
793  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
794  active_buffers_.erase(buffer);
795  pending_buffers_.insert(buffer);
796  }
797  }
798  check_pending_buffers_();
799 }
800 
801 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
802 {
803  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
804 }
805 
806 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
807 {
808  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
809 
810  auto buffers = GetBuffersOwnedByManager();
811  for (auto buf : buffers)
812  {
813  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
814  {
815  auto hdr = getEventHeader_(buf);
816  if (active_buffers_.count(buf))
817  {
818  if (requests_) {
819  requests_->RemoveRequest(hdr->sequence_id);
820  requests_->SendRoutingToken(1);
821  }
822  active_buffers_.erase(buf);
823  pending_buffers_.insert(buf);
824  subrun_incomplete_event_count_++;
825  run_incomplete_event_count_++;
826  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
827  if (!released_incomplete_events_.count(hdr->sequence_id)) {
828  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
829  }
830  else {
831  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
832  }
833  TLOG(TLVL_WARNING) << "Active event " << std::to_string(hdr->sequence_id) << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
834  }
835 
836  }
837  }
838 
839  Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
840 
841  // Only use "weak ordering" when buffers are available for writing
842  if (WriteReadyCount(false) != 0)
843  {
844  for (auto buf : active_buffers_)
845  {
846  auto hdr = getEventHeader_(buf);
847  TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << std::to_string(hdr->sequence_id) << ", ACTIVE";
848  if (hdr->sequence_id < lowestSeqId)
849  {
850  lowestSeqId = hdr->sequence_id;
851  }
852  }
853  TLOG(TLVL_TRACE) << "Lowest SeqID held: " << std::to_string(lowestSeqId);
854  }
855 
856  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
857  sorted_buffers.sort([this](int a, int b) {return bufferComparator(a, b); });
858 
859  auto counter = 0;
860  double eventSize = 0;
861  for (auto buf : sorted_buffers)
862  {
863  auto hdr = getEventHeader_(buf);
864  if (hdr->sequence_id > lowestSeqId) break;
865  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art.";
866  MarkBufferFull(buf);
867  subrun_event_count_++;
868  run_event_count_++;
869  counter++;
870  eventSize += BufferDataSize(buf);
871  pending_buffers_.erase(buf);
872  }
873  eventSize /= counter;
874 
875  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
876  if (metricMan)
877  {
878  auto full = ReadReadyCount();
879  auto empty = WriteReadyCount(overwrite_mode_);
880  auto total = size();
881 
882  metricMan->sendMetric("Event Rate", counter, "Events/s", 1, MetricMode::Rate);
883  metricMan->sendMetric("Events Released to art (run)", run_event_count_, "Events", 1, MetricMode::LastPoint);
884  metricMan->sendMetric("Incomplete Events Released to art (run)", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
885  metricMan->sendMetric("Events Released to art (subrun)", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
886  metricMan->sendMetric("Incomplete Events Released to art (subrun)", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
887  metricMan->sendMetric("Event Size", eventSize, "Bytes", 1, MetricMode::Average);
888 
889  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
890  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
891  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
892  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
893  }
894  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
895 }
896 
897 void artdaq::SharedMemoryEventManager::send_init_frag_()
898 {
899  if (init_fragment_ != nullptr)
900  {
901  TLOG(TLVL_TRACE) << "Sending init Fragment to art...";
902 
903 #if 0
904  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
905  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
906  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
907  ostream.close();
908 #endif
909 
910  broadcastFragment_(std::move(init_fragment_), init_fragment_);
911  TLOG(TLVL_TRACE) << "Init Fragment sent";
912  }
913  else if (send_init_fragments_)
914  {
915  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
916  }
917 }
918 
920 {
921  if (!init_fragment_ || init_fragment_ == nullptr)
922  {
923  init_fragment_.swap(frag);
924  send_init_frag_();
925  }
926 }
void RunArt(std::shared_ptr< art_config_file > config_file, pid_t &pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
The RequestSender contains methods used to send data requests and Routing tokens. ...
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void StartArt()
Start all the art processes.
void ShutdownArtProcesses(std::set< pid_t > pids)
Shutdown a set of art processes.
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
void startSubrun()
Start a new Subrun, incrementing the subrun number.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endSubrun()
Send an EndOfSubRunFragment to the art thread.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
bool endOfData()
Indicate that the end of input has been reached to the art processes.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.