artdaq  v3_03_01
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
7 #include <sys/wait.h>
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
13 
14 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
15  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
16  pset.get<size_t>("buffer_count"),
17  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
18  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
19  !pset.get<bool>("broadcast_mode", false))
20  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
21  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
22  , queue_size_(pset.get<size_t>("buffer_count"))
23  , run_id_(0)
24  , subrun_id_(0)
25  , subrun_rollover_event_(Fragment::InvalidSequenceID)
26  , last_released_event_(0)
27  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
28  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
29  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
30  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
31  , running_(false)
32  , buffer_writes_pending_()
33  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
34  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
35  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
36  , metric_data_()
37  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
38  , run_event_count_(0)
39  , run_incomplete_event_count_(0)
40  , subrun_event_count_(0)
41  , subrun_incomplete_event_count_(0)
42  , oversize_fragment_count_(0)
43  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
44  , art_processes_()
45  , restart_art_(false)
46  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
47  , manual_art_(pset.get<bool>("manual_art", false))
48  , current_art_pset_(art_pset)
49  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
50  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
51  , requests_(nullptr)
52  , data_pset_(pset)
53  , dropped_data_()
54  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
55  pset.get<size_t>("broadcast_buffer_count", 10),
56  pset.get<size_t>("broadcast_buffer_size", 0x100000),
57  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
58 {
59  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
60  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
61 
62  if (pset.get<bool>("use_art", true) == false)
63  {
64  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
65  num_art_processes_ = 0;
66  }
67  else
68  {
69  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
70  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
71  }
72  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
73 
74  if (overwrite_mode_ && num_art_processes_ > 0)
75  {
76  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
77  }
78  else if (overwrite_mode_)
79  {
80  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
81  }
82 
83  for (size_t ii = 0; ii < size(); ++ii)
84  {
85  buffer_writes_pending_[ii] = 0;
86  }
87 
88  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
89 
90  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
91  SetRank(my_rank);
92  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
93 
94 
95  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
96 }
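// ---------------------------------------------------------------------------
// Hedged configuration sketch (added for illustration, not part of the original
// source): the main FHiCL parameters read by this constructor. The values shown
// are illustrative assumptions only.
//
//   buffer_count: 20                      # required
//   expected_fragments_per_event: 2       # required
//   max_fragment_size_bytes: 0x100000     # or provide max_event_size_bytes directly
//   stale_buffer_timeout_usec: 5000000    # optional, default 5 s
//   use_art: true                         # optional, default true
//   art_analyzer_count: 1                 # optional, default 1
//   broadcast_mode: false                 # optional, default false
// ---------------------------------------------------------------------------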
97 
98 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
99 {
100  TLOG(TLVL_TRACE) << "DESTRUCTOR";
101  if (running_) endOfData();
102  TLOG(TLVL_TRACE) << "Destructor END";
103 }
104 
105 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
106 {
107  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
108  << ", sequence_id=" << frag.sequence_id;
109  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
110  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
111  if (buffer == -1) return false;
112  if (buffer == -2)
113  {
114  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
115  return true;
116  }
117 
118  auto hdr = getEventHeader_(buffer);
119  if (update_run_ids_)
120  {
121  hdr->run_id = run_id_;
122  hdr->subrun_id = subrun_id_;
123  }
124 
125  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
126  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
127 
128  TLOG(TLVL_TRACE) << "Checking for complete event";
129  auto fragmentCount = GetFragmentCount(frag.sequence_id);
130  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
131  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
132  << ", fragmentCount=" << fragmentCount
133  << ", num_fragments_per_event=" << num_fragments_per_event_
134  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
135 
136  complete_buffer_(buffer);
137  if (requests_) requests_->SendRequest(true);
138 
139  TLOG(TLVL_TRACE) << "AddFragment END";
140  return true;
141 }
142 
143 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
144 {
145  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
146  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
147  auto data = frag->headerAddress();
148  auto start = std::chrono::steady_clock::now();
149  bool sts = false;
150  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
151  {
152  sts = AddFragment(hdr, data);
153  if (!sts) usleep(1000);
154  }
155  if (!sts)
156  {
157  outfrag = std::move(frag);
158  }
159  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
160  return sts;
161 }
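// ---------------------------------------------------------------------------
// Hedged usage sketch (added for illustration, not part of the original source):
// a caller handing a whole Fragment to the manager and retaining ownership if
// the copy times out. "mgr" and "frag" are assumed names.
//
// artdaq::FragmentPtr rejected;
// if (!mgr.AddFragment(std::move(frag), 1000000 /* 1 s timeout */, rejected))
// {
//     // Shared memory stayed full for the whole timeout; "rejected" now owns
//     // the Fragment again and the caller may retry or drop it.
// }
// ---------------------------------------------------------------------------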
162 
163 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
164 {
165  TLOG(14) << "WriteFragmentHeader BEGIN";
166  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
167 
168  if (buffer < 0)
169  {
170  if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
171  if (buffer == -2)
172  {
173  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
174  }
175  else
176  {
177  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
178  }
179  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
180 
181  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
182  return dropped_data_[frag.fragment_id]->dataBegin();
183  }
184 
185  // Increment this as soon as we know we want to use the buffer
186  buffer_writes_pending_[buffer]++;
187 
188  if (metricMan)
189  {
190  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
191  }
192 
193  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
194 
195  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
196 
197  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
198 
199  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
200  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
201  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
202 
203  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
204  if (frag.word_count - frag.num_words() > 0)
205  {
206  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
207 
208  if (!sts)
209  {
210  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
211  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
212  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
213  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
214 
215  oversize_fragment_count_++;
216 
217  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
218  {
219  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
220  }
221 
222  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
223  return dropped_data_[frag.fragment_id]->dataBegin();
224  }
225  }
226  TLOG(14) << "WriteFragmentHeader END";
227  return pos;
228 
229 }
230 
231 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
232 {
233  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
234  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
235  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
236  if (buffer == -2) { return; }
237 
238  {
239  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
240 
241  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
242 
243  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
244 
245  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
246 
247  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
248  auto hdr = getEventHeader_(buffer);
249  if (update_run_ids_)
250  {
251  hdr->run_id = run_id_;
252  hdr->subrun_id = subrun_id_;
253  }
254 
255  buffer_writes_pending_[buffer]--;
256  if (buffer_writes_pending_[buffer] != 0)
257  {
258  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
259  return;
260  }
261  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
262  auto frag_count = GetFragmentCount(frag.sequence_id);
263  hdr->is_complete = frag_count == num_fragments_per_event_;
264  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
265 #if ART_SUPPORTS_DUPLICATE_EVENTS
266  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
267  {
268  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
269  }
270 #endif
271  }
272 
273  complete_buffer_(buffer);
274  if (requests_) requests_->SendRequest(true);
275  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
276 }
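// ---------------------------------------------------------------------------
// Hedged usage sketch (added for illustration, not part of the original source):
// the two-phase write protocol formed by WriteFragmentHeader() and
// DoneWritingFragment(). "mgr" and "frag" are assumed names.
//
// auto hdr = *reinterpret_cast<artdaq::detail::RawFragmentHeader*>(frag->headerAddress());
// artdaq::RawDataType* dest = mgr.WriteFragmentHeader(hdr);
// if (dest != nullptr)
// {
//     // Copy everything after the header into the reserved region.
//     memcpy(dest, frag->headerAddress() + hdr.num_words(),
//            (hdr.word_count - hdr.num_words()) * sizeof(artdaq::RawDataType));
//     mgr.DoneWritingFragment(hdr);  // triggers completion bookkeeping
// }
// ---------------------------------------------------------------------------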
277 
278 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
279 {
280  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
281 }
282 
283 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
284 {
285  if (buffer == -1) return 0;
286  ResetReadPos(buffer);
287  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
288 
289  size_t count = 0;
290 
291  while (MoreDataInBuffer(buffer))
292  {
293  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
294  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
295  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
296  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
297  ++count;
298  }
299 
300  return count;
301 }
302 
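// Hedged descriptive note (added, not in the original source): RunArt() launches one
// art process with the given config file -- via fork()/execvp() normally, or by asking
// the operator for a PID when "manual_art" is set -- records its PID, waits on it with
// waitid(), and loops to restart it while restart_art_ is true. An abnormally-exiting
// process that lived less than minimum_art_lifetime_s_ disables further restarts.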
303 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
304 {
305  do
306  {
307  auto start_time = std::chrono::steady_clock::now();
308  send_init_frag_();
309  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
310 
311  pid_t pid = 0;
312 
313  if (!manual_art_)
314  {
315  char* filename = new char[config_file->getFileName().length() + 1];
316  strcpy(filename, config_file->getFileName().c_str());
317 
318  std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
319  pid = fork();
320  if (pid == 0)
321  { /* child */
322  // 23-May-2018, KAB: added the setting of the partition number env var
323  // in the environment of the child art process so that Globals.hh
324  // will pick it up there and provide it to the artdaq classes that
325  // are used in data transfers, etc. within the art process.
326  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
327  std::string envVarValue = std::to_string(GetPartitionNumber());
328  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
329  {
330  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
331  << "\" in the environment of a child art process. "
332  << "This may result in incorrect TCP port number "
333  << "assignments or other issues, and data may "
334  << "not flow through the system correctly.";
335  }
336 
337  execvp("art", &args[0]);
338  delete[] filename;
339  exit(1);
340  }
341  delete[] filename;
342  }
343  else
344  {
345  //Using cin/cout here to ensure console is active (artdaqDriver)
346  std::cout << "Please run the following command in a separate terminal:" << std::endl
347  << "art -c " << config_file->getFileName() << std::endl
348  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
349  << "Finally, return to this window and enter the pid: " << std::endl;
350  std::cin >> pid;
351  }
352  *pid_out = pid;
353 
354  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
355  {
356  std::unique_lock<std::mutex> lk(art_process_mutex_);
357  art_processes_.insert(pid);
358  }
359  siginfo_t status;
360  auto sts = waitid(P_PID, pid, &status, WEXITED);
361  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
362  {
363  std::unique_lock<std::mutex> lk(art_process_mutex_);
364  art_processes_.erase(pid);
365  }
366  if (sts < 0)
367  {
368  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
369  }
370  else if (status.si_code == CLD_EXITED && status.si_status == 0)
371  {
372  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
373  }
374  else
375  {
376  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
377  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
378 
379  auto exit_type = "exited with status code";
380  switch (status.si_code)
381  {
382  case CLD_DUMPED:
383  case CLD_KILLED:
384  exit_type = "was killed with signal";
385  break;
386  case CLD_EXITED:
387  default:
388  break;
389  }
390 
391  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
392  << "art process " << pid << " " << exit_type << " " << status.si_status
393  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
394  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
395  << (restart_art_ ? "restarting" : "not restarting");
396  }
397  } while (restart_art_);
398 }
399 
400 void artdaq::SharedMemoryEventManager::StartArt()
401 {
402  restart_art_ = always_restart_art_;
403  if (num_art_processes_ == 0) return;
404  for (size_t ii = 0; ii < num_art_processes_; ++ii)
405  {
406  StartArtProcess(current_art_pset_);
407  }
408 }
409 
410 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
411 {
412  static std::mutex start_art_mutex;
413  std::unique_lock<std::mutex> lk(start_art_mutex);
414  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
415  restart_art_ = always_restart_art_;
416  auto initialCount = GetAttachedCount();
417  auto startTime = std::chrono::steady_clock::now();
418 
419  if (pset != current_art_pset_ || !current_art_config_file_)
420  {
421  current_art_pset_ = pset;
422  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
423  }
424  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
425  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
426  thread.detach();
427 
428  auto currentCount = GetAttachedCount() - initialCount;
429  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
430  {
431  usleep(10000);
432  currentCount = GetAttachedCount() - initialCount;
433  }
434  if ((currentCount < 1 || *pid <= 0) && manual_art_)
435  {
436  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
437  return 0;
438  }
439  else if (currentCount < 1 || *pid <= 0)
440  {
441  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
442  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
443  return 0;
444  }
445  else
446  {
447  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
448  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
449 
450  return *pid;
451  }
452 
453 }
454 
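// Hedged descriptive note (added, not in the original source): ShutdownArtProcesses()
// escalates gracefully: SIGQUIT to every listed PID with up to ~5000 ms of polling,
// then SIGINT with another ~1000 ms of polling, and finally SIGKILL for anything still
// alive. In manual_art mode it instead asks the operator to stop the processes.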
455 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
456 {
457  restart_art_ = false;
458  //current_art_config_file_ = nullptr;
459  //current_art_pset_ = fhicl::ParameterSet();
460 
461  auto check_pids = [&](bool print) {
462 
463  for (auto pid = pids.begin(); pid != pids.end();)
464  {
465  // 08-May-2018, KAB: protect against killing invalid PIDS
466 
467  std::unique_lock<std::mutex> lk(art_process_mutex_);
468  if (*pid <= 0)
469  {
470  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
471  << ") from the shutdown list.";
472  pid = pids.erase(pid);
473  }
474  else if (kill(*pid, 0) < 0)
475  {
476  pid = pids.erase(pid);
477  }
478  else
479  {
480  if (print) std::cout << *pid << " ";
481  ++pid;
482  }
483  }
484  };
485  check_pids(false);
486  if (pids.size() == 0)
487  {
488  TLOG(14) << "All art processes already exited, nothing to do.";
489  usleep(1000);
490  return;
491  }
492 
493  if (!manual_art_)
494  {
495  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
496  for (auto pid : pids)
497  {
498  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
499  kill(pid, SIGQUIT);
500  }
501 
502  int graceful_wait_ms = 5000;
503  int int_wait_ms = 1000;
504 
505  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
506  for (int ii = 0; ii < graceful_wait_ms; ++ii)
507  {
508  usleep(1000);
509 
510  check_pids(false);
511  if (pids.size() == 0)
512  {
513  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
514  return;
515  }
516  }
517 
518  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
519  for (auto pid : pids)
520  {
521  kill(pid, SIGINT);
522  }
523 
524  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
525  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
526  {
527  usleep(1000);
528 
529  check_pids(false);
530 
531  if (pids.size() == 0)
532  {
533  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
534  return;
535  }
536  }
537 
538  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
539  while (pids.size() > 0)
540  {
541  kill(*pids.begin(), SIGKILL);
542  usleep(1000);
543 
544  check_pids(false);
545  }
546  }
547  else
548  {
549  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
550  while (pids.size() > 0)
551  {
552  std::cout << "The following PIDs are running: ";
553  check_pids(true);
554  std::cout << std::endl;
555  std::string ignored;
556  std::cin >> ignored;
557  }
558  }
559 }
560 
561 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
562 {
563  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
564  if (restart_art_ || !always_restart_art_) // Art is running
565  {
566  endOfData();
567  }
568  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
569  {
570  broadcasts_.MarkBufferEmpty(ii, true);
571  }
572  if (newRun == 0) newRun = run_id_ + 1;
573 
574  if (art_pset != current_art_pset_ || !current_art_config_file_)
575  {
576  current_art_pset_ = art_pset;
577  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
578  }
579 
580  if (n_art_processes != -1)
581  {
582  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
583  num_art_processes_ = n_art_processes;
584  }
585  startRun(newRun);
586  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
587 }
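// ---------------------------------------------------------------------------
// Hedged usage sketch (added for illustration, not part of the original source):
// swapping in a new art configuration between runs. "mgr" and "new_art_pset"
// are assumed names.
//
// mgr.ReconfigureArt(new_art_pset);         // ends current art, starts the next run number
// mgr.ReconfigureArt(new_art_pset, 42, 2);  // or: explicit run number and two art processes
// ---------------------------------------------------------------------------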
588 
589 bool artdaq::SharedMemoryEventManager::endOfData()
590 {
591  running_ = false;
592  init_fragment_.reset(nullptr);
593  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
594  restart_art_ = false;
595 
596  size_t initialStoreSize = GetIncompleteEventCount();
597  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
598  << " stale events from the SharedMemoryEventManager.";
599  int counter = initialStoreSize;
600  while (active_buffers_.size() > 0 && counter > 0)
601  {
602  complete_buffer_(*active_buffers_.begin());
603  counter--;
604  }
605  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
606  << " stale events in the SharedMemoryEventManager.";
607 
608 
609  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
610  auto start = std::chrono::steady_clock::now();
611  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
612  auto end_of_data_wait_us = art_event_processing_time_us_ * lastReadCount;//size();
613 
614  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
615 
616  // We will wait until no buffer has been read for end_of_data_wait_us microseconds, or until no art processes are left.
617  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
618  {
619  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
620  if (temp != lastReadCount)
621  {
622  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
623  lastReadCount = temp;
624  start = std::chrono::steady_clock::now();
625  }
626  if (lastReadCount > 0) {
627  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
628  usleep(outstanding_buffer_wait_time);
629  }
630  }
631 
632  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
633  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
634 
635  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
636  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
637  bool success = broadcastFragment_(std::move(outFrag), outFrag);
638  if (!success)
639  {
640  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
641  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
642  {
643  broadcasts_.MarkBufferEmpty(ii, true);
644  }
645  broadcastFragment_(std::move(outFrag), outFrag);
646  }
647  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
648 
649  if (get_art_process_count_() > 0)
650  {
651  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
652  if (end_of_data_wait_us == 0)
653  {
654  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
655  end_of_data_wait_us = 100 * 1000000;
656  }
657 
658  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
659  for (size_t ii = 0; ii < sleep_count; ++ii)
660  {
661  usleep(10000);
662  if (get_art_process_count_() == 0) break;
663  }
664  }
665 
666  while (get_art_process_count_() > 0)
667  {
668  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
669 
670  ShutdownArtProcesses(art_processes_);
671  }
672  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
673 
674  ResetAttachedCount();
675 
676  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
677  for (size_t ii = 0; ii < size(); ++ii)
678  {
679  MarkBufferEmpty(ii, true);
680  }
681  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
682  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
683  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
684  // {
685  // broadcasts_.MarkBufferEmpty(ii, true);
686  // }
687  released_incomplete_events_.clear();
688 
689  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestReceiver";
690  requests_.reset(nullptr);
691 
692  TLOG(TLVL_DEBUG) << "endOfData END";
693  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
694  return true;
695 }
696 
697 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
698 {
699  running_ = true;
700  init_fragment_.reset(nullptr);
701  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
702  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
703  {
704  broadcasts_.MarkBufferEmpty(ii, true);
705  }
706  StartArt();
707  run_id_ = runID;
708  subrun_id_ = 1;
709  subrun_rollover_event_ = Fragment::InvalidSequenceID;
710  last_released_event_ = 0;
711  requests_.reset(new RequestSender(data_pset_));
712  if (requests_)
713  {
714  requests_->SendRoutingToken(queue_size_);
715  }
716  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
717  << ", max queue size = "
718  << queue_size_
719  << ", queue size = "
720  << GetLockedBufferCount();
721  if (metricMan)
722  {
723  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
724  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
725  }
726 }
727 
728 void artdaq::SharedMemoryEventManager::startSubrun()
729 {
730  ++subrun_id_;
731  subrun_rollover_event_ = Fragment::InvalidSequenceID;
732  if (metricMan)
733  {
734  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
735  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
736  }
737 }
738 
739 bool artdaq::SharedMemoryEventManager::endRun()
740 {
741  TLOG(TLVL_INFO) << "Ending run " << run_id_;
742  FragmentPtr endOfRunFrag(new
743  Fragment(static_cast<size_t>
744  (ceil(sizeof(my_rank) /
745  static_cast<double>(sizeof(Fragment::value_type))))));
746 
747  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
748  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
749  *endOfRunFrag->dataBegin() = my_rank;
750  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
751 
752  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
753  run_event_count_ = 0;
754  run_incomplete_event_count_ = 0;
755  oversize_fragment_count_ = 0;
756  return true;
757 }
758 
759 bool artdaq::SharedMemoryEventManager::endSubrun()
760 {
761  TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
762  std::unique_ptr<artdaq::Fragment>
763  endOfSubrunFrag(new
764  Fragment(static_cast<size_t>
765  (ceil(sizeof(my_rank) /
766  static_cast<double>(sizeof(Fragment::value_type))))));
767 
768  TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
769  endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
770  endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
771  *endOfSubrunFrag->dataBegin() = my_rank;
772 
773  broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
774 
775  TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
776  subrun_event_count_ = 0;
777  subrun_incomplete_event_count_ = 0;
778 
779  return true;
780 }
781 
782 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
783 {
784  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
785  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
786 
787  if (boundary < last_released_event_)
788  {
789  auto logLevel = TLVL_ERROR;
790  bool processAnyway = false;
791  if (last_released_event_ - boundary < 100)
792  {
793  logLevel = TLVL_WARNING;
794  processAnyway = true;
795  }
796  TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (last_released_event="
797  << last_released_event_ << ",requested_rollover_boundary=" << boundary << ").";
798  if (!processAnyway) return;
799  }
800  TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
801 
802  // JCF, May-11-2018
803 
804  // subrun_rollover_event_ is used in check_pending_buffers to
805  // trigger an endSubrun()/startSubrun(), but if the last event
806  // sent was right before the boundary we might as well switch
807  // to the new subrun here
808 
809  if (boundary == last_released_event_ + 1) {
810  TLOG(TLVL_INFO) << "rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
811  ", boundary is sequence id " << boundary << ", so will start a new subrun here";
812  endSubrun();
813  startSubrun();
814  subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
815  }
816  else {
817  subrun_rollover_event_ = boundary;
818  }
819 }
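// ---------------------------------------------------------------------------
// Hedged lifecycle sketch (added for illustration, not part of the original
// source): the typical order of calls around a run, based on the methods
// defined in this file. "mgr" is an assumed name.
//
// mgr.startRun(run_number);      // clears broadcast buffers, starts art, sends routing tokens
// /* ... AddFragment() or WriteFragmentHeader()+DoneWritingFragment() while data flows ... */
// mgr.rolloverSubrun(boundary);  // optional: schedule a subrun boundary at a sequence ID
// mgr.endRun();                  // broadcasts an EndOfRun Fragment
// mgr.endOfData();               // flushes buffers, broadcasts EndOfData, shuts down art
// ---------------------------------------------------------------------------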
820 
821 void artdaq::SharedMemoryEventManager::sendMetrics()
822 {
823  if (metricMan)
824  {
825  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
826  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
827  }
828 
829  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
830  {
831  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
832  return;
833 
834  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
835  std::ostringstream oss;
836  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
837  for (auto& ev : active_buffers_)
838  {
839  auto hdr = getEventHeader_(ev);
840  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
841  }
842  TLOG(TLVL_DEBUG) << oss.str();
843  }
844 }
845 
846 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
847 {
848  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
849  auto buffer = broadcasts_.GetBufferForWriting(false);
850  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
851  auto start_time = std::chrono::steady_clock::now();
852  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
853  {
854  usleep(10000);
855  buffer = broadcasts_.GetBufferForWriting(false);
856  }
857  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
858  if (buffer == -1)
859  {
860  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
861  outFrag.swap(frag);
862  return false;
863  }
864 
865  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
866  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
867  hdr->run_id = run_id_;
868  hdr->subrun_id = subrun_id_;
869  hdr->sequence_id = frag->sequenceID();
870  hdr->is_complete = true;
871  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
872 
873  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
874  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
875 
876  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
877  broadcasts_.MarkBufferFull(buffer, -1);
878  outFrag.swap(frag);
879  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
880  return true;
881 }
882 
883 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
884 {
885  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
886 }
887 
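// Hedged descriptive note (added, not in the original source): getBufferForSequenceID_()
// returns the buffer already holding this sequence ID if one exists; otherwise, when
// create_new is set, it claims and initializes a fresh buffer (issuing a data request if
// a RequestSender is configured). It returns -1 if no buffer is available and -2 if the
// event was already released to art as incomplete.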
888 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
889 {
890  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
891  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
892 
893  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
894 
895  auto buffers = GetBuffersOwnedByManager();
896  for (auto& buf : buffers)
897  {
898  auto hdr = getEventHeader_(buf);
899  if (hdr->sequence_id == seqID)
900  {
901  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
902  return buf;
903  }
904  }
905 
906 #if !ART_SUPPORTS_DUPLICATE_EVENTS
907  if (released_incomplete_events_.count(seqID))
908  {
909  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
910  return -2;
911  }
912 #endif
913 
914  if (!create_new) return -1;
915 
916  check_pending_buffers_(lk);
917  int new_buffer = GetBufferForWriting(false);
918 
919  if (new_buffer == -1)
920  {
921  new_buffer = GetBufferForWriting(overwrite_mode_);
922  }
923 
924  if (new_buffer == -1) return -1;
925  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
926  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
927  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
928  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
929  auto hdr = getEventHeader_(new_buffer);
930  hdr->is_complete = false;
931  hdr->run_id = run_id_;
932  hdr->subrun_id = subrun_id_;
933  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
934  hdr->sequence_id = seqID;
935  buffer_writes_pending_[new_buffer] = 0;
936  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
937  SetMFIteration("Sequence ID " + std::to_string(seqID));
938 
939  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
940  active_buffers_.insert(new_buffer);
941  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
942  << size() << ","
943  << ReadReadyCount() << ","
944  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
945  << WriteReadyCount(false) << ","
946  << pending_buffers_.size() << ","
947  << active_buffers_.size() << ")";
948 
949  if (requests_)
950  {
951  if (timestamp != Fragment::InvalidTimestamp)
952  {
953  requests_->AddRequest(seqID, timestamp);
954  }
955  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
956  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
957  else
958  {
959  requests_->SendRequest();
960  }
961  }
962  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
963  return new_buffer;
964 }
965 
966 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
967 {
968  if (buffer == -1) return true;
969  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
970  {
971  return true;
972  }
973  ResetReadPos(buffer);
974  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
975  return MoreDataInBuffer(buffer);
976 }
977 
978 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
979 {
980  auto hdr = getEventHeader_(buffer);
981  if (hdr->is_complete)
982  {
983  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
984 
985  {
986  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
987 
988  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
989  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
990  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
991  active_buffers_.erase(buffer);
992  pending_buffers_.insert(buffer);
993 
994  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
995  << size() << ","
996  << ReadReadyCount() << ","
997  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
998  << WriteReadyCount(false) << ","
999  << pending_buffers_.size() << ","
1000  << active_buffers_.size() << ")";
1001  }
1002  if (requests_)
1003  {
1004  requests_->RemoveRequest(hdr->sequence_id);
1005  }
1006  }
1007  CheckPendingBuffers();
1008 }
1009 
1010 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1011 {
1012  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1013 }
1014 
1015 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1016 {
1017  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1018  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1019  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1020  check_pending_buffers_(lk);
1021 }
1022 
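// Hedged descriptive note (added, not in the original source): check_pending_buffers_()
// (the caller must hold sequence_id_mutex_) moves stale active buffers to the pending
// list, releases pending buffers to art in sequence-ID order (handling any scheduled
// subrun rollover), sends routing tokens for newly available buffers, and periodically
// reports shared-memory and event-rate metrics.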
1023 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1024 {
1025  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1026 
1027  auto buffers = GetBuffersOwnedByManager();
1028  for (auto buf : buffers)
1029  {
1030  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1031  {
1032  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_buffers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1033  auto hdr = getEventHeader_(buf);
1034  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1035  {
1036  if (requests_)
1037  {
1038  requests_->RemoveRequest(hdr->sequence_id);
1039  }
1040  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1041  active_buffers_.erase(buf);
1042  pending_buffers_.insert(buf);
1043  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1044  << size() << ","
1045  << ReadReadyCount() << ","
1046  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1047  << WriteReadyCount(false) << ","
1048  << pending_buffers_.size() << ","
1049  << active_buffers_.size() << ")";
1050 
1051  subrun_incomplete_event_count_++;
1052  run_incomplete_event_count_++;
1053  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1054  if (!released_incomplete_events_.count(hdr->sequence_id))
1055  {
1056  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1057  }
1058  else
1059  {
1060  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1061  }
1062  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1063  }
1064 
1065  }
1066  }
1067 
1068  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1069  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1070 
1071  auto counter = 0;
1072  double eventSize = 0;
1073  for (auto buf : sorted_buffers)
1074  {
1075  auto hdr = getEventHeader_(buf);
1076 
1077  if (hdr->sequence_id >= subrun_rollover_event_)
1078  {
1079  TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << "), last released event is " << last_released_event_ << ".";
1080  endSubrun();
1081  startSubrun();
1082  }
1083  if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
1084 
1085  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1086  << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
1087 
1088  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1089  MarkBufferFull(buf);
1090  subrun_event_count_++;
1091  run_event_count_++;
1092  counter++;
1093  eventSize += BufferDataSize(buf);
1094  pending_buffers_.erase(buf);
1095  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1096  << size() << ","
1097  << ReadReadyCount() << ","
1098  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1099  << WriteReadyCount(false) << ","
1100  << pending_buffers_.size() << ","
1101  << active_buffers_.size() << ")";
1102  }
1103 
1104  if (requests_)
1105  {
1106  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1107  auto available_buffers = WriteReadyCount(overwrite_mode_);
1108 
1109  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1110  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1111 
1112  if (available_buffers > outstanding_tokens)
1113  {
1114  auto tokens_to_send = available_buffers - outstanding_tokens;
1115 
1116  while (tokens_to_send > 0)
1117  {
1118  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1119  requests_->SendRoutingToken(1);
1120  tokens_to_send--;
1121  }
1122  }
1123  }
1124 
1125  metric_data_.event_count += counter;
1126  metric_data_.event_size += eventSize;
1127 
1128  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1129  {
1130  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1131  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1132  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1133  metric_data_ = MetricData();
1134 
1135  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1136  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1137  metricMan->sendMetric("Events Released to art this subrun", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
1138  metricMan->sendMetric("Incomplete Events Released to art this subrun", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
1139  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1140 
1141  auto bufferReport = GetBufferReport();
1142  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
1143  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
1144  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
1145  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
1146  auto total = size();
1147  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1148 
1149  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1150  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1151  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1152  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1153  if (total > 0)
1154  {
1155  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1156  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1157  }
1158 
1159  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1160  }
1161  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1162 }
1163 
1164 void artdaq::SharedMemoryEventManager::send_init_frag_()
1165 {
1166  if (init_fragment_ != nullptr)
1167  {
1168  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1169 
1170 #if 0
1171  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1172  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1173  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1174  ostream.close();
1175 #endif
1176 
1177  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1178  TLOG(TLVL_TRACE) << "Init Fragment sent";
1179  }
1180  else if (send_init_fragments_)
1181  {
1182  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1183  }
1184 }
1185 
1186 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
1187 {
1188  if (!init_fragment_ || init_fragment_ == nullptr)
1189  {
1190  init_fragment_.swap(frag);
1191  send_init_frag_();
1192  }
1193 }
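// ---------------------------------------------------------------------------
// Hedged usage sketch (added for illustration, not part of the original source):
// handing the first received init Fragment to the manager so it can be broadcast
// to art processes. "mgr" and "init_frag" are assumed names.
//
// mgr.SetInitFragment(std::move(init_frag));  // only the first call has an effect
// ---------------------------------------------------------------------------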
1194 
1195 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1196 {
1197  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1198  if (art_pset != current_art_pset_ || !current_art_config_file_)
1199  {
1200  current_art_pset_ = art_pset;
1201  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
1202  }
1203  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1204 }
1205 
1206 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
1207 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
1208 #endif