artdaq  v3_03_02
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
7 #include <sys/wait.h>
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
13 
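// Constructor: configures the underlying SharedMemoryManager segments (the main event
// buffers plus a separate broadcast segment) from the FHiCL parameters read below
// (shared_memory_key, buffer_count, max_event_size_bytes or
// expected_fragments_per_event * max_fragment_size_bytes, stale_buffer_timeout_usec,
// broadcast_buffer_count, broadcast_buffer_size, ...).
// Illustrative arithmetic only (these values are not from this file): with
// expected_fragments_per_event: 5 and max_fragment_size_bytes: 0x100000, the default
// per-buffer event size would be 5 * 0x100000 bytes = 5 MiB.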
14 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
15  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
16  pset.get<size_t>("buffer_count"),
17  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
18  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
19  !pset.get<bool>("broadcast_mode", false))
20  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
21  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
22  , queue_size_(pset.get<size_t>("buffer_count"))
23  , run_id_(0)
24  , subrun_id_(0)
25  , subrun_rollover_event_(Fragment::InvalidSequenceID)
26  , last_released_event_(0)
27  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
28  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
29  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
30  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
31  , running_(false)
32  , buffer_writes_pending_()
33  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
34  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
35  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
36  , metric_data_()
37  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
38  , run_event_count_(0)
39  , run_incomplete_event_count_(0)
40  , subrun_event_count_(0)
41  , subrun_incomplete_event_count_(0)
42  , oversize_fragment_count_(0)
43  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
44  , art_processes_()
45  , restart_art_(false)
46  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
47  , manual_art_(pset.get<bool>("manual_art", false))
48  , current_art_pset_(art_pset)
49  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
50  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
51  , requests_(nullptr)
52  , data_pset_(pset)
53  , dropped_data_()
54  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
55  pset.get<size_t>("broadcast_buffer_count", 10),
56  pset.get<size_t>("broadcast_buffer_size", 0x100000),
57  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
58 {
59  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
60  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
61 
62  if (pset.get<bool>("use_art", true) == false)
63  {
64  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
65  num_art_processes_ = 0;
66  }
67  else
68  {
69  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
70  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
71  }
72  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
73 
74  if (overwrite_mode_ && num_art_processes_ > 0)
75  {
76  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
77  }
78  else if (overwrite_mode_)
79  {
80  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
81  }
82 
83  for (size_t ii = 0; ii < size(); ++ii)
84  {
85  buffer_writes_pending_[ii] = 0;
86  }
87 
88  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
89 
90  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
91  SetRank(my_rank);
92  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
93 
94 
95  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
96 }
97 
98 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
99 {
100  TLOG(TLVL_TRACE) << "DESTRUCTOR";
101  if (running_) endOfData();
102  TLOG(TLVL_TRACE) << "Destructor END";
103 }
104 
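// AddFragment(header, ptr): copies a complete, serialized Fragment into the shared-memory
// buffer associated with its sequence ID. getBufferForSequenceID_ returns -1 when no buffer
// is available (reported to the caller as 'false') and -2 when data taking has already
// passed this event, in which case the Fragment is dropped but 'true' is returned so the
// caller does not retry.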
105 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
106 {
107  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
108  << ", sequence_id=" << frag.sequence_id;
109  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
110  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
111  if (buffer == -1) return false;
112  if (buffer == -2)
113  {
114  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
115  return true;
116  }
117 
118  auto hdr = getEventHeader_(buffer);
119  if (update_run_ids_)
120  {
121  hdr->run_id = run_id_;
122  hdr->subrun_id = subrun_id_;
123  }
124 
125  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
126  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
127 
128  TLOG(TLVL_TRACE) << "Checking for complete event";
129  auto fragmentCount = GetFragmentCount(frag.sequence_id);
130  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
131  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
132  << ", fragmentCount=" << fragmentCount
133  << ", num_fragments_per_event=" << num_fragments_per_event_
134  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
135 
136  complete_buffer_(buffer);
137  if (requests_) requests_->SendRequest(true);
138 
139  TLOG(TLVL_TRACE) << "AddFragment END";
140  return true;
141 }
142 
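// Timed wrapper around AddFragment(header, ptr): retries roughly once per millisecond until
// the Fragment is accepted or timeout_usec has elapsed. On failure, ownership of the
// Fragment is handed back to the caller through outfrag.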
143 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
144 {
145  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
146  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
147  auto data = frag->headerAddress();
148  auto start = std::chrono::steady_clock::now();
149  bool sts = false;
150  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
151  {
152  sts = AddFragment(hdr, data);
153  if (!sts) usleep(1000);
154  }
155  if (!sts)
156  {
157  outfrag = std::move(frag);
158  }
159  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
160  return sts;
161 }
162 
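// WriteFragmentHeader: reserves space in the event buffer for the given Fragment header and
// returns a pointer to the payload region that the caller should fill. Returns nullptr when
// no buffer is available and dropIfNoBuffersAvailable is false; when a Fragment must be
// dropped (event already released, queue full, or over-size payload), a pointer into a
// throw-away Fragment in dropped_data_ is returned so the caller's writes go somewhere
// harmless.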
163 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
164 {
165  TLOG(14) << "WriteFragmentHeader BEGIN";
166  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
167 
168  if (buffer < 0)
169  {
170  if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
171  if (buffer == -2)
172  {
173  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
174  }
175  else
176  {
177  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
178  }
179  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
180 
181  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
182  return dropped_data_[frag.fragment_id]->dataBegin();
183  }
184 
185  // Increment this as soon as we know we want to use the buffer
186  buffer_writes_pending_[buffer]++;
187 
188  if (metricMan)
189  {
190  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
191  }
192 
193  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
194 
195  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
196 
197  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
198 
199  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
200  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
201  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
202 
203  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
204  if (frag.word_count - frag.num_words() > 0)
205  {
206  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
207 
208  if (!sts)
209  {
210  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
211  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
212  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
213  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
214 
215  oversize_fragment_count_++;
216 
217  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
218  {
219  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
220  }
221 
222  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
223  return dropped_data_[frag.fragment_id]->dataBegin();
224  }
225  }
226  TLOG(14) << "WriteFragmentHeader END";
227  return pos;
228 
229 }
230 
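// DoneWritingFragment: indicates that the Fragment described by 'frag' is now completely in
// the buffer; decrements the pending-write count and, if this was the last outstanding
// writer, checks whether the event is complete and can be handed to art.
// Minimal sketch of the intended producer-side sequence (illustrative, not taken from this
// file; 'payload' and 'payload_words' are hypothetical caller variables):
//   RawDataType* dest = WriteFragmentHeader(frag_header);
//   if (dest != nullptr)
//   {
//     memcpy(dest, payload, payload_words * sizeof(RawDataType));  // caller copies the payload
//     DoneWritingFragment(frag_header);                            // bookkeeping + completeness check
//   }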
231 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
232 {
233  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
234  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
235  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
236  if (buffer == -2) { return; }
237 
238  {
239  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
240 
241  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
242 
243  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
244 
245  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
246 
247  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
248  auto hdr = getEventHeader_(buffer);
249  if (update_run_ids_)
250  {
251  hdr->run_id = run_id_;
252  hdr->subrun_id = subrun_id_;
253  }
254 
255  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
256  TouchBuffer(buffer);
257 
258  buffer_writes_pending_[buffer]--;
259  if (buffer_writes_pending_[buffer] != 0)
260  {
261  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
262  return;
263  }
264  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
265  auto frag_count = GetFragmentCount(frag.sequence_id);
266  hdr->is_complete = frag_count == num_fragments_per_event_;
267  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
268 #if ART_SUPPORTS_DUPLICATE_EVENTS
269  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
270  {
271  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
272  }
273 #endif
274  }
275 
276  complete_buffer_(buffer);
277  if (requests_) requests_->SendRequest(true);
278  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
279 }
280 
281 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
282 {
283  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
284 }
285 
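// GetFragmentCountInBuffer: walks the buffer's contents, skipping the leading RawEventHeader
// and then stepping from one RawFragmentHeader to the next using each header's word_count,
// counting Fragments that match the requested type (or all Fragments when the type is
// InvalidFragmentType).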
286 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
287 {
288  if (buffer == -1) return 0;
289  ResetReadPos(buffer);
290  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
291 
292  size_t count = 0;
293 
294  while (MoreDataInBuffer(buffer))
295  {
296  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
297  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
298  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
299  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
300  ++count;
301  }
302 
303  return count;
304 }
305 
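// RunArt: runs one art instance, recording the return codes and restarting it until the end
// flag is lowered. In manual_art_ mode the operator is asked to start the art process by hand
// and type in its PID; otherwise the process is fork/exec'd with the generated configuration
// file.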
306 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
307 {
308  do
309  {
310  auto start_time = std::chrono::steady_clock::now();
311  send_init_frag_();
312  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
313 
314  pid_t pid = 0;
315 
316  if (!manual_art_)
317  {
318  char* filename = new char[config_file->getFileName().length() + 1];
319  strcpy(filename, config_file->getFileName().c_str());
320 
321  std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
322  pid = fork();
323  if (pid == 0)
324  { /* child */
325  // 23-May-2018, KAB: added the setting of the partition number env var
326  // in the environment of the child art process so that Globals.hh
327  // will pick it up there and provide it to the artdaq classes that
328  // are used in data transfers, etc. within the art process.
329  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
330  std::string envVarValue = std::to_string(GetPartitionNumber());
331  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
332  {
333  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
334  << "\" in the environment of a child art process. "
335  << "This may result in incorrect TCP port number "
336  << "assignments or other issues, and data may "
337  << "not flow through the system correctly.";
338  }
339  envVarKey = "ARTDAQ_APPLICATION_NAME";
340  envVarValue = app_name;
341  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
342  {
343  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
344  << "\" in the environment of a child art process. ";
345  }
346 
347  execvp("art", &args[0]);
348  delete[] filename;
349  exit(1);
350  }
351  delete[] filename;
352  }
353  else
354  {
355  //Using cin/cout here to ensure console is active (artdaqDriver)
356  std::cout << "Please run the following command in a separate terminal:" << std::endl
357  << "art -c " << config_file->getFileName() << std::endl
358  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
359  << "Finally, return to this window and enter the pid: " << std::endl;
360  std::cin >> pid;
361  }
362  *pid_out = pid;
363 
364  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
365  {
366  std::unique_lock<std::mutex> lk(art_process_mutex_);
367  art_processes_.insert(pid);
368  }
369  siginfo_t status;
370  auto sts = waitid(P_PID, pid, &status, WEXITED);
371  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
372  {
373  std::unique_lock<std::mutex> lk(art_process_mutex_);
374  art_processes_.erase(pid);
375  }
376  if (sts < 0)
377  {
378  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
379  }
380  else if (status.si_code == CLD_EXITED && status.si_status == 0)
381  {
382  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
383  }
384  else
385  {
386  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
387  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
388 
389  auto exit_type = "exited with status code";
390  switch (status.si_code)
391  {
392  case CLD_DUMPED:
393  case CLD_KILLED:
394  exit_type = "was killed with signal";
395  break;
396  case CLD_EXITED:
397  default:
398  break;
399  }
400 
401  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
402  << "art process " << pid << " " << exit_type << " " << status.si_status
403  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
404  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
405  << (restart_art_ ? "restarting" : "not restarting");
406  }
407  } while (restart_art_);
408 }
409 
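// StartArt: starts all configured art processes (num_art_processes_ of them), re-arming
// restart_art_ from always_restart_art_.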
410 void artdaq::SharedMemoryEventManager::StartArt()
411 {
412  restart_art_ = always_restart_art_;
413  if (num_art_processes_ == 0) return;
414  for (size_t ii = 0; ii < num_art_processes_; ++ii)
415  {
416  StartArtProcess(current_art_pset_);
417  }
418 }
419 
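// StartArtProcess: starts one art process using the given ParameterSet, then waits (up to
// about 5 s, or indefinitely in manual_art_ mode) for it to attach to the shared memory
// segment. Returns the PID of the new process, or 0 if it did not start or attach in time.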
420 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
421 {
422  static std::mutex start_art_mutex;
423  std::unique_lock<std::mutex> lk(start_art_mutex);
424  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
425  restart_art_ = always_restart_art_;
426  auto initialCount = GetAttachedCount();
427  auto startTime = std::chrono::steady_clock::now();
428 
429  if (pset != current_art_pset_ || !current_art_config_file_)
430  {
431  current_art_pset_ = pset;
432  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
433  }
434  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
435  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
436  thread.detach();
437 
438  auto currentCount = GetAttachedCount() - initialCount;
439  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
440  {
441  usleep(10000);
442  currentCount = GetAttachedCount() - initialCount;
443  }
444  if ((currentCount < 1 || *pid <= 0) && manual_art_)
445  {
446  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
447  return 0;
448  }
449  else if (currentCount < 1 || *pid <= 0)
450  {
451  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
452  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
453  return 0;
454  }
455  else
456  {
457  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
458  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
459 
460  return *pid;
461  }
462 
463 }
464 
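// ShutdownArtProcesses: shuts down the given set of art PIDs. In the normal (non-manual) case
// the escalation is: SIGQUIT, then wait up to graceful_wait_ms (5000 ms); SIGINT, then wait up
// to int_wait_ms (1000 ms); then SIGKILL until no processes remain. In manual_art_ mode the
// operator is asked to stop the processes by hand.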
465 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
466 {
467  restart_art_ = false;
468  //current_art_config_file_ = nullptr;
469  //current_art_pset_ = fhicl::ParameterSet();
470 
471  auto check_pids = [&](bool print) {
472 
473  std::unique_lock<std::mutex> lk(art_process_mutex_);
474  for (auto pid = pids.begin(); pid != pids.end();)
475  {
476  // 08-May-2018, KAB: protect against killing invalid PIDS
477 
478  if (*pid <= 0)
479  {
480  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
481  << ") from the shutdown list.";
482  pid = pids.erase(pid);
483  }
484  else if (kill(*pid, 0) < 0)
485  {
486  pid = pids.erase(pid);
487  }
488  else
489  {
490  if (print) std::cout << *pid << " ";
491  ++pid;
492  }
493  }
494  };
495  auto count_pids = [&]() {
496  std::unique_lock<std::mutex> lk(art_process_mutex_);
497  return pids.size();
498  };
499  check_pids(false);
500  if (count_pids() == 0)
501  {
502  TLOG(14) << "All art processes already exited, nothing to do.";
503  usleep(1000);
504  return;
505  }
506 
507  if (!manual_art_)
508  {
509  {
510  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
511  std::unique_lock<std::mutex> lk(art_process_mutex_);
512  for (auto pid : pids)
513  {
514  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
515  kill(pid, SIGQUIT);
516  }
517  }
518 
519  int graceful_wait_ms = 5000;
520  int int_wait_ms = 1000;
521 
522  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
523  for (int ii = 0; ii < graceful_wait_ms; ++ii)
524  {
525  usleep(1000);
526 
527  check_pids(false);
528  if (count_pids() == 0)
529  {
530  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
531  return;
532  }
533  }
534 
535  {
536  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
537  std::unique_lock<std::mutex> lk(art_process_mutex_);
538  for (auto pid : pids)
539  {
540  kill(pid, SIGINT);
541  }
542  }
543 
544  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
545  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
546  {
547  usleep(1000);
548 
549  check_pids(false);
550 
551  if (count_pids() == 0)
552  {
553  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
554  return;
555  }
556  }
557 
558  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
559  while (count_pids() > 0)
560  {
561  {
562  std::unique_lock<std::mutex> lk(art_process_mutex_);
563  kill(*pids.begin(), SIGKILL);
564  usleep(1000);
565  }
566  check_pids(false);
567  }
568  }
569  else
570  {
571  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
572  while (count_pids() > 0)
573  {
574  std::cout << "The following PIDs are running: ";
575  check_pids(true);
576  std::cout << std::endl;
577  std::string ignored;
578  std::cin >> ignored;
579  }
580  }
581 }
582 
583 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
584 {
585  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
586  if (restart_art_ || !always_restart_art_) // Art is running
587  {
588  endOfData();
589  }
590  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
591  {
592  broadcasts_.MarkBufferEmpty(ii, true);
593  }
594  if (newRun == 0) newRun = run_id_ + 1;
595 
596  if (art_pset != current_art_pset_ || !current_art_config_file_)
597  {
598  current_art_pset_ = art_pset;
599  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
600  }
601 
602  if (n_art_processes != -1)
603  {
604  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
605  num_art_processes_ = n_art_processes;
606  }
607  startRun(newRun);
608  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
609 }
610 
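// endOfData: signals the art processes that input has ended. Flushes any still-active buffers
// as incomplete events, waits for outstanding buffers to drain (scaled by the expected art
// event processing time), broadcasts an EndOfData Fragment, gives art a chance to exit
// gracefully before calling ShutdownArtProcesses, and finally clears the event buffers and
// resets requests_.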
611 bool artdaq::SharedMemoryEventManager::endOfData()
612 {
613  running_ = false;
614  init_fragment_.reset(nullptr);
615  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
616  restart_art_ = false;
617 
618  size_t initialStoreSize = GetIncompleteEventCount();
619  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
620  << " stale events from the SharedMemoryEventManager.";
621  int counter = initialStoreSize;
622  while (active_buffers_.size() > 0 && counter > 0)
623  {
624  complete_buffer_(*active_buffers_.begin());
625  counter--;
626  }
627  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
628  << " stale events in the SharedMemoryEventManager.";
629 
630 
631  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
632  auto start = std::chrono::steady_clock::now();
633  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
634  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1);//size();
635 
636  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
637 
638  // We will wait until no buffer has been read for the full end-of-data wait time, or until no art processes are left.
639  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
640  {
641  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
642  if (temp != lastReadCount)
643  {
644  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
645  lastReadCount = temp;
646  start = std::chrono::steady_clock::now();
647  }
648  if (lastReadCount > 0) {
649  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
650  usleep(outstanding_buffer_wait_time);
651  }
652  }
653 
654  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
655  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
656 
657  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
658  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
659  bool success = broadcastFragment_(std::move(outFrag), outFrag);
660  if (!success)
661  {
662  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
663  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
664  {
665  broadcasts_.MarkBufferEmpty(ii, true);
666  }
667  broadcastFragment_(std::move(outFrag), outFrag);
668  }
669  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
670 
671  if (get_art_process_count_() > 0)
672  {
673  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
674  if (end_of_data_wait_us == 0)
675  {
676  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
677  end_of_data_wait_us = 100 * 1000000;
678  }
679 
680  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
681  for (size_t ii = 0; ii < sleep_count; ++ii)
682  {
683  usleep(10000);
684  if (get_art_process_count_() == 0) break;
685  }
686  }
687 
688  while (get_art_process_count_() > 0)
689  {
690  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
691 
692  ShutdownArtProcesses(art_processes_);
693  }
694  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
695 
696  ResetAttachedCount();
697 
698  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
699  for (size_t ii = 0; ii < size(); ++ii)
700  {
701  MarkBufferEmpty(ii, true);
702  }
703  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
704  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
705  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
706  // {
707  // broadcasts_.MarkBufferEmpty(ii, true);
708  // }
709  released_incomplete_events_.clear();
710 
711  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestReceiver";
712  requests_.reset(nullptr);
713 
714  TLOG(TLVL_DEBUG) << "endOfData END";
715  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
716  return true;
717 }
718 
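// startRun: begins a new run. Clears the broadcast buffers, (re)starts art, resets the
// run/subrun counters, and creates a new RequestSender, immediately sending routing tokens
// for the full queue depth.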
719 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
720 {
721  running_ = true;
722  init_fragment_.reset(nullptr);
723  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
724  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
725  {
726  broadcasts_.MarkBufferEmpty(ii, true);
727  }
728  StartArt();
729  run_id_ = runID;
730  subrun_id_ = 1;
731  subrun_rollover_event_ = Fragment::InvalidSequenceID;
732  last_released_event_ = 0;
733  run_event_count_ = 0;
734  run_incomplete_event_count_ = 0;
735  subrun_event_count_ = 0;
736  subrun_incomplete_event_count_ = 0;
737  requests_.reset(new RequestSender(data_pset_));
738  if (requests_)
739  {
740  requests_->SendRoutingToken(queue_size_, run_id_);
741  }
742  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
743  << ", max queue size = "
744  << queue_size_
745  << ", queue size = "
746  << GetLockedBufferCount();
747  if (metricMan)
748  {
749  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
750  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
751  }
752 }
753 
754 void artdaq::SharedMemoryEventManager::startSubrun()
755 {
756  ++subrun_id_;
757  subrun_event_count_ = 0;
758  subrun_incomplete_event_count_ = 0;
759  subrun_rollover_event_ = Fragment::InvalidSequenceID;
760  if (metricMan)
761  {
762  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
763  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
764  }
765 }
766 
767 bool artdaq::SharedMemoryEventManager::endRun()
768 {
769  TLOG(TLVL_INFO) << "Ending run " << run_id_;
770  FragmentPtr endOfRunFrag(new
771  Fragment(static_cast<size_t>
772  (ceil(sizeof(my_rank) /
773  static_cast<double>(sizeof(Fragment::value_type))))));
774 
775  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
776  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
777  *endOfRunFrag->dataBegin() = my_rank;
778  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
779 
780  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
781  run_event_count_ = 0;
782  run_incomplete_event_count_ = 0;
783  subrun_event_count_ = 0;
784  subrun_incomplete_event_count_ = 0;
785  oversize_fragment_count_ = 0;
786  return true;
787 }
788 
789 bool artdaq::SharedMemoryEventManager::endSubrun()
790 {
791  TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
792  std::unique_ptr<artdaq::Fragment>
793  endOfSubrunFrag(new
794  Fragment(static_cast<size_t>
795  (ceil(sizeof(my_rank) /
796  static_cast<double>(sizeof(Fragment::value_type))))));
797 
798  TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
799  endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
800  endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
801  *endOfSubrunFrag->dataBegin() = my_rank;
802 
803  broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
804 
805  TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
806  subrun_event_count_ = 0;
807  subrun_incomplete_event_count_ = 0;
808 
809  return true;
810 }
811 
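// rolloverSubrun: schedules a subrun rollover after the specified sequence ID. A boundary of
// 0 or InvalidSequenceID is ignored; boundaries already in the past are rejected (or, if only
// slightly in the past, within 100 events, processed anyway with a warning). If the boundary
// is exactly one past the last released event, the subrun is rolled over immediately.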
812 void artdaq::SharedMemoryEventManager::rolloverSubrun(sequence_id_t boundary)
813 {
814  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
815  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
816 
817  if (boundary < last_released_event_)
818  {
819  auto logLevel = TLVL_ERROR;
820  bool processAnyway = false;
821  if (last_released_event_ - boundary < 100)
822  {
823  logLevel = TLVL_WARNING;
824  processAnyway = true;
825  }
826  TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (last_released_event="
827  << last_released_event_ << ",requested_rollover_boundary=" << boundary << ").";
828  if (!processAnyway) return;
829  }
830  TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
831 
832  // JCF, May-11-2018
833 
834  // subrun_rollover_event_ is used in check_pending_buffers to
835  // trigger an endSubrun()/startSubrun(), but if the last event
836  // sent was right before the boundary we might as well switch
837  // to the new subrun here
838 
839  if (boundary == last_released_event_ + 1) {
840  TLOG(TLVL_INFO) << "rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
841  ", boundary is sequence id " << boundary << ", so will start a new subrun here";
842  endSubrun();
843  startSubrun();
844  subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
845  }
846  else {
847  subrun_rollover_event_ = boundary;
848  }
849 }
850 
851 void artdaq::SharedMemoryEventManager::sendMetrics()
852 {
853  if (metricMan)
854  {
855  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
856  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
857  }
858 
859  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
860  {
861  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
862  return;
863 
864  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
865  std::ostringstream oss;
866  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
867  for (auto& ev : active_buffers_)
868  {
869  auto hdr = getEventHeader_(ev);
870  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
871  }
872  TLOG(TLVL_DEBUG) << oss.str();
873  }
874 }
875 
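// broadcastFragment_: copies the given Fragment, wrapped in its own RawEventHeader, into the
// dedicated broadcast segment for delivery to the attached art processes. Waits up to
// broadcast_timeout_ms_ for a broadcast buffer; on timeout the Fragment is handed back
// through outFrag and false is returned.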
876 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
877 {
878  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
879  auto buffer = broadcasts_.GetBufferForWriting(false);
880  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
881  auto start_time = std::chrono::steady_clock::now();
882  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
883  {
884  usleep(10000);
885  buffer = broadcasts_.GetBufferForWriting(false);
886  }
887  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
888  if (buffer == -1)
889  {
890  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
891  outFrag.swap(frag);
892  return false;
893  }
894 
895  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
896  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
897  hdr->run_id = run_id_;
898  hdr->subrun_id = subrun_id_;
899  hdr->sequence_id = frag->sequenceID();
900  hdr->is_complete = true;
901  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
902 
903  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
904  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
905 
906  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
907  broadcasts_.MarkBufferFull(buffer, -1);
908  outFrag.swap(frag);
909  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
910  return true;
911 }
912 
913 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
914 {
915  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
916 }
917 
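// getBufferForSequenceID_: finds the buffer already associated with the given sequence ID,
// or (when create_new is set) initializes a fresh one, filling in its RawEventHeader and
// adding it to active_buffers_. Returns the buffer number, -1 if no buffer could be obtained,
// or -2 if this sequence ID was already released to art as an incomplete event.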
918 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
919 {
920  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
921  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
922 
923  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
924 
925  auto buffers = GetBuffersOwnedByManager();
926  for (auto& buf : buffers)
927  {
928  auto hdr = getEventHeader_(buf);
929  if (hdr->sequence_id == seqID)
930  {
931  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
932  return buf;
933  }
934  }
935 
936 #if !ART_SUPPORTS_DUPLICATE_EVENTS
937  if (released_incomplete_events_.count(seqID))
938  {
939  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
940  return -2;
941  }
942 #endif
943 
944  if (!create_new) return -1;
945 
946  check_pending_buffers_(lk);
947  int new_buffer = GetBufferForWriting(false);
948 
949  if (new_buffer == -1)
950  {
951  new_buffer = GetBufferForWriting(overwrite_mode_);
952  }
953 
954  if (new_buffer == -1) return -1;
955  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
956  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
957  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
958  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
959  auto hdr = getEventHeader_(new_buffer);
960  hdr->is_complete = false;
961  hdr->run_id = run_id_;
962  hdr->subrun_id = subrun_id_;
963  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
964  hdr->sequence_id = seqID;
965  buffer_writes_pending_[new_buffer] = 0;
966  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
967  SetMFIteration("Sequence ID " + std::to_string(seqID));
968 
969  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
970  active_buffers_.insert(new_buffer);
971  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
972  << size() << ","
973  << ReadReadyCount() << ","
974  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
975  << WriteReadyCount(false) << ","
976  << pending_buffers_.size() << ","
977  << active_buffers_.size() << ")";
978 
979  if (requests_)
980  {
981  if (timestamp != Fragment::InvalidTimestamp)
982  {
983  requests_->AddRequest(seqID, timestamp);
984  }
985  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
986  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
987  else
988  {
989  requests_->SendRequest();
990  }
991  }
992  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
993  return new_buffer;
994 }
995 
996 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
997 {
998  if (buffer == -1) return true;
999  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1000  {
1001  return true;
1002  }
1003  ResetReadPos(buffer);
1004  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1005  return MoreDataInBuffer(buffer);
1006 }
1007 
1008 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1009 {
1010  auto hdr = getEventHeader_(buffer);
1011  if (hdr->is_complete)
1012  {
1013  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1014 
1015  {
1016  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1017 
1018  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1019  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1020  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1021  active_buffers_.erase(buffer);
1022  pending_buffers_.insert(buffer);
1023 
1024  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1025  << size() << ","
1026  << ReadReadyCount() << ","
1027  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1028  << WriteReadyCount(false) << ","
1029  << pending_buffers_.size() << ","
1030  << active_buffers_.size() << ")";
1031  }
1032  if (requests_)
1033  {
1034  requests_->RemoveRequest(hdr->sequence_id);
1035  }
1036  }
1037  CheckPendingBuffers();
1038 }
1039 
1040 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1041 {
1042  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1043 }
1044 
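// CheckPendingBuffers: public entry point that takes sequence_id_mutex_ and then runs
// check_pending_buffers_, which releases timed-out and complete buffers to art, issues
// routing tokens for newly freed buffers, and periodically sends shared-memory occupancy
// metrics.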
1045 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1046 {
1047  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1048  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1049  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1050  check_pending_buffers_(lk);
1051 }
1052 
1053 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1054 {
1055  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1056 
1057  auto buffers = GetBuffersOwnedByManager();
1058  for (auto buf : buffers)
1059  {
1060  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1061  {
1062  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1063  auto hdr = getEventHeader_(buf);
1064  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1065  {
1066  if (requests_)
1067  {
1068  requests_->RemoveRequest(hdr->sequence_id);
1069  }
1070  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1071  active_buffers_.erase(buf);
1072  pending_buffers_.insert(buf);
1073  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1074  << size() << ","
1075  << ReadReadyCount() << ","
1076  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1077  << WriteReadyCount(false) << ","
1078  << pending_buffers_.size() << ","
1079  << active_buffers_.size() << ")";
1080 
1081  subrun_incomplete_event_count_++;
1082  run_incomplete_event_count_++;
1083  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1084  if (!released_incomplete_events_.count(hdr->sequence_id))
1085  {
1086  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1087  }
1088  else
1089  {
1090  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1091  }
1092  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1093  }
1094 
1095  }
1096  }
1097 
1098  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1099  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1100 
1101  auto counter = 0;
1102  double eventSize = 0;
1103  for (auto buf : sorted_buffers)
1104  {
1105  auto hdr = getEventHeader_(buf);
1106 
1107  if (hdr->sequence_id >= subrun_rollover_event_)
1108  {
1109  TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << "), last released event is " << last_released_event_ << ".";
1110  endSubrun();
1111  startSubrun();
1112  }
1113  if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
1114 
1115  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1116  << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
1117 
1118  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1119  MarkBufferFull(buf);
1120  subrun_event_count_++;
1121  run_event_count_++;
1122  counter++;
1123  eventSize += BufferDataSize(buf);
1124  pending_buffers_.erase(buf);
1125  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1126  << size() << ","
1127  << ReadReadyCount() << ","
1128  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1129  << WriteReadyCount(false) << ","
1130  << pending_buffers_.size() << ","
1131  << active_buffers_.size() << ")";
1132  }
1133 
1134  if (requests_)
1135  {
1136  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1137  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1138  auto available_buffers = WriteReadyCount(overwrite_mode_);
1139 
1140  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1141  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1142 
1143  if (available_buffers > outstanding_tokens)
1144  {
1145  auto tokens_to_send = available_buffers - outstanding_tokens;
1146 
1147  while (tokens_to_send > 0)
1148  {
1149  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1150  requests_->SendRoutingToken(1, run_id_);
1151  tokens_to_send--;
1152  }
1153  }
1154  }
1155 
1156  metric_data_.event_count += counter;
1157  metric_data_.event_size += eventSize;
1158 
1159  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1160  {
1161  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1162  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1163  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1164  metric_data_ = MetricData();
1165 
1166  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1167  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1168  metricMan->sendMetric("Events Released to art this subrun", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
1169  metricMan->sendMetric("Incomplete Events Released to art this subrun", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
1170  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1171 
1172  auto bufferReport = GetBufferReport();
1173  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
1174  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
1175  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
1176  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
1177  auto total = size();
1178  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1179 
1180  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1181  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1182  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1183  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1184  if (total > 0)
1185  {
1186  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1187  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1188  }
1189 
1190  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1191  }
1192  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1193 }
1194 
1195 void artdaq::SharedMemoryEventManager::send_init_frag_()
1196 {
1197  if (init_fragment_ != nullptr)
1198  {
1199  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1200 
1201 #if 0
1202  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1203  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1204  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1205  ostream.close();
1206 #endif
1207 
1208  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1209  TLOG(TLVL_TRACE) << "Init Fragment sent";
1210  }
1211  else if (send_init_fragments_)
1212  {
1213  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1214  }
1215 }
1216 
1217 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
1218 {
1219  if (!init_fragment_ || init_fragment_ == nullptr)
1220  {
1221  init_fragment_.swap(frag);
1222  send_init_frag_();
1223  }
1224 }
1225 
1226 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1227 {
1228  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1229  if (art_pset != current_art_pset_ || !current_art_config_file_)
1230  {
1231  current_art_pset_ = art_pset;
1232  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
1233  }
1234  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1235 }
1236 
1237 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
1238 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
1239 #endif