artdaq  v3_09_02
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 
5 #include <memory>
6 #include "artdaq-core/Core/StatisticsCollection.hh"
7 #include "artdaq-core/Utilities/TraceLock.hh"
8 
9 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
10 
11 #define TLVL_BUFFER 40
12 #define TLVL_BUFLCK 41
13 
14 #define build_key(seed) ((seed) + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF))
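// Note: build_key() composes the shared-memory key from three fields: the caller-supplied
// seed in the upper bits, the partition number (plus one) shifted into bits 16-23, and the
// low 16 bits of the process ID, so keys from different partitions and processes do not
// collide. Illustration (assumed values): seed 0xEE000000, partition 0, and pid 0x1234
// give 0xEE011234.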
15 
16 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
17 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
18 const std::string artdaq::SharedMemoryEventManager::
19  FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
20 const std::string artdaq::SharedMemoryEventManager::
21  EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
22 
23 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(const fhicl::ParameterSet& pset, fhicl::ParameterSet art_pset)
24  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", build_key(0xEE000000)),
25  pset.get<size_t>("buffer_count"),
26  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
27  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
28  !pset.get<bool>("broadcast_mode", false))
29  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
30  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
31  , queue_size_(pset.get<size_t>("buffer_count"))
32  , run_id_(0)
33  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
34  , max_event_list_length_(pset.get<size_t>("max_event_list_length", 100))
35  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
36  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
37  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
38  , init_fragment_count_(pset.get<size_t>("init_fragment_count", pset.get<bool>("send_init_fragments", true) ? 1 : 0))
39  , running_(false)
40  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
41  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
42  , last_backpressure_report_time_(std::chrono::steady_clock::now())
43  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
44  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
45  , run_event_count_(0)
46  , run_incomplete_event_count_(0)
47  , subrun_event_count_(0)
48  , subrun_incomplete_event_count_(0)
49  , oversize_fragment_count_(0)
50  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
51  , restart_art_(false)
52  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
53  , manual_art_(pset.get<bool>("manual_art", false))
54  , current_art_pset_(art_pset)
55  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
56  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
57  , requests_(nullptr)
58  , data_pset_(pset)
59  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", build_key(0xBB000000)),
60  pset.get<size_t>("broadcast_buffer_count", 10),
61  pset.get<size_t>("broadcast_buffer_size", 0x100000),
62  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
63 {
64  subrun_event_map_[0] = 1;
65  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
66  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
67 
68  if (!pset.get<bool>("use_art", true))
69  {
70  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
71  num_art_processes_ = 0;
72  }
73  else
74  {
75  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
76  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
77  }
78  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
79 
80  if (overwrite_mode_ && num_art_processes_ > 0)
81  {
 82  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
83  }
84  else if (overwrite_mode_)
85  {
86  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
87  }
88 
89  for (size_t ii = 0; ii < size(); ++ii)
90  {
91  buffer_writes_pending_[ii] = 0;
92  }
93 
94  if (!IsValid())
95  {
96  throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!"; // NOLINT(cert-err60-cpp)
97  }
98 
99  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
100  SetRank(my_rank);
101  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
102 
103  statsHelper_.addMonitoredQuantityName(FRAGMENTS_RECEIVED_STAT_KEY);
104  statsHelper_.addMonitoredQuantityName(EVENTS_RELEASED_STAT_KEY);
105 
106  // fetch the monitoring parameters and create the MonitoredQuantity instances
107  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
108 
109  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
110 }
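// Note: a minimal construction sketch, assuming a configuration that supplies the required
// keys read above (buffer_count, expected_fragments_per_event, and either
// max_event_size_bytes or max_fragment_size_bytes). Values and surrounding names are
// illustrative assumptions only.
//
//   // FHiCL for the data ParameterSet:
//   //   buffer_count: 20
//   //   expected_fragments_per_event: 5
//   //   max_fragment_size_bytes: 0x100000   # or give max_event_size_bytes directly
//   fhicl::ParameterSet pset;      // assumed parsed from the FHiCL above
//   fhicl::ParameterSet art_pset;  // art configuration, forwarded to art_config_file
//   artdaq::SharedMemoryEventManager mgr(pset, art_pset);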
111 
112 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
113 {
114  TLOG(TLVL_TRACE) << "DESTRUCTOR";
115  if (running_)
116  {
117  try
118  {
119  endOfData();
120  }
121  catch (...)
122  {
123  // IGNORED
124  }
125  }
126  TLOG(TLVL_TRACE) << "Destructor END";
127 }
128 
129 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
130 {
131  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
132  << ", sequence_id=" << frag.sequence_id;
133  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
134  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
135  if (buffer == -1)
136  {
137  return false;
138  }
139  if (buffer == -2)
140  {
141  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
142  return true;
143  }
144 
145  auto hdr = getEventHeader_(buffer);
146  if (update_run_ids_)
147  {
148  hdr->run_id = run_id_;
149  }
150  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
151 
152  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
153  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
154 
155  TLOG(TLVL_TRACE) << "Checking for complete event";
156  auto fragmentCount = GetFragmentCount(frag.sequence_id);
157  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
158  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
159  << ", fragmentCount=" << fragmentCount
160  << ", num_fragments_per_event=" << num_fragments_per_event_
161  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
162 
163  complete_buffer_(buffer);
164  if (requests_)
165  {
166  requests_->SendRequest(true);
167  }
168 
169  TLOG(TLVL_TRACE) << "AddFragment END";
170  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
171  return true;
172 }
173 
174 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
175 {
176  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
177  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress()); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
178  auto data = frag->headerAddress();
179  auto start = std::chrono::steady_clock::now();
180  bool sts = false;
181  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
182  {
183  sts = AddFragment(hdr, data);
184  if (!sts)
185  {
186  usleep(1000);
187  }
188  }
189  if (!sts)
190  {
191  outfrag = std::move(frag);
192  }
193  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
194  return sts;
195 }
196 
197 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
198 {
199  TLOG(14) << "WriteFragmentHeader BEGIN";
200  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
201 
202  if (buffer < 0)
203  {
204  if (buffer == -1 && !dropIfNoBuffersAvailable)
205  {
206  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
207  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
208  {
209  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
210  last_backpressure_report_time_ = std::chrono::steady_clock::now();
211  }
212  return nullptr;
213  }
214  if (buffer == -2)
215  {
216  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
217  }
218  else
219  {
220  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
221  }
222  dropped_data_[frag.fragment_id] = std::make_unique<Fragment>(frag.word_count - frag.num_words());
223 
224  TLOG(TLVL_DEBUG + 3) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into "
225  << static_cast<void*>(dropped_data_[frag.fragment_id]->dataBegin()) << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
226 
227  return dropped_data_[frag.fragment_id]->dataBegin();
228  }
229 
230  last_backpressure_report_time_ = std::chrono::steady_clock::now();
231  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
232  // Increment this as soon as we know we want to use the buffer
233  buffer_writes_pending_[buffer]++;
234 
235  if (metricMan)
236  {
237  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
238  }
239 
240  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
241 
242  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
243 
244  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
245 
246  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
247  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
248  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
249 
250  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
251  if (frag.word_count - frag.num_words() > 0)
252  {
253  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
254 
255  if (!sts)
256  {
257  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words(); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
258  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType; // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
259  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
260  dropped_data_[frag.fragment_id] = std::make_unique<Fragment>(frag.word_count - frag.num_words());
261 
262  oversize_fragment_count_++;
263 
264  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
265  {
266  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
267  }
268 
269  TLOG(TLVL_DEBUG + 3) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id
270  << " into " << static_cast<void*>(dropped_data_[frag.fragment_id]->dataBegin());
271  return dropped_data_[frag.fragment_id]->dataBegin();
272  }
273  }
274  TLOG(14) << "WriteFragmentHeader END";
275  return pos;
276 }
277 
278 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
279 {
280  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
281  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
282  if (buffer == -1)
283  {
284  Detach(true, "SharedMemoryEventManager",
285  "getBufferForSequenceID_ returned -1 in DoneWritingFragment. This indicates a possible mismatch between expected Fragment count and the actual number of Fragments received.");
286  }
287  if (buffer == -2) { return; }
288 
289  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
290  {
291  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
292 
293  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
294 
295  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
296 
297  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
298 
299  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << static_cast<int>(frag.type) << ")";
300  auto hdr = getEventHeader_(buffer);
301  if (update_run_ids_)
302  {
303  hdr->run_id = run_id_;
304  }
305  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
306 
307  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
308  TouchBuffer(buffer);
309 
310  buffer_writes_pending_[buffer]--;
311  if (buffer_writes_pending_[buffer] != 0)
312  {
313  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
314  return;
315  }
316  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
317  auto frag_count = GetFragmentCount(frag.sequence_id);
318  hdr->is_complete = frag_count >= num_fragments_per_event_;
319 
320  if (frag_count > num_fragments_per_event_)
321  {
322  TLOG(TLVL_WARNING) << "DoneWritingFragment: This Event has more Fragments ( " << frag_count << " ) than specified in configuration ( " << num_fragments_per_event_ << " )!"
323  << " This is probably due to a misconfiguration and is *not* a reliable mode!";
324  }
325 
326  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
327 #if ART_SUPPORTS_DUPLICATE_EVENTS
328  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
329  {
330  hdr->is_complete = frag_count >= released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
331  }
332 #endif
333  }
334 
335  complete_buffer_(buffer);
336  if (requests_)
337  {
338  requests_->SendRequest(true);
339  }
340  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
341 }
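// Note: WriteFragmentHeader() and DoneWritingFragment() form a two-phase write: reserve
// space and write the header, copy the payload into the returned pointer, then report
// completion so the pending-write count is decremented and event completion is re-checked.
// A caller sketch, assuming a manager 'mgr' and a FragmentPtr 'frag' (names are
// illustrative, not from this file):
//
//   auto hdr = *reinterpret_cast<artdaq::detail::RawFragmentHeader*>(frag->headerAddress());
//   artdaq::RawDataType* dest = mgr.WriteFragmentHeader(hdr);
//   if (dest != nullptr)
//   {
//     // Payload is everything after the RawFragmentHeader: (word_count - num_words()) words
//     memcpy(dest,
//            reinterpret_cast<artdaq::RawDataType*>(frag->headerAddress()) + hdr.num_words(),
//            (hdr.word_count - hdr.num_words()) * sizeof(artdaq::RawDataType));
//     mgr.DoneWritingFragment(hdr);
//   }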
342 
343 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
344 {
345  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
346 }
347 
348 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
349 {
350  if (buffer == -1)
351  {
352  return 0;
353  }
354  ResetReadPos(buffer);
355  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
356 
357  size_t count = 0;
358 
359  while (MoreDataInBuffer(buffer))
360  {
361  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
362  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
363  if (type != Fragment::InvalidFragmentType && fragHdr->type != type)
364  {
365  continue;
366  }
367  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
368  ++count;
369  }
370 
371  return count;
372 }
373 
374 void artdaq::SharedMemoryEventManager::RunArt(const std::shared_ptr<art_config_file>& config_file, const std::shared_ptr<std::atomic<pid_t>>& pid_out)
375 {
376  do
377  {
378  auto start_time = std::chrono::steady_clock::now();
379  send_init_frags_();
380  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
381 
382  pid_t pid = 0;
383 
384  if (!manual_art_)
385  {
386  char* filename = new char[config_file->getFileName().length() + 1];
387  memcpy(filename, config_file->getFileName().c_str(), config_file->getFileName().length());
388  filename[config_file->getFileName().length()] = '\0'; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
389 
390 #if DEBUG_ART
391  std::string debugArgS = "--config-out=" + app_name + "_art.out";
392  char* debugArg = new char[debugArgS.length() + 1];
393  memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
394  debugArg[debugArgS.length()] = '\0';
395 
396  std::vector<char*> args{const_cast<char*>("art"), const_cast<char*>("-c"), filename, debugArg, NULL}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
397 #else
398  std::vector<char*> args{const_cast<char*>("art"), const_cast<char*>("-c"), filename, nullptr}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
399 #endif
400 
401  pid = fork();
402  if (pid == 0)
403  { /* child */
404  // 23-May-2018, KAB: added the setting of the partition number env var
405  // in the environment of the child art process so that Globals.hh
406  // will pick it up there and provide it to the artdaq classes that
407  // are used in data transfers, etc. within the art process.
408  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
409  std::string envVarValue = std::to_string(GetPartitionNumber());
410  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
411  {
412  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
413  << "\" in the environment of a child art process. "
414  << "This may result in incorrect TCP port number "
415  << "assignments or other issues, and data may "
416  << "not flow through the system correctly.";
417  }
418  envVarKey = "ARTDAQ_APPLICATION_NAME";
419  envVarValue = app_name;
420  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
421  {
422  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
423  << "\" in the environment of a child art process. ";
424  }
425  envVarKey = "ARTDAQ_RANK";
426  envVarValue = std::to_string(my_rank);
427  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
428  {
429  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
430  << "\" in the environment of a child art process. ";
431  }
432 
433  execvp("art", &args[0]);
434  delete[] filename;
435  exit(1);
436  }
437  delete[] filename;
438  }
439  else
440  {
441  //Using cin/cout here to ensure console is active (artdaqDriver)
442  std::cout << "Please run the following command in a separate terminal:" << std::endl
443  << "art -c " << config_file->getFileName() << std::endl
444  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
445  << "Finally, return to this window and enter the pid: " << std::endl;
446  std::cin >> pid;
447  }
448  *pid_out = pid;
449 
450  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
451  {
452  std::unique_lock<std::mutex> lk(art_process_mutex_);
453  art_processes_.insert(pid);
454  }
455  siginfo_t status;
456  auto sts = waitid(P_PID, pid, &status, WEXITED);
457  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
458  {
459  std::unique_lock<std::mutex> lk(art_process_mutex_);
460  art_processes_.erase(pid);
461  }
462  if (sts < 0)
463  {
464  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
465  }
466  else if (status.si_code == CLD_EXITED && status.si_status == 0)
467  {
468  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
469  }
470  else
471  {
472  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
473  if (art_lifetime < minimum_art_lifetime_s_)
474  {
475  restart_art_ = false;
476  }
477 
478  auto exit_type = "exited with status code";
479  switch (status.si_code)
480  {
481  case CLD_DUMPED:
482  case CLD_KILLED:
483  exit_type = "was killed with signal";
484  break;
485  case CLD_EXITED:
486  default:
487  break;
488  }
489 
490  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
491  << "art process " << pid << " " << exit_type << " " << status.si_status
492  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
493  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
494  << (restart_art_ ? "restarting" : "not restarting");
495  }
496  } while (restart_art_);
497 }
498 
499 void artdaq::SharedMemoryEventManager::StartArt()
500 {
501  restart_art_ = always_restart_art_;
502  if (num_art_processes_ == 0)
503  {
504  return;
505  }
506  for (size_t ii = 0; ii < num_art_processes_; ++ii)
507  {
508  StartArtProcess(current_art_pset_);
509  }
510 }
511 
511 
512 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
513 {
514  static std::mutex start_art_mutex;
515  std::unique_lock<std::mutex> lk(start_art_mutex);
516  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
517  restart_art_ = always_restart_art_;
518  auto initialCount = GetAttachedCount();
519  auto startTime = std::chrono::steady_clock::now();
520 
521  if (pset != current_art_pset_ || !current_art_config_file_)
522  {
523  current_art_pset_ = pset;
524  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
525  }
526  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
527  boost::thread thread([=] { RunArt(current_art_config_file_, pid); });
528  thread.detach();
529 
530  auto currentCount = GetAttachedCount() - initialCount;
531  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
532  {
533  usleep(10000);
534  currentCount = GetAttachedCount() - initialCount;
535  }
536  if ((currentCount < 1 || *pid <= 0) && manual_art_)
537  {
538  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
539  return 0;
540  }
541  if (currentCount < 1 || *pid <= 0)
542  {
543  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
544  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
545  return 0;
546  }
547 
548  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
549  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
550 
551  return *pid;
552 }
553 
554 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
555 {
556  restart_art_ = false;
557  //current_art_config_file_ = nullptr;
558  //current_art_pset_ = fhicl::ParameterSet();
559 
560  auto check_pids = [&](bool print) {
561  std::unique_lock<std::mutex> lk(art_process_mutex_);
562  for (auto pid = pids.begin(); pid != pids.end();)
563  {
564  // 08-May-2018, KAB: protect against killing invalid PIDS
565 
566  if (*pid <= 0)
567  {
568  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
569  << ") from the shutdown list.";
570  pid = pids.erase(pid);
571  }
572  else if (kill(*pid, 0) < 0)
573  {
574  pid = pids.erase(pid);
575  }
576  else
577  {
578  if (print)
579  {
580  std::cout << *pid << " ";
581  }
582  ++pid;
583  }
584  }
585  };
586  auto count_pids = [&]() {
587  std::unique_lock<std::mutex> lk(art_process_mutex_);
588  return pids.size();
589  };
590  check_pids(false);
591  if (count_pids() == 0)
592  {
593  TLOG(14) << "All art processes already exited, nothing to do.";
594  usleep(1000);
595  return;
596  }
597 
598  if (!manual_art_)
599  {
600  int graceful_wait_ms = art_event_processing_time_us_ * size() * 10 / 1000;
601  int gentle_wait_ms = art_event_processing_time_us_ * size() * 2 / 1000;
602  int int_wait_ms = art_event_processing_time_us_ * size() / 1000;
603  auto shutdown_start = std::chrono::steady_clock::now();
604 
605  if (!overwrite_mode_)
606  {
607  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
608  for (int ii = 0; ii < graceful_wait_ms; ++ii)
609  {
610  usleep(1000);
611 
612  check_pids(false);
613  if (count_pids() == 0)
614  {
615  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
616  return;
617  }
618  }
619  }
620 
621  {
622  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
623  std::unique_lock<std::mutex> lk(art_process_mutex_);
624  for (auto pid : pids)
625  {
626  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
627  kill(pid, SIGQUIT);
628  }
629  }
630 
631  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms << " ms for all art processes to exit from SIGQUIT";
632  for (int ii = 0; ii < gentle_wait_ms; ++ii)
633  {
634  usleep(1000);
635 
636  check_pids(false);
637  if (count_pids() == 0)
638  {
639  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGQUIT).";
640  return;
641  }
642  }
643 
644  {
645  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
646  std::unique_lock<std::mutex> lk(art_process_mutex_);
647  for (auto pid : pids)
648  {
649  kill(pid, SIGINT);
650  }
651  }
652 
653  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit from SIGINT";
654  for (int ii = 0; ii < int_wait_ms; ++ii)
655  {
656  usleep(1000);
657 
658  check_pids(false);
659 
660  if (count_pids() == 0)
661  {
662  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGINT).";
663  return;
664  }
665  }
666 
667  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
668  while (count_pids() > 0)
669  {
670  {
671  std::unique_lock<std::mutex> lk(art_process_mutex_);
672  kill(*pids.begin(), SIGKILL);
673  usleep(1000);
674  }
675  check_pids(false);
676  }
677  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGKILL).";
678  }
679  else
680  {
681  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
682  while (count_pids() > 0)
683  {
684  std::cout << "The following PIDs are running: ";
685  check_pids(true);
686  std::cout << std::endl;
687  std::string ignored;
688  std::cin >> ignored;
689  }
690  }
691 }
692 
693 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
694 {
695  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
696  if (restart_art_ || !always_restart_art_) // Art is running
697  {
698  endOfData();
699  }
700  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
701  {
702  broadcasts_.MarkBufferEmpty(ii, true);
703  }
704  if (newRun == 0)
705  {
706  newRun = run_id_ + 1;
707  }
708 
709  if (art_pset != current_art_pset_ || !current_art_config_file_)
710  {
711  current_art_pset_ = art_pset;
712  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
713  }
714 
715  if (n_art_processes != -1)
716  {
717  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
718  num_art_processes_ = n_art_processes;
719  }
720  startRun(newRun);
721  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
722 }
723 
724 bool artdaq::SharedMemoryEventManager::endOfData()
725 {
726  running_ = false;
727  init_fragments_.clear();
728  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
729  restart_art_ = false;
730 
731  size_t initialStoreSize = GetIncompleteEventCount();
732  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
733  << " stale events from the SharedMemoryEventManager.";
734  int counter = initialStoreSize;
735  while (!active_buffers_.empty() && counter > 0)
736  {
737  complete_buffer_(*active_buffers_.begin());
738  counter--;
739  }
740  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
741  << " stale events in the SharedMemoryEventManager.";
742 
743  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
744  auto start = std::chrono::steady_clock::now();
745  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
746  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
747 
748  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
749 
750  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
751  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
752  {
753  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
754  if (temp != lastReadCount)
755  {
756  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
757  lastReadCount = temp;
758  start = std::chrono::steady_clock::now();
759  }
760  if (lastReadCount > 0)
761  {
762  TLOG(19) << "About to sleep " << outstanding_buffer_wait_time << " us - lastReadCount=" << lastReadCount << " size=" << size() << " end_of_data_wait_us=" << end_of_data_wait_us;
763  usleep(outstanding_buffer_wait_time);
764  }
765  }
766 
767  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
768  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
769 
770  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
771  FragmentPtrs broadcast;
772  broadcast.emplace_back(Fragment::eodFrag(GetBufferCount()));
773  bool success = broadcastFragments_(broadcast);
774  if (!success)
775  {
776  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
777  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
778  {
779  broadcasts_.MarkBufferEmpty(ii, true);
780  }
781  broadcastFragments_(broadcast);
782  }
783  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
784  while (get_art_process_count_() > 0)
785  {
786  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
787 
788  ShutdownArtProcesses(art_processes_);
789  }
790  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
791 
792  ResetAttachedCount();
793 
794  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
795  for (size_t ii = 0; ii < size(); ++ii)
796  {
797  MarkBufferEmpty(ii, true);
798  }
799  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
800  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
801  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
802  // {
803  // broadcasts_.MarkBufferEmpty(ii, true);
804  // }
805  released_events_.clear();
806  released_incomplete_events_.clear();
807 
808  TLOG(TLVL_DEBUG) << "endOfData END";
809  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
810  return true;
811 }
812 
813 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
814 {
815  running_ = true;
816  init_fragments_.clear();
817  statsHelper_.resetStatistics();
818  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
819  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
820  {
821  broadcasts_.MarkBufferEmpty(ii, true);
822  }
823  released_events_.clear();
824  released_incomplete_events_.clear();
825  StartArt();
826  run_id_ = runID;
827  {
828  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
829  subrun_event_map_.clear();
830  subrun_event_map_[0] = 1;
831  }
832  run_event_count_ = 0;
833  run_incomplete_event_count_ = 0;
834  requests_ = std::make_unique<RequestSender>(data_pset_);
835  if (requests_)
836  {
837  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
838  requests_->SendRoutingToken(queue_size_, run_id_);
839  }
840  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
841  << ", max queue size = "
842  << queue_size_
843  << ", queue size = "
844  << GetLockedBufferCount();
845  if (metricMan)
846  {
847  metricMan->sendMetric("Run Number", static_cast<uint64_t>(run_id_), "Run", 1, MetricMode::LastPoint | MetricMode::Persist);
848  }
849 }
850 
851 bool artdaq::SharedMemoryEventManager::endRun()
852 {
853  TLOG(TLVL_INFO) << "Ending run " << run_id_;
854  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
855  static_cast<double>(sizeof(Fragment::value_type))))));
856 
857  TLOG(TLVL_DEBUG) << "Shutting down RequestSender";
858  requests_.reset(nullptr);
859 
860  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
861  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
862  *endOfRunFrag->dataBegin() = my_rank;
863  FragmentPtrs broadcast;
864  broadcast.emplace_back(std::move(endOfRunFrag));
865  broadcastFragments_(broadcast);
866 
867  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
868  run_event_count_ = 0;
869  run_incomplete_event_count_ = 0;
870  oversize_fragment_count_ = 0;
871  {
872  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
873  subrun_event_map_.clear();
874  subrun_event_map_[0] = 1;
875  }
876  return true;
877 }
878 
879 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
880 {
881  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
882  if (boundary == 0 || boundary == Fragment::InvalidSequenceID)
883  {
884  return;
885  }
886 
887  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
888 
889  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
890  subrun_event_map_[boundary] = subrun;
891  while (subrun_event_map_.size() > max_subrun_event_map_length_)
892  {
893  subrun_event_map_.erase(subrun_event_map_.begin());
894  }
895 }
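// Note: subrun_event_map_ maps sequence-ID boundaries to subrun numbers, and
// GetSubrunForSequenceID() below returns the entry with the largest boundary not exceeding
// the given sequence ID. For example (illustrative values), after rolloverSubrun(101, 2)
// the map holds {0 -> 1, 101 -> 2}, so sequence ID 100 still maps to subrun 1 while
// sequence ID 101 maps to subrun 2.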
896 
897 void artdaq::SharedMemoryEventManager::rolloverSubrun()
898 {
899  Fragment::sequence_id_t seqID = 0;
900  subrun_id_t subrun = 0;
901  {
902  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
903  for (auto& it : subrun_event_map_)
904  {
905  if (it.first >= seqID)
906  {
907  seqID = it.first + 1;
908  }
909  if (it.second >= subrun)
910  {
911  subrun = it.second + 1;
912  }
913  }
914  }
915  rolloverSubrun(seqID, subrun);
916 }
917 
918 void artdaq::SharedMemoryEventManager::sendMetrics()
919 {
920  if (metricMan)
921  {
922  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
923  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
924  }
925 
926  if (incomplete_event_report_interval_ms_ > 0 && (GetLockedBufferCount() != 0u))
927  {
928  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
929  {
930  return;
931  }
932 
933  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
934  std::ostringstream oss;
935  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
936  for (auto& ev : active_buffers_)
937  {
938  auto hdr = getEventHeader_(ev);
939  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
940  }
941  TLOG(TLVL_DEBUG) << oss.str();
942  }
943 }
944 
945 bool artdaq::SharedMemoryEventManager::broadcastFragments_(FragmentPtrs& frags)
946 {
947  if (frags.empty())
948  {
949  TLOG(TLVL_ERROR) << "Requested broadcast but no Fragments given!";
950  return false;
951  }
952  TLOG(TLVL_DEBUG) << "Broadcasting Fragments with seqID=" << frags.front()->sequenceID()
953  << ", type " << detail::RawFragmentHeader::SystemTypeToString(frags.front()->type())
954  << ", size=" << frags.front()->sizeBytes() << "B.";
955  auto buffer = broadcasts_.GetBufferForWriting(false);
956  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer 1st buffer=" << buffer;
957  auto start_time = std::chrono::steady_clock::now();
958  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
959  {
960  usleep(10000);
961  buffer = broadcasts_.GetBufferForWriting(false);
962  }
963  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
964  if (buffer == -1)
965  {
966  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frags.front()->typeString() << " failed due to timeout waiting for buffer!";
967  return false;
968  }
969 
970  TLOG(TLVL_DEBUG) << "broadcastFragments_: Filling in RawEventHeader";
971  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
972  hdr->run_id = run_id_;
973  hdr->subrun_id = GetSubrunForSequenceID(frags.front()->sequenceID());
974  hdr->sequence_id = frags.front()->sequenceID();
975  hdr->is_complete = true;
976  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
977 
978  for (auto& frag : frags)
979  {
980  TLOG(TLVL_DEBUG) << "broadcastFragments_ before Write calls";
981  if (frag->sequenceID() != hdr->sequence_id || frag->type() != frags.front()->type())
982  {
983  TLOG(TLVL_WARNING) << "Not sending fragment because its SequenceID or Type disagrees with leading Fragment";
984  continue;
985  }
986  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
987  }
988 
989  TLOG(TLVL_DEBUG) << "broadcastFragments_ Marking buffer full";
990  broadcasts_.MarkBufferFull(buffer, -1);
991  TLOG(TLVL_DEBUG) << "broadcastFragments_ Complete";
992  return true;
993 }
994 
995 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
996 {
997  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
998 }
999 
1000 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
1001 {
1002  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
1003 
1004  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
1005  auto it = subrun_event_map_.begin();
1006  subrun_id_t subrun = 1;
1007 
1008  while (it != subrun_event_map_.end() && it->first <= seqID)
1009  {
1010  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
1011  subrun = it->second;
1012  ++it;
1013  }
1014 
1015  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
1016  return subrun;
1017 }
1018 
1019 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
1020 {
1021  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
1022  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1023 
1024  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
1025 
1026  auto buffers = GetBuffersOwnedByManager();
1027  for (auto& buf : buffers)
1028  {
1029  auto hdr = getEventHeader_(buf);
1030  if (hdr->sequence_id == seqID)
1031  {
1032  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
1033  return buf;
1034  }
1035  }
1036 
1037 #if !ART_SUPPORTS_DUPLICATE_EVENTS
1038  if (released_incomplete_events_.count(seqID) != 0u)
1039  {
1040  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
1041  return -2;
1042  }
1043  if (released_events_.count(seqID) != 0u)
1044  {
1045  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been completed and released to art! Check configuration for inconsistent Fragment count per event!";
1046  return -2;
1047  }
1048 #endif
1049 
1050  if (!create_new)
1051  {
1052  return -1;
1053  }
1054 
1055  check_pending_buffers_(lk);
1056  int new_buffer = GetBufferForWriting(false);
1057 
1058  if (new_buffer == -1)
1059  {
1060  new_buffer = GetBufferForWriting(overwrite_mode_);
1061  }
1062 
1063  if (new_buffer == -1)
1064  {
1065  return -1;
1066  }
1067  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
1068  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
1069  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
1070  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
1071  auto hdr = getEventHeader_(new_buffer);
1072  hdr->is_complete = false;
1073  hdr->run_id = run_id_;
1074  hdr->subrun_id = GetSubrunForSequenceID(seqID);
1075  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
1076  hdr->sequence_id = seqID;
1077  hdr->timestamp = timestamp;
1078  buffer_writes_pending_[new_buffer] = 0;
1079  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
1080  SetMFIteration("Sequence ID " + std::to_string(seqID));
1081 
1082  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
1083  active_buffers_.insert(new_buffer);
1084  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1085  << size() << ","
1086  << ReadReadyCount() << ","
1087  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1088  << WriteReadyCount(false) << ","
1089  << pending_buffers_.size() << ","
1090  << active_buffers_.size() << ")";
1091 
1092  if (requests_)
1093  {
1094  if (timestamp != Fragment::InvalidTimestamp)
1095  {
1096  requests_->AddRequest(seqID, timestamp);
1097  }
1098  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
1099  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1100  else
1101  {
1102  requests_->SendRequest();
1103  }
1104  }
1105  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1106  return new_buffer;
1107 }
1108 
1109 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1110 {
1111  if (buffer == -1)
1112  {
1113  return true;
1114  }
1115  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1116  {
1117  return true;
1118  }
1119  ResetReadPos(buffer);
1120  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1121  return MoreDataInBuffer(buffer);
1122 }
1123 
1124 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1125 {
1126  auto hdr = getEventHeader_(buffer);
1127  if (hdr->is_complete)
1128  {
1129  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1130 
1131  {
1132  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1133 
1134  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1135  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1136  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1137  active_buffers_.erase(buffer);
1138  pending_buffers_.insert(buffer);
1139  released_events_.insert(hdr->sequence_id);
1140  while (released_events_.size() > max_event_list_length_)
1141  {
1142  released_events_.erase(released_events_.begin());
1143  }
1144 
1145  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1146  << size() << ","
1147  << ReadReadyCount() << ","
1148  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1149  << WriteReadyCount(false) << ","
1150  << pending_buffers_.size() << ","
1151  << active_buffers_.size() << ")";
1152  }
1153  if (requests_)
1154  {
1155  requests_->RemoveRequest(hdr->sequence_id);
1156  }
1157  }
1158  CheckPendingBuffers();
1159 }
1160 
1161 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1162 {
1163  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1164 }
1165 
1166 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1167 {
1168  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1169  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1170  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1171  check_pending_buffers_(lk);
1172 }
1173 
1174 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1175 {
1176  TLOG(14) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1177 
1178  auto buffers = GetBuffersOwnedByManager();
1179  for (auto buf : buffers)
1180  {
1181  if (ResetBuffer(buf) && (pending_buffers_.count(buf) == 0u))
1182  {
1183  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1184  auto hdr = getEventHeader_(buf);
1185  if ((active_buffers_.count(buf) != 0u) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1186  {
1187  if (requests_)
1188  {
1189  requests_->RemoveRequest(hdr->sequence_id);
1190  }
1191  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1192  active_buffers_.erase(buf);
1193  pending_buffers_.insert(buf);
1194  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1195  << size() << ","
1196  << ReadReadyCount() << ","
1197  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1198  << WriteReadyCount(false) << ","
1199  << pending_buffers_.size() << ","
1200  << active_buffers_.size() << ")";
1201 
1202  run_incomplete_event_count_++;
1203  if (metricMan)
1204  {
1205  metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1206  }
1207  if (released_incomplete_events_.count(hdr->sequence_id) == 0u)
1208  {
1209  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1210  }
1211  else
1212  {
1213  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1214  }
1215  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1216  }
1217  }
1218  }
1219 
1220  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1221  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1222 
1223  auto counter = 0;
1224  double eventSize = 0;
1225  for (auto buf : sorted_buffers)
1226  {
1227  auto hdr = getEventHeader_(buf);
1228  auto thisEventSize = BufferDataSize(buf);
1229 
1230  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1231  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1232  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1233 
1234  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1235  MarkBufferFull(buf);
1236  run_event_count_++;
1237  counter++;
1238  eventSize += thisEventSize;
1239  pending_buffers_.erase(buf);
1240  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1241  << size() << ","
1242  << ReadReadyCount() << ","
1243  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1244  << WriteReadyCount(false) << ","
1245  << pending_buffers_.size() << ","
1246  << active_buffers_.size() << ")";
1247  }
1248 
1249  if (requests_ && requests_->RoutingTokenSendsEnabled())
1250  {
1251  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1252  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1253  auto available_buffers = WriteReadyCount(overwrite_mode_);
1254 
1255  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1256  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1257 
1258  if (available_buffers > outstanding_tokens)
1259  {
1260  auto tokens_to_send = available_buffers - outstanding_tokens;
1261 
1262  while (tokens_to_send > 0)
1263  {
1264  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1265  requests_->SendRoutingToken(1, run_id_);
1266  tokens_to_send--;
1267  }
1268  }
1269  }
1270 
1271  if (statsHelper_.readyToReport())
1272  {
1273  std::string statString = buildStatisticsString_();
1274  TLOG(TLVL_INFO) << statString;
1275  }
1276 
1277  if (metricMan)
1278  {
1279  TLOG(14) << "check_pending_buffers_: Sending Metrics";
1280  metricMan->sendMetric("Event Rate", counter, "Events", 1, MetricMode::Rate);
1281  metricMan->sendMetric("Data Rate", eventSize, "Bytes", 1, MetricMode::Rate);
1282  if (counter > 0)
1283  {
1284  metricMan->sendMetric("Average Event Size", eventSize / counter, "Bytes", 1, MetricMode::Average);
1285  }
1286 
1287  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1288  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1289  if (requests_)
1290  {
1291  metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1292  }
1293 
1294  auto bufferReport = GetBufferReport();
1295  int full = 0, empty = 0, writing = 0, reading = 0;
1296  for (auto& buf : bufferReport)
1297  {
1298  switch (buf.second)
1299  {
1300  case BufferSemaphoreFlags::Full:
1301  full++;
1302  break;
1303  case BufferSemaphoreFlags::Empty:
1304  empty++;
1305  break;
1306  case BufferSemaphoreFlags::Writing:
1307  writing++;
1308  break;
1309  case BufferSemaphoreFlags::Reading:
1310  reading++;
1311  break;
1312  }
1313  }
1314  auto total = size();
1315  TLOG(15) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1316 
1317  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1318  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1319  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1320  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1321  if (total > 0)
1322  {
1323  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1324  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1325  }
1326  }
1327  TLOG(14) << "check_pending_buffers_ END";
1328 }
1329 
1330 void artdaq::SharedMemoryEventManager::send_init_frags_()
1331 {
1332  if (init_fragments_.size() >= init_fragment_count_ && init_fragment_count_ > 0)
1333  {
1334  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1335 
1336 #if 0
1337  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1338  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1339  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1340  ostream.close();
1341 #endif
1342 
1343  broadcastFragments_(init_fragments_);
1344  TLOG(TLVL_TRACE) << "Init Fragment sent";
1345  }
1346  else if (init_fragment_count_ > 0)
1347  {
1348  TLOG(TLVL_WARNING) << "Cannot send init fragments because I haven't yet received them! Set send_init_fragments to false or init_fragment_count to 0 if this process does not receive serialized art events to avoid potentially lengthy timeouts!";
1349  }
1350  else
1351  {
1352  // Send an empty Init Fragment so that ArtdaqInput knows that this is a pure-Fragment input
1353  artdaq::FragmentPtrs begin_run_fragments_;
1354  begin_run_fragments_.emplace_back(new artdaq::Fragment());
1355  begin_run_fragments_.back()->setSystemType(artdaq::Fragment::InitFragmentType);
1356  broadcastFragments_(begin_run_fragments_);
1357  }
1358 }
1359 
1360 void artdaq::SharedMemoryEventManager::AddInitFragment(FragmentPtr& frag)
1361 {
1362  init_fragments_.push_back(std::move(frag));
1363  send_init_frags_();
1364 }
1365 
1366 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1367 {
1368  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1369  if (art_pset != current_art_pset_ || !current_art_config_file_)
1370  {
1371  current_art_pset_ = art_pset;
1372  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1373  }
1374  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1375 }
1376 
1377 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const
1378 {
1379  std::ostringstream oss;
1380  oss << app_name << " statistics:" << std::endl;
1381 
1382  artdaq::MonitoredQuantityPtr mqPtr =
1383  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1384  if (mqPtr.get() != nullptr)
1385  {
1386  artdaq::MonitoredQuantityStats stats;
1387  mqPtr->getStats(stats);
1388  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1389  << " events/sec, effective data rate = "
1390  << (stats.recentValueRate / 1024.0 / 1024.0)
1391  << " MB/sec, monitor window = " << stats.recentDuration
1392  << " sec, min::max event size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1393  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1394  if (stats.recentSampleRate > 0.0)
1395  {
1396  oss << " Average time per event: ";
1397  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1398  }
1399  }
1400 
1401  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1402  if (mqPtr.get() != nullptr)
1403  {
1404  artdaq::MonitoredQuantityStats stats;
1405  mqPtr->getStats(stats);
1406  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1407  << " fragments/sec, effective data rate = "
1408  << (stats.recentValueRate / 1024.0 / 1024.0)
1409  << " MB/sec, monitor window = " << stats.recentDuration
1410  << " sec, min::max fragment size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1411  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1412  }
1413 
1414  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1415  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1416  << std::endl;
1417  return oss.str();
1418 }