artdaq  v3_09_06
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 
5 #include <memory>
6 #include "artdaq-core/Core/StatisticsCollection.hh"
7 #include "artdaq-core/Utilities/TraceLock.hh"
8 
9 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
10 
11 #define TLVL_BUFFER 40
12 #define TLVL_BUFLCK 41
13 
14 #define build_key(seed) ((seed) + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF))
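Editor's note (illustrative, values assumed): for partition number 2 and PID 0x1234, the macro above composes the shared-memory key as

// build_key(0xEE000000) = 0xEE000000 + ((2 + 1) << 16) + (0x1234 & 0xFFFF)
//                       = 0xEE000000 + 0x00030000 + 0x00001234
//                       = 0xEE031234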
15 
16 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
17 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
18 const std::string artdaq::SharedMemoryEventManager::
19  FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
20 const std::string artdaq::SharedMemoryEventManager::
21  EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
22 
23 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(const fhicl::ParameterSet& pset, fhicl::ParameterSet art_pset)
24  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", build_key(0xEE000000)),
25  pset.get<size_t>("buffer_count"),
26  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
27  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
28  !pset.get<bool>("broadcast_mode", false))
29  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
30  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
31  , queue_size_(pset.get<size_t>("buffer_count"))
32  , run_id_(0)
33  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
34  , max_event_list_length_(pset.get<size_t>("max_event_list_length", 100))
35  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
36  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
37  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
38  , init_fragment_count_(pset.get<size_t>("init_fragment_count", pset.get<bool>("send_init_fragments", true) ? 1 : 0))
39  , running_(false)
40  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
41  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
42  , last_backpressure_report_time_(std::chrono::steady_clock::now())
43  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
44  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
45  , run_event_count_(0)
46  , run_incomplete_event_count_(0)
47  , subrun_event_count_(0)
48  , subrun_incomplete_event_count_(0)
49  , oversize_fragment_count_(0)
50  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
51  , restart_art_(false)
52  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
53  , manual_art_(pset.get<bool>("manual_art", false))
54  , current_art_pset_(art_pset)
55  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
56  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
57  , requests_(nullptr)
58  , data_pset_(pset)
59  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", build_key(0xBB000000)),
60  pset.get<size_t>("broadcast_buffer_count", 10),
61  pset.get<size_t>("broadcast_buffer_size", 0x100000),
62  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
63 {
64  subrun_event_map_[0] = 1;
65  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
66  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
67 
68  if (!pset.get<bool>("use_art", true))
69  {
70  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
71  num_art_processes_ = 0;
72  }
73  else
74  {
75  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
76  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
77  }
78  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
79 
80  if (overwrite_mode_ && num_art_processes_ > 0)
81  {
82  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
83  }
84  else if (overwrite_mode_)
85  {
86  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
87  }
88 
89  for (size_t ii = 0; ii < size(); ++ii)
90  {
91  buffer_writes_pending_[ii] = 0;
92  // Make sure the mutexes are created once
93  std::lock_guard<std::mutex> lk(buffer_mutexes_[ii]);
94  }
95 
96  if (!IsValid())
97  {
98  throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!"; // NOLINT(cert-err60-cpp)
99  }
100 
101  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
102  SetRank(my_rank);
103  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
104 
105  statsHelper_.addMonitoredQuantityName(FRAGMENTS_RECEIVED_STAT_KEY);
106  statsHelper_.addMonitoredQuantityName(EVENTS_RELEASED_STAT_KEY);
107 
108  // fetch the monitoring parameters and create the MonitoredQuantity instances
109  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
110 
111  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
112 }
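Editor's sketch (not part of this file): a minimal construction, assuming only the required parameters read by the constructor above and leaving everything else at its default. Parameter names are taken from the pset.get<> calls; the values are arbitrary.

#include "artdaq/DAQrate/SharedMemoryEventManager.hh"
#include "fhiclcpp/ParameterSet.h"

void example_construct()  // hypothetical helper, illustration only
{
  fhicl::ParameterSet pset;
  pset.put<size_t>("buffer_count", 20);                   // required
  pset.put<size_t>("expected_fragments_per_event", 2);    // required
  pset.put<size_t>("max_fragment_size_bytes", 0x100000);  // required unless max_event_size_bytes is given
  pset.put<bool>("use_art", false);                       // keep the sketch from spawning art processes
  fhicl::ParameterSet art_pset;                           // would carry the art configuration otherwise
  artdaq::SharedMemoryEventManager evm(pset, art_pset);
}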
113 
114 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
115 {
116  TLOG(TLVL_TRACE) << "DESTRUCTOR";
117  if (running_)
118  {
119  try
120  {
121  endOfData();
122  }
123  catch (...)
124  {
125  // IGNORED
126  }
127  }
128  TLOG(TLVL_TRACE) << "Destructor END";
129 }
130 
131 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
132 {
133  if (!running_) return true;
134 
135  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
136  << ", sequence_id=" << frag.sequence_id;
137  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
138  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
139  if (buffer == -1)
140  {
141  return false;
142  }
143  if (buffer == -2)
144  {
145  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
146  return true;
147  }
148 
149  auto hdr = getEventHeader_(buffer);
150  if (update_run_ids_)
151  {
152  hdr->run_id = run_id_;
153  }
154  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
155 
156  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
157  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
158 
159  TLOG(TLVL_TRACE) << "Checking for complete event";
160  auto fragmentCount = GetFragmentCount(frag.sequence_id);
161  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
162  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
163  << ", fragmentCount=" << fragmentCount
164  << ", num_fragments_per_event=" << num_fragments_per_event_
165  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
166 
167  complete_buffer_(buffer);
168  if (requests_)
169  {
170  requests_->SendRequest(true);
171  }
172 
173  TLOG(TLVL_TRACE) << "AddFragment END";
174  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
175  return true;
176 }
177 
178 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
179 {
180  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
181  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress()); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
182  auto data = frag->headerAddress();
183  auto start = std::chrono::steady_clock::now();
184  bool sts = false;
185  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
186  {
187  sts = AddFragment(hdr, data);
188  if (!sts)
189  {
190  usleep(1000);
191  }
192  }
193  if (!sts)
194  {
195  outfrag = std::move(frag);
196  }
197  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
198  return sts;
199 }
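Editor's sketch (not part of this file): handing a fully built Fragment to the store with a one-second timeout. The Fragment constructor and the setSequenceID/setFragmentID setters are the usual artdaq-core ones and are assumed here, not shown in this file.

void example_add_fragment(artdaq::SharedMemoryEventManager& evm)  // hypothetical
{
  artdaq::FragmentPtr frag(new artdaq::Fragment(128));  // 128 payload words
  frag->setSequenceID(1);
  frag->setFragmentID(0);
  artdaq::FragmentPtr rejected;
  if (!evm.AddFragment(std::move(frag), 1000000 /* usec */, rejected))
  {
    // rejected now owns the Fragment that could not be stored before the timeout
  }
}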
200 
201 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
202 {
203  if (!running_) return nullptr;
204  TLOG(14) << "WriteFragmentHeader BEGIN";
205  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
206 
207  if (buffer < 0)
208  {
209  if (buffer == -1 && !dropIfNoBuffersAvailable)
210  {
211  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
212  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
213  {
214  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
215  last_backpressure_report_time_ = std::chrono::steady_clock::now();
216  }
217  return nullptr;
218  }
219  if (buffer == -2)
220  {
221  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
222  }
223  else
224  {
225  TLOG(TLVL_INFO) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
226  }
227  dropped_data_.emplace_back(frag, std::make_unique<Fragment>(frag.word_count - frag.num_words()));
228  auto it = dropped_data_.rbegin();
229 
230  TLOG(TLVL_DEBUG + 3) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into "
231  << static_cast<void*>(it->second->dataBegin()) << " sz=" << it->second->dataSizeBytes();
232 
233  return it->second->dataBegin();
234  }
235 
236  last_backpressure_report_time_ = std::chrono::steady_clock::now();
237  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
238  // Increment this as soon as we know we want to use the buffer
239  buffer_writes_pending_[buffer]++;
240 
241  if (metricMan)
242  {
243  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
244  }
245 
246  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
247 
248  std::unique_lock<std::mutex> lk(buffer_mutexes_.at(buffer));
249 
250  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
251 
252  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
253  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
254 
255  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
256  if (frag.word_count - frag.num_words() > 0)
257  {
258  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
259 
260  if (!sts)
261  {
262  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words(); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
263  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType; // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
264  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
265  dropped_data_.emplace_back(frag, std::make_unique<Fragment>(frag.word_count - frag.num_words()));
266  auto it = dropped_data_.rbegin();
267 
268  oversize_fragment_count_++;
269 
270  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
271  {
272  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
273  }
274 
275  TLOG(TLVL_DEBUG + 3) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id
276  << " into " << static_cast<void*>(it->second->dataBegin());
277  return it->second->dataBegin();
278  }
279  }
280  TLOG(14) << "WriteFragmentHeader END";
281  return pos;
282 }
283 
284 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
285 {
286  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
287 
288  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
289  if (buffer < 0)
290  {
291  for (auto it = dropped_data_.begin(); it != dropped_data_.end(); ++it)
292  {
293  if (it->first == frag)
294  {
295  dropped_data_.erase(it);
296  return;
297  }
298  }
299  if (buffer == -1)
300  {
301  Detach(true, "SharedMemoryEventManager",
302  "getBufferForSequenceID_ returned -1 in DoneWritingFragment. This indicates a possible mismatch between expected Fragment count and the actual number of Fragments received.");
303  }
304  return;
305  }
306 
307  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
308  {
309  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
310 
311  std::unique_lock<std::mutex> lk(buffer_mutexes_.at(buffer));
312 
313  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
314 
315  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << static_cast<int>(frag.type) << ")";
316  auto hdr = getEventHeader_(buffer);
317  if (update_run_ids_)
318  {
319  hdr->run_id = run_id_;
320  }
321  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
322 
323  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
324  TouchBuffer(buffer);
325 
326  if (buffer_writes_pending_[buffer] > 1)
327  {
328  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
329  buffer_writes_pending_[buffer]--;
330  return;
331  }
332  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
333  auto frag_count = GetFragmentCount(frag.sequence_id);
334  hdr->is_complete = frag_count >= num_fragments_per_event_;
335 
336  if (frag_count > num_fragments_per_event_)
337  {
338  TLOG(TLVL_WARNING) << "DoneWritingFragment: This Event has more Fragments ( " << frag_count << " ) than specified in configuration ( " << num_fragments_per_event_ << " )!"
339  << " This is probably due to a misconfiguration and is *not* a reliable mode!";
340  }
341 
342  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
343 #if ART_SUPPORTS_DUPLICATE_EVENTS
344  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
345  {
346  hdr->is_complete = frag_count >= released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
347  }
348 #endif
349 
350  complete_buffer_(buffer);
351 
352  // Move this down here to avoid race condition
353  buffer_writes_pending_[buffer]--;
354  }
355  if (requests_)
356  {
357  requests_->SendRequest(true);
358  }
359  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
360 }
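Editor's sketch (not part of this file) of the two-phase write protocol implemented by WriteFragmentHeader and DoneWritingFragment above: reserve space, copy the payload into the returned location, then signal completion. The cast of headerAddress() follows the usage earlier in this file; <cstring> is assumed for memcpy.

void example_two_phase_write(artdaq::SharedMemoryEventManager& evm, artdaq::FragmentPtr const& frag)  // hypothetical
{
  auto hdr = *reinterpret_cast<artdaq::detail::RawFragmentHeader*>(frag->headerAddress());
  artdaq::RawDataType* payload_dest = evm.WriteFragmentHeader(hdr);
  if (payload_dest != nullptr)
  {
    // The header has already been copied by WriteFragmentHeader; copy only the payload words.
    auto* header_words = reinterpret_cast<artdaq::RawDataType*>(frag->headerAddress());
    memcpy(payload_dest, header_words + hdr.num_words(),
           (hdr.word_count - hdr.num_words()) * sizeof(artdaq::RawDataType));
    evm.DoneWritingFragment(hdr);
  }
}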
361 
362 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
363 {
364  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
365 }
366 
367 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
368 {
369  if (buffer < 0)
370  {
371  return 0;
372  }
373  ResetReadPos(buffer);
374  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
375 
376  size_t count = 0;
377 
378  while (MoreDataInBuffer(buffer))
379  {
380  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
381  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
382  if (type != Fragment::InvalidFragmentType && fragHdr->type != type)
383  {
384  continue;
385  }
386  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
387  ++count;
388  }
389 
390  return count;
391 }
392 
393 void artdaq::SharedMemoryEventManager::RunArt(const std::shared_ptr<art_config_file>& config_file, const std::shared_ptr<std::atomic<pid_t>>& pid_out)
394 {
395  do
396  {
397  auto start_time = std::chrono::steady_clock::now();
398  send_init_frags_();
399  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
400 
401  pid_t pid = 0;
402 
403  if (!manual_art_)
404  {
405  char* filename = new char[config_file->getFileName().length() + 1];
406  memcpy(filename, config_file->getFileName().c_str(), config_file->getFileName().length());
407  filename[config_file->getFileName().length()] = '\0'; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
408 
409 #if DEBUG_ART
410  std::string debugArgS = "--config-out=" + app_name + "_art.out";
411  char* debugArg = new char[debugArgS.length() + 1];
412  memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
413  debugArg[debugArgS.length()] = '\0';
414 
415  std::vector<char*> args{const_cast<char*>("art"), const_cast<char*>("-c"), filename, debugArg, NULL}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
416 #else
417  std::vector<char*> args{const_cast<char*>("art"), const_cast<char*>("-c"), filename, nullptr}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
418 #endif
419 
420  pid = fork();
421  if (pid == 0)
422  { /* child */
423  // 23-May-2018, KAB: added the setting of the partition number env var
424  // in the environment of the child art process so that Globals.hh
425  // will pick it up there and provide it to the artdaq classes that
426  // are used in data transfers, etc. within the art process.
427  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
428  std::string envVarValue = std::to_string(GetPartitionNumber());
429  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
430  {
431  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
432  << "\" in the environment of a child art process. "
433  << "This may result in incorrect TCP port number "
434  << "assignments or other issues, and data may "
435  << "not flow through the system correctly.";
436  }
437  envVarKey = "ARTDAQ_APPLICATION_NAME";
438  envVarValue = app_name;
439  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
440  {
441  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
442  << "\" in the environment of a child art process. ";
443  }
444  envVarKey = "ARTDAQ_RANK";
445  envVarValue = std::to_string(my_rank);
446  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
447  {
448  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
449  << "\" in the environment of a child art process. ";
450  }
451 
452  execvp("art", &args[0]);
453  delete[] filename;
454  exit(1);
455  }
456  delete[] filename;
457  }
458  else
459  {
460  //Using cin/cout here to ensure console is active (artdaqDriver)
461  std::cout << "Please run the following command in a separate terminal:" << std::endl
462  << "art -c " << config_file->getFileName() << std::endl
463  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
464  << "Finally, return to this window and enter the pid: " << std::endl;
465  std::cin >> pid;
466  }
467  *pid_out = pid;
468 
469  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
470  {
471  std::unique_lock<std::mutex> lk(art_process_mutex_);
472  art_processes_.insert(pid);
473  }
474  siginfo_t status;
475  auto sts = waitid(P_PID, pid, &status, WEXITED);
476  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
477  {
478  std::unique_lock<std::mutex> lk(art_process_mutex_);
479  art_processes_.erase(pid);
480  }
481  if (sts < 0)
482  {
483  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
484  }
485  else if (status.si_code == CLD_EXITED && status.si_status == 0)
486  {
487  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
488  }
489  else
490  {
491  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
492  if (art_lifetime < minimum_art_lifetime_s_)
493  {
494  restart_art_ = false;
495  }
496 
497  auto exit_type = "exited with status code";
498  switch (status.si_code)
499  {
500  case CLD_DUMPED:
501  case CLD_KILLED:
502  exit_type = "was killed with signal";
503  break;
504  case CLD_EXITED:
505  default:
506  break;
507  }
508 
509  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
510  << "art process " << pid << " " << exit_type << " " << status.si_status
511  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
512  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
513  << (restart_art_ ? "restarting" : "not restarting");
514  }
515  } while (restart_art_);
516 }
517 
518 void artdaq::SharedMemoryEventManager::StartArt()
519 {
520  restart_art_ = always_restart_art_;
521  if (num_art_processes_ == 0)
522  {
523  return;
524  }
525  for (size_t ii = 0; ii < num_art_processes_; ++ii)
526  {
527  StartArtProcess(current_art_pset_);
528  }
529 }
530 
531 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
532 {
533  static std::mutex start_art_mutex;
534  std::unique_lock<std::mutex> lk(start_art_mutex);
535  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
536  restart_art_ = always_restart_art_;
537  auto initialCount = GetAttachedCount();
538  auto startTime = std::chrono::steady_clock::now();
539 
540  if (pset != current_art_pset_ || !current_art_config_file_)
541  {
542  current_art_pset_ = pset;
543  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
544  }
545  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
546  boost::thread thread([=] { RunArt(current_art_config_file_, pid); });
547  thread.detach();
548 
549  auto currentCount = GetAttachedCount() - initialCount;
550  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
551  {
552  usleep(10000);
553  currentCount = GetAttachedCount() - initialCount;
554  }
555  if ((currentCount < 1 || *pid <= 0) && manual_art_)
556  {
557  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
558  return 0;
559  }
560  if (currentCount < 1 || *pid <= 0)
561  {
562  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
563  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
564  return 0;
565  }
566 
567  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
568  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
569 
570  return *pid;
571 }
572 
573 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
574 {
575  restart_art_ = false;
576  //current_art_config_file_ = nullptr;
577  //current_art_pset_ = fhicl::ParameterSet();
578 
579  auto check_pids = [&](bool print) {
580  std::unique_lock<std::mutex> lk(art_process_mutex_);
581  for (auto pid = pids.begin(); pid != pids.end();)
582  {
583  // 08-May-2018, KAB: protect against killing invalid PIDS
584 
585  if (*pid <= 0)
586  {
587  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
588  << ") from the shutdown list.";
589  pid = pids.erase(pid);
590  }
591  else if (kill(*pid, 0) < 0)
592  {
593  pid = pids.erase(pid);
594  }
595  else
596  {
597  if (print)
598  {
599  std::cout << *pid << " ";
600  }
601  ++pid;
602  }
603  }
604  };
605  auto count_pids = [&]() {
606  std::unique_lock<std::mutex> lk(art_process_mutex_);
607  return pids.size();
608  };
609  check_pids(false);
610  if (count_pids() == 0)
611  {
612  TLOG(14) << "All art processes already exited, nothing to do.";
613  usleep(1000);
614  return;
615  }
616 
617  if (!manual_art_)
618  {
619  int graceful_wait_ms = art_event_processing_time_us_ * size() * 10 / 1000;
620  int gentle_wait_ms = art_event_processing_time_us_ * size() * 2 / 1000;
621  int int_wait_ms = art_event_processing_time_us_ * size() / 1000;
622  auto shutdown_start = std::chrono::steady_clock::now();
623 
624 // if (!overwrite_mode_)
625  {
626  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
627  for (int ii = 0; ii < graceful_wait_ms; ++ii)
628  {
629  usleep(1000);
630 
631  check_pids(false);
632  if (count_pids() == 0)
633  {
634  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
635  return;
636  }
637  }
638  }
639 
640  {
641  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
642  std::unique_lock<std::mutex> lk(art_process_mutex_);
643  for (auto pid : pids)
644  {
645  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
646  kill(pid, SIGQUIT);
647  }
648  }
649 
650  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms << " ms for all art processes to exit from SIGQUIT";
651  for (int ii = 0; ii < gentle_wait_ms; ++ii)
652  {
653  usleep(1000);
654 
655  check_pids(false);
656  if (count_pids() == 0)
657  {
658  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGQUIT).";
659  return;
660  }
661  }
662 
663  {
664  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
665  std::unique_lock<std::mutex> lk(art_process_mutex_);
666  for (auto pid : pids)
667  {
668  kill(pid, SIGINT);
669  }
670  }
671 
672  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit from SIGINT";
673  for (int ii = 0; ii < int_wait_ms; ++ii)
674  {
675  usleep(1000);
676 
677  check_pids(false);
678 
679  if (count_pids() == 0)
680  {
681  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGINT).";
682  return;
683  }
684  }
685 
686  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
687  while (count_pids() > 0)
688  {
689  {
690  std::unique_lock<std::mutex> lk(art_process_mutex_);
691  kill(*pids.begin(), SIGKILL);
692  usleep(1000);
693  }
694  check_pids(false);
695  }
696  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGKILL).";
697  }
698  else
699  {
700  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
701  while (count_pids() > 0)
702  {
703  std::cout << "The following PIDs are running: ";
704  check_pids(true);
705  std::cout << std::endl;
706  std::string ignored;
707  std::cin >> ignored;
708  }
709  }
710 }
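Editor's sketch (not part of this file; <set> assumed to be included): starting one art consumer with a given configuration and shutting every tracked process down later. StartArtProcess returns 0 on failure, per the checks above; ShutdownArtProcesses escalates from a graceful wait to SIGQUIT, SIGINT and finally SIGKILL.

void example_art_lifecycle(artdaq::SharedMemoryEventManager& evm, fhicl::ParameterSet const& art_pset)  // hypothetical
{
  std::set<pid_t> pids;
  pid_t pid = evm.StartArtProcess(art_pset);
  if (pid > 0)
  {
    pids.insert(pid);
  }
  // ... run data through the event store ...
  evm.ShutdownArtProcesses(pids);
}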
711 
712 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
713 {
714  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
715  if (restart_art_ || !always_restart_art_) // Art is running
716  {
717  endOfData();
718  }
719  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
720  {
721  broadcasts_.MarkBufferEmpty(ii, true);
722  }
723  if (newRun == 0)
724  {
725  newRun = run_id_ + 1;
726  }
727 
728  if (art_pset != current_art_pset_ || !current_art_config_file_)
729  {
730  current_art_pset_ = art_pset;
731  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
732  }
733 
734  if (n_art_processes != -1)
735  {
736  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
737  num_art_processes_ = n_art_processes;
738  }
739  startRun(newRun);
740  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
741 }
742 
743 bool artdaq::SharedMemoryEventManager::endOfData()
744 {
745  running_ = false;
746  init_fragments_.clear();
747  received_init_frags_.clear();
748  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
749  restart_art_ = false;
750 
751  auto start = std::chrono::steady_clock::now();
752  auto pendingWriteCount = std::accumulate(buffer_writes_pending_.begin(), buffer_writes_pending_.end(), 0, [](int a, auto& b) { return a + b.second.load(); });
753  TLOG(TLVL_DEBUG) << "endOfData: Waiting for " << pendingWriteCount << " pending writes to complete";
754  while (pendingWriteCount > 0 && TimeUtils::GetElapsedTimeMicroseconds(start) < 1000000)
755  {
756  usleep(10000);
757  pendingWriteCount = std::accumulate(buffer_writes_pending_.begin(), buffer_writes_pending_.end(), 0, [](int a, auto& b) { return a + b.second.load(); });
758  }
759 
760  size_t initialStoreSize = GetIncompleteEventCount();
761  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
762  << " stale events from the SharedMemoryEventManager.";
763  int counter = initialStoreSize;
764  while (!active_buffers_.empty() && counter > 0)
765  {
766  complete_buffer_(*active_buffers_.begin());
767  counter--;
768  }
769  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
770  << " stale events in the SharedMemoryEventManager.";
771 
772  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
773  start = std::chrono::steady_clock::now();
774  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
775  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
776 
777  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
778 
779  // Wait until no buffer has been read for end_of_data_wait_us, or until no art processes are left.
780  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
781  {
782  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
783  if (temp != lastReadCount)
784  {
785  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
786  lastReadCount = temp;
787  start = std::chrono::steady_clock::now();
788  }
789  if (lastReadCount > 0)
790  {
791  TLOG(19) << "About to sleep " << outstanding_buffer_wait_time << " us - lastReadCount=" << lastReadCount << " size=" << size() << " end_of_data_wait_us=" << end_of_data_wait_us;
792  usleep(outstanding_buffer_wait_time);
793  }
794  }
795 
796  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
797  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
798 
799  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
800  FragmentPtrs broadcast;
801  broadcast.emplace_back(Fragment::eodFrag(GetBufferCount()));
802  bool success = broadcastFragments_(broadcast);
803  if (!success)
804  {
805  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
806  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
807  {
808  broadcasts_.MarkBufferEmpty(ii, true);
809  }
810  broadcastFragments_(broadcast);
811  }
812  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
813  while (get_art_process_count_() > 0)
814  {
815  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
816 
817  ShutdownArtProcesses(art_processes_);
818  }
819  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
820 
821  ResetAttachedCount();
822 
823  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
824  for (size_t ii = 0; ii < size(); ++ii)
825  {
826  MarkBufferEmpty(ii, true);
827  }
828  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
829  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
830  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
831  // {
832  // broadcasts_.MarkBufferEmpty(ii, true);
833  // }
834  released_events_.clear();
835  released_incomplete_events_.clear();
836 
837  TLOG(TLVL_DEBUG) << "endOfData END";
838  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
839  return true;
840 }
841 
842 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
843 {
844  running_ = true;
845  init_fragments_.clear();
846  received_init_frags_.clear();
847  statsHelper_.resetStatistics();
848  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
849  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
850  {
851  broadcasts_.MarkBufferEmpty(ii, true);
852  }
853  released_events_.clear();
854  released_incomplete_events_.clear();
855  StartArt();
856  run_id_ = runID;
857  {
858  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
859  subrun_event_map_.clear();
860  subrun_event_map_[0] = 1;
861  }
862  run_event_count_ = 0;
863  run_incomplete_event_count_ = 0;
864  requests_ = std::make_unique<RequestSender>(data_pset_);
865  if (requests_)
866  {
867  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
868  requests_->SendRoutingToken(queue_size_, run_id_);
869  }
870  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
871  << ", max queue size = "
872  << queue_size_
873  << ", queue size = "
874  << GetLockedBufferCount();
875  if (metricMan)
876  {
877  metricMan->sendMetric("Run Number", static_cast<uint64_t>(run_id_), "Run", 1, MetricMode::LastPoint | MetricMode::Persist);
878  }
879 }
880 
881 bool artdaq::SharedMemoryEventManager::endRun()
882 {
883  TLOG(TLVL_INFO) << "Ending run " << run_id_;
884  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
885  static_cast<double>(sizeof(Fragment::value_type))))));
886 
887  TLOG(TLVL_DEBUG) << "Shutting down RequestSender";
888  requests_.reset(nullptr);
889 
890  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
891  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
892  *endOfRunFrag->dataBegin() = my_rank;
893  FragmentPtrs broadcast;
894  broadcast.emplace_back(std::move(endOfRunFrag));
895  broadcastFragments_(broadcast);
896 
897  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
898  run_event_count_ = 0;
899  run_incomplete_event_count_ = 0;
900  oversize_fragment_count_ = 0;
901  {
902  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
903  subrun_event_map_.clear();
904  subrun_event_map_[0] = 1;
905  }
906  return true;
907 }
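Editor's sketch (not part of this file) of the run lifecycle as a caller sees it, using the methods defined above:

void example_run_lifecycle(artdaq::SharedMemoryEventManager& evm)  // hypothetical
{
  evm.startRun(1001);  // clears per-run state, (re)starts art, creates the RequestSender
  // ... AddFragment / WriteFragmentHeader + DoneWritingFragment while data flows ...
  evm.endRun();        // broadcasts an EndOfRun Fragment and tears down the RequestSender
  evm.endOfData();     // flushes remaining buffers and shuts the art processes down
}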
908 
909 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
910 {
911  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
912  if (boundary == 0 || boundary == Fragment::InvalidSequenceID)
913  {
914  return;
915  }
916 
917  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
918 
919  // Don't re-rollover to an already-defined subrun
920  if (!subrun_event_map_.empty() && subrun_event_map_.rbegin()->second == subrun)
921  {
922  return;
923  }
924  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
925  subrun_event_map_[boundary] = subrun;
926  while (subrun_event_map_.size() > max_subrun_event_map_length_)
927  {
928  subrun_event_map_.erase(subrun_event_map_.begin());
929  }
930 }
931 
932 void artdaq::SharedMemoryEventManager::rolloverSubrun()
933 {
934  Fragment::sequence_id_t seqID = 0;
935  subrun_id_t subrun = 0;
936  {
937  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
938  for (auto& it : subrun_event_map_)
939  {
940  if (it.first >= seqID)
941  {
942  seqID = it.first + 1;
943  }
944  if (it.second >= subrun)
945  {
946  subrun = it.second + 1;
947  }
948  }
949  }
950  rolloverSubrun(seqID, subrun);
951 }
952 
953 void artdaq::SharedMemoryEventManager::sendMetrics()
954 {
955  if (metricMan)
956  {
957  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
958  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
959  }
960 
961  if (incomplete_event_report_interval_ms_ > 0 && (GetLockedBufferCount() != 0u))
962  {
963  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
964  {
965  return;
966  }
967 
968  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
969  std::ostringstream oss;
970  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
971  for (auto& ev : active_buffers_)
972  {
973  auto hdr = getEventHeader_(ev);
974  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
975  }
976  TLOG(TLVL_DEBUG) << oss.str();
977  }
978 }
979 
980 bool artdaq::SharedMemoryEventManager::broadcastFragments_(FragmentPtrs& frags)
981 {
982  if (frags.empty())
983  {
984  TLOG(TLVL_ERROR) << "Requested broadcast but no Fragments given!";
985  return false;
986  }
987  TLOG(TLVL_DEBUG) << "Broadcasting Fragments with seqID=" << frags.front()->sequenceID()
988  << ", type " << detail::RawFragmentHeader::SystemTypeToString(frags.front()->type())
989  << ", size=" << frags.front()->sizeBytes() << "B.";
990  auto buffer = broadcasts_.GetBufferForWriting(false);
991  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer 1st buffer=" << buffer;
992  auto start_time = std::chrono::steady_clock::now();
993  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
994  {
995  usleep(10000);
996  buffer = broadcasts_.GetBufferForWriting(false);
997  }
998  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
999  if (buffer == -1)
1000  {
1001  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frags.front()->typeString() << " failed due to timeout waiting for buffer!";
1002  return false;
1003  }
1004 
1005  TLOG(TLVL_DEBUG) << "broadcastFragments_: Filling in RawEventHeader";
1006  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
1007  hdr->run_id = run_id_;
1008  hdr->subrun_id = GetSubrunForSequenceID(frags.front()->sequenceID());
1009  hdr->sequence_id = frags.front()->sequenceID();
1010  hdr->is_complete = true;
1011  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
1012 
1013  for (auto& frag : frags)
1014  {
1015  TLOG(TLVL_DEBUG) << "broadcastFragments_ before Write calls";
1016  if (frag->sequenceID() != hdr->sequence_id || frag->type() != frags.front()->type())
1017  {
1018  TLOG(TLVL_WARNING) << "Not sending fragment because its SequenceID or Type disagrees with leading Fragment";
1019  continue;
1020  }
1021  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
1022  }
1023 
1024  TLOG(TLVL_DEBUG) << "broadcastFragments_ Marking buffer full";
1025  broadcasts_.MarkBufferFull(buffer, -1);
1026  TLOG(TLVL_DEBUG) << "broadcastFragments_ Complete";
1027  return true;
1028 }
1029 
1030 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
1031 {
1032  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
1033 }
1034 
1035 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
1036 {
1037  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
1038 
1039  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
1040  auto it = subrun_event_map_.begin();
1041  subrun_id_t subrun = 1;
1042 
1043  while (it != subrun_event_map_.end() && it->first <= seqID)
1044  {
1045  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
1046  subrun = it->second;
1047  ++it;
1048  }
1049 
1050  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
1051  return subrun;
1052 }
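Editor's worked example (not part of this file), assuming a fresh run so that the map holds only the default entry {0 -> 1}:

void example_subrun_rollover(artdaq::SharedMemoryEventManager& evm)  // hypothetical
{
  evm.rolloverSubrun(100, 2);                // subrun 2 begins at sequence ID 100
  auto a = evm.GetSubrunForSequenceID(99);   // a == 1
  auto b = evm.GetSubrunForSequenceID(100);  // b == 2
  auto c = evm.GetSubrunForSequenceID(150);  // c == 2
  (void)a; (void)b; (void)c;
}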
1053 
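// Editor's note on the helper below: getBufferForSequenceID_ returns the index of the
// buffer holding seqID, creating a new one when create_new is true. It returns -1 when
// no buffer is available (or creation was not requested) and -2 when the event has
// already been released to art; AddFragment and WriteFragmentHeader above rely on
// exactly these two sentinel values.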
1054 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
1055 {
1056  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
1057  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1058 
1059  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
1060 
1061  auto buffers = GetBuffersOwnedByManager();
1062  for (auto& buf : buffers)
1063  {
1064  auto hdr = getEventHeader_(buf);
1065  if (hdr->sequence_id == seqID)
1066  {
1067  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
1068  return buf;
1069  }
1070  }
1071 
1072 #if !ART_SUPPORTS_DUPLICATE_EVENTS
1073  if (released_incomplete_events_.count(seqID) != 0u)
1074  {
1075  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
1076  return -2;
1077  }
1078  if (released_events_.count(seqID) != 0u)
1079  {
1080  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been completed and released to art! Check configuration for inconsistent Fragment count per event!";
1081  return -2;
1082  }
1083 #endif
1084 
1085  if (!create_new)
1086  {
1087  return -1;
1088  }
1089 
1090  check_pending_buffers_(lk);
1091  int new_buffer = GetBufferForWriting(false);
1092 
1093  if (new_buffer == -1)
1094  {
1095  new_buffer = GetBufferForWriting(overwrite_mode_);
1096  }
1097 
1098  if (new_buffer == -1)
1099  {
1100  return -1;
1101  }
1102  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
1103  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_.at(new_buffer));
1104  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
1105  auto hdr = getEventHeader_(new_buffer);
1106  hdr->is_complete = false;
1107  hdr->run_id = run_id_;
1108  hdr->subrun_id = GetSubrunForSequenceID(seqID);
1109  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
1110  hdr->sequence_id = seqID;
1111  hdr->timestamp = timestamp;
1112  buffer_writes_pending_[new_buffer] = 0;
1113  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
1114  SetMFIteration("Sequence ID " + std::to_string(seqID));
1115 
1116  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
1117  active_buffers_.insert(new_buffer);
1118  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1119  << size() << ","
1120  << ReadReadyCount() << ","
1121  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1122  << WriteReadyCount(false) << ","
1123  << pending_buffers_.size() << ","
1124  << active_buffers_.size() << ")";
1125 
1126  if (requests_)
1127  {
1128  if (timestamp != Fragment::InvalidTimestamp)
1129  {
1130  requests_->AddRequest(seqID, timestamp);
1131  }
1132  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
1133  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1134  else
1135  {
1136  requests_->SendRequest();
1137  }
1138  }
1139  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1140  return new_buffer;
1141 }
1142 
1143 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1144 {
1145  if (buffer == -1)
1146  {
1147  return true;
1148  }
1149  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1150  {
1151  return true;
1152  }
1153  ResetReadPos(buffer);
1154  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1155  return MoreDataInBuffer(buffer);
1156 }
1157 
1158 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1159 {
1160  auto hdr = getEventHeader_(buffer);
1161  if (hdr->is_complete)
1162  {
1163  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1164 
1165  {
1166  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1167 
1168  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1169  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1170  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1171  active_buffers_.erase(buffer);
1172  pending_buffers_.insert(buffer);
1173  released_events_.insert(hdr->sequence_id);
1174  while (released_events_.size() > max_event_list_length_)
1175  {
1176  released_events_.erase(released_events_.begin());
1177  }
1178 
1179  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1180  << size() << ","
1181  << ReadReadyCount() << ","
1182  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1183  << WriteReadyCount(false) << ","
1184  << pending_buffers_.size() << ","
1185  << active_buffers_.size() << ")";
1186  }
1187  if (requests_)
1188  {
1189  requests_->RemoveRequest(hdr->sequence_id);
1190  }
1191  }
1192  CheckPendingBuffers();
1193 }
1194 
1195 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1196 {
1197  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1198 }
1199 
1200 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1201 {
1202  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1203  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1204  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1205  check_pending_buffers_(lk);
1206 }
1207 
1208 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1209 {
1210  TLOG(14) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1211 
1212  auto buffers = GetBuffersOwnedByManager();
1213  for (auto buf : buffers)
1214  {
1215  if (ResetBuffer(buf) && (pending_buffers_.count(buf) == 0u))
1216  {
1217  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1218  auto hdr = getEventHeader_(buf);
1219  if ((active_buffers_.count(buf) != 0u) && buffer_writes_pending_[buf].load() == 0)
1220  {
1221  if (requests_)
1222  {
1223  requests_->RemoveRequest(hdr->sequence_id);
1224  }
1225  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1226  active_buffers_.erase(buf);
1227  pending_buffers_.insert(buf);
1228  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1229  << size() << ","
1230  << ReadReadyCount() << ","
1231  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1232  << WriteReadyCount(false) << ","
1233  << pending_buffers_.size() << ","
1234  << active_buffers_.size() << ")";
1235 
1236  run_incomplete_event_count_++;
1237  if (metricMan)
1238  {
1239  metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1240  }
1241  if (released_incomplete_events_.count(hdr->sequence_id) == 0u)
1242  {
1243  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1244  }
1245  else
1246  {
1247  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1248  }
1249  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1250  }
1251  }
1252  }
1253 
1254  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1255  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1256 
1257  auto counter = 0;
1258  double eventSize = 0;
1259  for (auto buf : sorted_buffers)
1260  {
1261  auto hdr = getEventHeader_(buf);
1262  auto thisEventSize = BufferDataSize(buf);
1263 
1264  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1265  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1266  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1267 
1268  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1269  MarkBufferFull(buf);
1270  run_event_count_++;
1271  counter++;
1272  eventSize += thisEventSize;
1273  pending_buffers_.erase(buf);
1274  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1275  << size() << ","
1276  << ReadReadyCount() << ","
1277  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1278  << WriteReadyCount(false) << ","
1279  << pending_buffers_.size() << ","
1280  << active_buffers_.size() << ")";
1281  }
1282 
1283  if (requests_ && requests_->RoutingTokenSendsEnabled())
1284  {
1285  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1286  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1287  auto available_buffers = WriteReadyCount(overwrite_mode_);
1288 
1289  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1290  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1291 
1292  if (available_buffers > outstanding_tokens)
1293  {
1294  auto tokens_to_send = available_buffers - outstanding_tokens;
1295 
1296  while (tokens_to_send > 0)
1297  {
1298  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1299  requests_->SendRoutingToken(1, run_id_);
1300  tokens_to_send--;
1301  }
1302  }
1303  }
1304 
1305  if (statsHelper_.readyToReport())
1306  {
1307  std::string statString = buildStatisticsString_();
1308  TLOG(TLVL_INFO) << statString;
1309  }
1310 
1311  if (metricMan)
1312  {
1313  TLOG(14) << "check_pending_buffers_: Sending Metrics";
1314  metricMan->sendMetric("Event Rate", counter, "Events", 1, MetricMode::Rate);
1315  metricMan->sendMetric("Data Rate", eventSize, "Bytes", 1, MetricMode::Rate);
1316  if (counter > 0)
1317  {
1318  metricMan->sendMetric("Average Event Size", eventSize / counter, "Bytes", 1, MetricMode::Average);
1319  }
1320 
1321  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1322  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1323  if (requests_)
1324  {
1325  metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1326  }
1327 
1328  auto bufferReport = GetBufferReport();
1329  int full = 0, empty = 0, writing = 0, reading = 0;
1330  for (auto& buf : bufferReport)
1331  {
1332  switch (buf.second)
1333  {
1334  case BufferSemaphoreFlags::Full:
1335  full++;
1336  break;
1337  case BufferSemaphoreFlags::Empty:
1338  empty++;
1339  break;
1340  case BufferSemaphoreFlags::Writing:
1341  writing++;
1342  break;
1343  case BufferSemaphoreFlags::Reading:
1344  reading++;
1345  break;
1346  }
1347  }
1348  auto total = size();
1349  TLOG(15) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1350 
1351  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1352  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1353  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1354  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1355  if (total > 0)
1356  {
1357  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1358  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1359  }
1360  }
1361  TLOG(14) << "check_pending_buffers_ END";
1362 }
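Editor's note on the routing-token arithmetic in check_pending_buffers_ above (numbers assumed for illustration):

// Example: if 12 tokens have been sent, 9 events have been released this run, and 5
// buffers are currently writable, then outstanding_tokens = 12 - 9 = 3 and
// tokens_to_send = 5 - 3 = 2, so two more tokens are sent. Upstream senders therefore
// never hold more credits than there are free shared-memory buffers.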
1363 
1364 void artdaq::SharedMemoryEventManager::send_init_frags_()
1365 {
1366  if (init_fragments_.size() >= init_fragment_count_ && init_fragment_count_ > 0)
1367  {
1368  TLOG(TLVL_INFO) << "Broadcasting " << init_fragments_.size() << " Init Fragment(s) to all art subprocesses...";
1369 
1370 #if 0
1371  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1372  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1373  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1374  ostream.close();
1375 #endif
1376 
1377  broadcastFragments_(init_fragments_);
1378  TLOG(TLVL_TRACE) << "Init Fragment sent";
1379  }
1380  else if (init_fragment_count_ > 0 && init_fragments_.size() == 0)
1381  {
1382  TLOG(TLVL_WARNING) << "Cannot send Init Fragment(s) because I haven't yet received them! Set send_init_fragments to false or init_fragment_count to 0 if this process does not receive serialized art events to avoid potentially lengthy timeouts!";
1383  }
1384  else if (init_fragment_count_ > 0)
1385  {
1386  TLOG(TLVL_INFO) << "Cannot send Init Fragment(s) because I haven't yet received them (have " << init_fragments_.size() << " of " << init_fragment_count_ << ")!";
1387  }
1388  else
1389  {
1390  // Send an empty Init Fragment so that ArtdaqInput knows that this is a pure-Fragment input
1391  artdaq::FragmentPtrs begin_run_fragments_;
1392  begin_run_fragments_.emplace_back(new artdaq::Fragment());
1393  begin_run_fragments_.back()->setSystemType(artdaq::Fragment::InitFragmentType);
1394  broadcastFragments_(begin_run_fragments_);
1395  }
1396 }
1397 
1398 void artdaq::SharedMemoryEventManager::AddInitFragment(FragmentPtr& frag)
1399 {
1400  static std::mutex init_fragment_mutex;
1401  std::lock_guard<std::mutex> lk(init_fragment_mutex);
1402  if (received_init_frags_.count(frag->fragmentID()) == 0)
1403  {
1404  TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << frag->fragmentID() << ". Now have " << init_fragments_.size() + 1 << " of " << init_fragment_count_;
1405  received_init_frags_.insert(frag->fragmentID());
1406  init_fragments_.push_back(std::move(frag));
1407 
1408  // Don't send until all init fragments have been received
1409  if (init_fragments_.size() >= init_fragment_count_)
1410  {
1411  send_init_frags_();
1412  }
1413  }
1414  else
1415  {
1416  TLOG(TLVL_TRACE) << "Ignoring duplicate Init Fragment from rank " << frag->fragmentID();
1417  }
1418 }
1419 
1420 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1421 {
1422  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1423  if (art_pset != current_art_pset_ || !current_art_config_file_)
1424  {
1425  current_art_pset_ = art_pset;
1426  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1427  }
1428  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1429 }
1430 
1431 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const
1432 {
1433  std::ostringstream oss;
1434  oss << app_name << " statistics:" << std::endl;
1435 
1436  artdaq::MonitoredQuantityPtr mqPtr =
1437  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1438  if (mqPtr.get() != nullptr)
1439  {
1440  artdaq::MonitoredQuantityStats stats;
1441  mqPtr->getStats(stats);
1442  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1443  << " events/sec, effective data rate = "
1444  << (stats.recentValueRate / 1024.0 / 1024.0)
1445  << " MB/sec, monitor window = " << stats.recentDuration
1446  << " sec, min::max event size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1447  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1448  if (stats.recentSampleRate > 0.0)
1449  {
1450  oss << " Average time per event: ";
1451  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1452  }
1453  }
1454 
1455  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1456  if (mqPtr.get() != nullptr)
1457  {
1458  artdaq::MonitoredQuantityStats stats;
1459  mqPtr->getStats(stats);
1460  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1461  << " fragments/sec, effective data rate = "
1462  << (stats.recentValueRate / 1024.0 / 1024.0)
1463  << " MB/sec, monitor window = " << stats.recentDuration
1464  << " sec, min::max fragment size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1465  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1466  }
1467 
1468  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1469  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1470  << std::endl;
1471  return oss.str();
1472 }