artdaq v3_10_01
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 
5 #include <memory>
6 #include <numeric>
7 
8 #include "artdaq-core/Core/StatisticsCollection.hh"
9 #include "artdaq-core/Utilities/TraceLock.hh"
10 
11 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
12 
13 #define TLVL_BUFFER 40
14 #define TLVL_BUFLCK 41
15 
16 #define build_key(seed) ((seed) + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF))
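// Worked example (illustrative values): with the default seed 0xEE000000, partition
// number 2, and a process whose PID has low 16 bits 0x1234, build_key yields
// 0xEE000000 + ((2 + 1) << 16) + 0x1234 = 0xEE031234, so each partition and process
// maps to a distinct shared memory key.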
17 
18 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
19 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
20 const std::string artdaq::SharedMemoryEventManager::
21  FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
22 const std::string artdaq::SharedMemoryEventManager::
23  EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
24 
25 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(const fhicl::ParameterSet& pset, fhicl::ParameterSet art_pset)
26  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", build_key(0xEE000000)),
27  pset.get<size_t>("buffer_count"),
28  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
29  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
30  !pset.get<bool>("broadcast_mode", false))
31  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
32  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
33  , queue_size_(pset.get<size_t>("buffer_count"))
34  , run_id_(0)
35  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
36  , max_event_list_length_(pset.get<size_t>("max_event_list_length", 100))
37  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
38  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
39  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
40  , init_fragment_count_(pset.get<size_t>("init_fragment_count", pset.get<bool>("send_init_fragments", true) ? 1 : 0))
41  , running_(false)
42  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
43  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
44  , last_backpressure_report_time_(std::chrono::steady_clock::now())
45  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
46  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
47  , run_event_count_(0)
48  , run_incomplete_event_count_(0)
49  , subrun_event_count_(0)
50  , subrun_incomplete_event_count_(0)
51  , oversize_fragment_count_(0)
52  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
53  , restart_art_(false)
54  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
55  , manual_art_(pset.get<bool>("manual_art", false))
56  , current_art_pset_(art_pset)
57  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
58  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
59  , requests_(nullptr)
60  , data_pset_(pset)
61  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", build_key(0xBB000000)),
62  pset.get<size_t>("broadcast_buffer_count", 10),
63  pset.get<size_t>("broadcast_buffer_size", 0x100000),
64  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
65 {
66  subrun_event_map_[0] = 1;
67  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
68  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
69 
70  if (!pset.get<bool>("use_art", true))
71  {
72  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
73  num_art_processes_ = 0;
74  }
75  else
76  {
77  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
78  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
79  }
80  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
81 
82  if (overwrite_mode_ && num_art_processes_ > 0)
83  {
84  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
85  }
86  else if (overwrite_mode_)
87  {
88  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
89  }
90 
91  for (size_t ii = 0; ii < size(); ++ii)
92  {
93  buffer_writes_pending_[ii] = 0;
94  // Make sure the mutexes are created once
95  std::lock_guard<std::mutex> lk(buffer_mutexes_[ii]);
96  }
97 
98  if (!IsValid())
99  {
100  throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!"; // NOLINT(cert-err60-cpp)
101  }
102 
103  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
104  SetRank(my_rank);
105  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
106 
107  statsHelper_.addMonitoredQuantityName(FRAGMENTS_RECEIVED_STAT_KEY);
108  statsHelper_.addMonitoredQuantityName(EVENTS_RELEASED_STAT_KEY);
109 
110  // fetch the monitoring parameters and create the MonitoredQuantity instances
111  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
112 
113  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
114 }
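The constructor takes all of its settings from the FHiCL parameters named in the pset.get calls above. A minimal construction sketch, assuming the fhicl::make_ParameterSet(string, ParameterSet&) overload shipped with this era of fhiclcpp; the helper name and all values are illustrative only:

#include <memory>

#include "artdaq/DAQrate/SharedMemoryEventManager.hh"
#include "fhiclcpp/ParameterSet.h"
#include "fhiclcpp/make_ParameterSet.h"

// Hypothetical helper, not part of artdaq.  "buffer_count" and
// "expected_fragments_per_event" are required by the constructor above; the per-event
// size comes from "max_event_size_bytes" or, as here, from
// expected_fragments_per_event * "max_fragment_size_bytes".
std::unique_ptr<artdaq::SharedMemoryEventManager> makeExampleEventManager()
{
    fhicl::ParameterSet pset;
    fhicl::make_ParameterSet(
        "buffer_count: 20"
        " expected_fragments_per_event: 2"
        " max_fragment_size_bytes: 1048576"
        " art_analyzer_count: 1"
        " use_art: true",
        pset);

    fhicl::ParameterSet art_pset;  // would normally carry the full art configuration
    return std::make_unique<artdaq::SharedMemoryEventManager>(pset, art_pset);
}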
115 
116 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
117 {
118  TLOG(TLVL_TRACE) << "DESTRUCTOR";
119  if (running_)
120  {
121  try
122  {
123  endOfData();
124  }
125  catch (...)
126  {
127  // IGNORED
128  }
129  }
130  TLOG(TLVL_TRACE) << "Destructor END";
131 }
132 
133 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
134 {
135  if (!running_) return true;
136 
137  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
138  << ", sequence_id=" << frag.sequence_id;
139  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
140  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
141  if (buffer == -1)
142  {
143  return false;
144  }
145  if (buffer == -2)
146  {
147  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
148  return true;
149  }
150 
151  auto hdr = getEventHeader_(buffer);
152  if (update_run_ids_)
153  {
154  hdr->run_id = run_id_;
155  }
156  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
157 
158  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
159  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
160 
161  TLOG(TLVL_TRACE) << "Checking for complete event";
162  auto fragmentCount = GetFragmentCount(frag.sequence_id);
163  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
164  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
165  << ", fragmentCount=" << fragmentCount
166  << ", num_fragments_per_event=" << num_fragments_per_event_
167  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
168 
169  complete_buffer_(buffer);
170  if (requests_)
171  {
172  requests_->SendRequest(true);
173  }
174 
175  TLOG(TLVL_TRACE) << "AddFragment END";
176  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
177  return true;
178 }
179 
180 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
181 {
182  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
183  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress()); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
184  auto data = frag->headerAddress();
185  auto start = std::chrono::steady_clock::now();
186  bool sts = false;
187  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
188  {
189  sts = AddFragment(hdr, data);
190  if (!sts)
191  {
192  usleep(1000);
193  }
194  }
195  if (!sts)
196  {
197  outfrag = std::move(frag);
198  }
199  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
200  return sts;
201 }
202 
203 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
204 {
205  if (!running_) return nullptr;
206  TLOG(14) << "WriteFragmentHeader BEGIN";
207  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
208 
209  if (buffer < 0)
210  {
211  if (buffer == -1 && !dropIfNoBuffersAvailable)
212  {
213  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
214  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
215  {
216  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
217  last_backpressure_report_time_ = std::chrono::steady_clock::now();
218  }
219  return nullptr;
220  }
221  if (buffer == -2)
222  {
223  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
224  }
225  else
226  {
227  TLOG(TLVL_INFO) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
228  }
229  dropped_data_.emplace_back(frag, std::make_unique<Fragment>(frag.word_count - frag.num_words()));
230  auto it = dropped_data_.rbegin();
231 
232  TLOG(TLVL_DEBUG + 3) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into "
233  << static_cast<void*>(it->second->dataBegin()) << " sz=" << it->second->dataSizeBytes();
234 
235  return it->second->dataBegin();
236  }
237 
238  last_backpressure_report_time_ = std::chrono::steady_clock::now();
239  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
240  // Increment this as soon as we know we want to use the buffer
241  buffer_writes_pending_[buffer]++;
242 
243  if (metricMan)
244  {
245  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
246  }
247 
248  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
249 
250  std::unique_lock<std::mutex> lk(buffer_mutexes_.at(buffer));
251 
252  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
253 
254  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
255  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
256 
257  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
258  if (frag.word_count - frag.num_words() > 0)
259  {
260  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
261 
262  if (!sts)
263  {
264  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words(); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
265  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType; // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
266  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
267  dropped_data_.emplace_back(frag, std::make_unique<Fragment>(frag.word_count - frag.num_words()));
268  auto it = dropped_data_.rbegin();
269 
270  oversize_fragment_count_++;
271 
272  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
273  {
274  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
275  }
276 
277  TLOG(TLVL_DEBUG + 3) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id
278  << " into " << static_cast<void*>(it->second->dataBegin());
279  return it->second->dataBegin();
280  }
281  }
282  TLOG(14) << "WriteFragmentHeader END";
283  return pos;
284 }
285 
286 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
287 {
288  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
289 
290  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
291  if (buffer < 0)
292  {
293  for (auto it = dropped_data_.begin(); it != dropped_data_.end(); ++it)
294  {
295  if (it->first == frag)
296  {
297  dropped_data_.erase(it);
298  return;
299  }
300  }
301  if (buffer == -1)
302  {
303  Detach(true, "SharedMemoryEventManager",
304  "getBufferForSequenceID_ returned -1 in DoneWritingFragment. This indicates a possible mismatch between expected Fragment count and the actual number of Fragments received.");
305  }
306  return;
307  }
308 
309  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
310  {
311  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
312 
313  std::unique_lock<std::mutex> lk(buffer_mutexes_.at(buffer));
314 
315  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
316 
317  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << static_cast<int>(frag.type) << ")";
318  auto hdr = getEventHeader_(buffer);
319  if (update_run_ids_)
320  {
321  hdr->run_id = run_id_;
322  }
323  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
324 
325  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
326  TouchBuffer(buffer);
327 
328  if (buffer_writes_pending_[buffer] > 1)
329  {
330  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
331  buffer_writes_pending_[buffer]--;
332  return;
333  }
334  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
335  auto frag_count = GetFragmentCount(frag.sequence_id);
336  hdr->is_complete = frag_count >= num_fragments_per_event_;
337 
338  if (frag_count > num_fragments_per_event_)
339  {
340  TLOG(TLVL_WARNING) << "DoneWritingFragment: This Event has more Fragments ( " << frag_count << " ) than specified in configuration ( " << num_fragments_per_event_ << " )!"
341  << " This is probably due to a misconfiguration and is *not* a reliable mode!";
342  }
343 
344  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
345 #if ART_SUPPORTS_DUPLICATE_EVENTS
346  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
347  {
348  hdr->is_complete = frag_count >= released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
349  }
350 #endif
351 
352  complete_buffer_(buffer);
353 
354  // Move this down here to avoid race condition
355  buffer_writes_pending_[buffer]--;
356  }
357  if (requests_)
358  {
359  requests_->SendRequest(true);
360  }
361  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
362 }
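WriteFragmentHeader and DoneWritingFragment form the two-step, write-in-place path used by fragment receivers (in contrast to AddFragment, which copies a fully formed Fragment). A minimal sketch of the intended call order, using a hypothetical writeOneFragment helper:

#include <cstring>

#include "artdaq/DAQrate/SharedMemoryEventManager.hh"

// Illustrative producer-side sequence, not taken from artdaq: reserve space for the
// payload, copy the data words in, then let the manager finish its bookkeeping.
bool writeOneFragment(artdaq::SharedMemoryEventManager& store,
                      artdaq::detail::RawFragmentHeader const& hdr,
                      artdaq::RawDataType const* payload)
{
    // Reserve room in the buffer owning hdr.sequence_id; nullptr means no room yet.
    artdaq::RawDataType* dest = store.WriteFragmentHeader(hdr);
    if (dest == nullptr) { return false; }  // callers typically retry, as AddFragment(FragmentPtr) does

    // Copy the payload (everything after the Fragment header) into the reserved region.
    size_t payload_words = hdr.word_count - hdr.num_words();
    std::memcpy(dest, payload, payload_words * sizeof(artdaq::RawDataType));

    // Tell the manager the write is finished so the event can be completed and released.
    store.DoneWritingFragment(hdr);
    return true;
}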
363 
364 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
365 {
366  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
367 }
368 
369 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
370 {
371  if (buffer < 0)
372  {
373  return 0;
374  }
375  ResetReadPos(buffer);
376  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
377 
378  size_t count = 0;
379 
380  while (MoreDataInBuffer(buffer))
381  {
382  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
383  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
384  if (type != Fragment::InvalidFragmentType && fragHdr->type != type)
385  {
386  continue;
387  }
388  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
389  ++count;
390  }
391 
392  return count;
393 }
394 
395 void artdaq::SharedMemoryEventManager::RunArt(const std::shared_ptr<art_config_file>& config_file, const std::shared_ptr<std::atomic<pid_t>>& pid_out)
396 {
397  do
398  {
399  auto start_time = std::chrono::steady_clock::now();
400  send_init_frags_();
401  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
402 
403  pid_t pid = 0;
404 
405  if (!manual_art_)
406  {
407  char* filename = new char[config_file->getFileName().length() + 1];
408  memcpy(filename, config_file->getFileName().c_str(), config_file->getFileName().length());
409  filename[config_file->getFileName().length()] = '\0'; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
410 
411 #if DEBUG_ART
412  std::string debugArgS = "--config-out=" + app_name + "_art.out";
413  char* debugArg = new char[debugArgS.length() + 1];
414  memcpy(debugArg, debugArgS.c_str(), debugArgS.length());
415  debugArg[debugArgS.length()] = '\0';
416 
417  std::vector<char*> args{const_cast<char*>("art"), const_cast<char*>("-c"), filename, debugArg, NULL}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
418 #else
419  std::vector<char*> args{const_cast<char*>("art"), const_cast<char*>("-c"), filename, nullptr}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
420 #endif
421 
422  pid = fork();
423  if (pid == 0)
424  { /* child */
425  // 23-May-2018, KAB: added the setting of the partition number env var
426  // in the environment of the child art process so that Globals.hh
427  // will pick it up there and provide it to the artdaq classes that
428  // are used in data transfers, etc. within the art process.
429  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
430  std::string envVarValue = std::to_string(GetPartitionNumber());
431  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
432  {
433  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
434  << "\" in the environment of a child art process. "
435  << "This may result in incorrect TCP port number "
436  << "assignments or other issues, and data may "
437  << "not flow through the system correctly.";
438  }
439  envVarKey = "ARTDAQ_APPLICATION_NAME";
440  envVarValue = app_name;
441  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
442  {
443  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
444  << "\" in the environment of a child art process. ";
445  }
446  envVarKey = "ARTDAQ_RANK";
447  envVarValue = std::to_string(my_rank);
448  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
449  {
450  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
451  << "\" in the environment of a child art process. ";
452  }
453 
454  execvp("art", &args[0]);
455  delete[] filename;
456  exit(1);
457  }
458  delete[] filename;
459  }
460  else
461  {
462  //Using cin/cout here to ensure console is active (artdaqDriver)
463  std::cout << "Please run the following command in a separate terminal:" << std::endl
464  << "art -c " << config_file->getFileName() << std::endl
465  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
466  << "Finally, return to this window and enter the pid: " << std::endl;
467  std::cin >> pid;
468  }
469  *pid_out = pid;
470 
471  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
472  {
473  std::unique_lock<std::mutex> lk(art_process_mutex_);
474  art_processes_.insert(pid);
475  }
476  siginfo_t status;
477  auto sts = waitid(P_PID, pid, &status, WEXITED);
478  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
479  {
480  std::unique_lock<std::mutex> lk(art_process_mutex_);
481  art_processes_.erase(pid);
482  }
483  if (sts < 0)
484  {
485  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
486  }
487  else if (status.si_code == CLD_EXITED && status.si_status == 0)
488  {
489  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
490  }
491  else
492  {
493  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
494  if (art_lifetime < minimum_art_lifetime_s_)
495  {
496  restart_art_ = false;
497  }
498 
499  auto exit_type = "exited with status code";
500  switch (status.si_code)
501  {
502  case CLD_DUMPED:
503  case CLD_KILLED:
504  exit_type = "was killed with signal";
505  break;
506  case CLD_EXITED:
507  default:
508  break;
509  }
510 
511  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
512  << "art process " << pid << " " << exit_type << " " << status.si_status
513  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
514  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
515  << (restart_art_ ? "restarting" : "not restarting");
516  }
517  } while (restart_art_);
518 }
519 
520 void artdaq::SharedMemoryEventManager::StartArt()
521 {
522  restart_art_ = always_restart_art_;
523  if (num_art_processes_ == 0)
524  {
525  return;
526  }
527  for (size_t ii = 0; ii < num_art_processes_; ++ii)
528  {
529  StartArtProcess(current_art_pset_);
530  }
531 }
532 
533 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
534 {
535  static std::mutex start_art_mutex;
536  std::unique_lock<std::mutex> lk(start_art_mutex);
537  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
538  restart_art_ = always_restart_art_;
539  auto initialCount = GetAttachedCount();
540  auto startTime = std::chrono::steady_clock::now();
541 
542  if (pset != current_art_pset_ || !current_art_config_file_)
543  {
544  current_art_pset_ = pset;
545  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
546  }
547  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
548  boost::thread thread([=] { RunArt(current_art_config_file_, pid); });
549  thread.detach();
550 
551  auto currentCount = GetAttachedCount() - initialCount;
552  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
553  {
554  usleep(10000);
555  currentCount = GetAttachedCount() - initialCount;
556  }
557  if ((currentCount < 1 || *pid <= 0) && manual_art_)
558  {
559  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
560  return 0;
561  }
562  if (currentCount < 1 || *pid <= 0)
563  {
564  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
565  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
566  return 0;
567  }
568 
569  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
570  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
571 
572  return *pid;
573 }
574 
575 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
576 {
577  restart_art_ = false;
578  //current_art_config_file_ = nullptr;
579  //current_art_pset_ = fhicl::ParameterSet();
580 
581  auto check_pids = [&](bool print) {
582  std::unique_lock<std::mutex> lk(art_process_mutex_);
583  for (auto pid = pids.begin(); pid != pids.end();)
584  {
585  // 08-May-2018, KAB: protect against killing invalid PIDS
586 
587  if (*pid <= 0)
588  {
589  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
590  << ") from the shutdown list.";
591  pid = pids.erase(pid);
592  }
593  else if (kill(*pid, 0) < 0)
594  {
595  pid = pids.erase(pid);
596  }
597  else
598  {
599  if (print)
600  {
601  std::cout << *pid << " ";
602  }
603  ++pid;
604  }
605  }
606  };
607  auto count_pids = [&]() {
608  std::unique_lock<std::mutex> lk(art_process_mutex_);
609  return pids.size();
610  };
611  check_pids(false);
612  if (count_pids() == 0)
613  {
614  TLOG(14) << "All art processes already exited, nothing to do.";
615  usleep(1000);
616  return;
617  }
618 
619  if (!manual_art_)
620  {
621  int graceful_wait_ms = art_event_processing_time_us_ * size() * 10 / 1000;
622  int gentle_wait_ms = art_event_processing_time_us_ * size() * 2 / 1000;
623  int int_wait_ms = art_event_processing_time_us_ * size() / 1000;
624  auto shutdown_start = std::chrono::steady_clock::now();
625 
626 // if (!overwrite_mode_)
627  {
628  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
629  for (int ii = 0; ii < graceful_wait_ms; ++ii)
630  {
631  usleep(1000);
632 
633  check_pids(false);
634  if (count_pids() == 0)
635  {
636  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
637  return;
638  }
639  }
640  }
641 
642  {
643  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
644  std::unique_lock<std::mutex> lk(art_process_mutex_);
645  for (auto pid : pids)
646  {
647  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
648  kill(pid, SIGQUIT);
649  }
650  }
651 
652  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms << " ms for all art processes to exit from SIGQUIT";
653  for (int ii = 0; ii < gentle_wait_ms; ++ii)
654  {
655  usleep(1000);
656 
657  check_pids(false);
658  if (count_pids() == 0)
659  {
660  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGQUIT).";
661  return;
662  }
663  }
664 
665  {
666  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
667  std::unique_lock<std::mutex> lk(art_process_mutex_);
668  for (auto pid : pids)
669  {
670  kill(pid, SIGINT);
671  }
672  }
673 
674  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit from SIGINT";
675  for (int ii = 0; ii < int_wait_ms; ++ii)
676  {
677  usleep(1000);
678 
679  check_pids(false);
680 
681  if (count_pids() == 0)
682  {
683  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGINT).";
684  return;
685  }
686  }
687 
688  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
689  while (count_pids() > 0)
690  {
691  {
692  std::unique_lock<std::mutex> lk(art_process_mutex_);
693  kill(*pids.begin(), SIGKILL);
694  usleep(1000);
695  }
696  check_pids(false);
697  }
698  TLOG(TLVL_INFO) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms (SIGKILL).";
699  }
700  else
701  {
702  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
703  while (count_pids() > 0)
704  {
705  std::cout << "The following PIDs are running: ";
706  check_pids(true);
707  std::cout << std::endl;
708  std::string ignored;
709  std::cin >> ignored;
710  }
711  }
712 }
713 
714 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
715 {
716  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
717  if (restart_art_ || !always_restart_art_) // Art is running
718  {
719  endOfData();
720  }
721  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
722  {
723  broadcasts_.MarkBufferEmpty(ii, true);
724  }
725  if (newRun == 0)
726  {
727  newRun = run_id_ + 1;
728  }
729 
730  if (art_pset != current_art_pset_ || !current_art_config_file_)
731  {
732  current_art_pset_ = art_pset;
733  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
734  }
735 
736  if (n_art_processes != -1)
737  {
738  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
739  num_art_processes_ = n_art_processes;
740  }
741  startRun(newRun);
742  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
743 }
744 
745 bool artdaq::SharedMemoryEventManager::endOfData()
746 {
747  running_ = false;
748  init_fragments_.clear();
749  received_init_frags_.clear();
750  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
751  restart_art_ = false;
752 
753  auto start = std::chrono::steady_clock::now();
754  auto pendingWriteCount = std::accumulate(buffer_writes_pending_.begin(), buffer_writes_pending_.end(), 0, [](int a, auto& b) { return a + b.second.load(); });
755  TLOG(TLVL_DEBUG) << "endOfData: Waiting for " << pendingWriteCount << " pending writes to complete";
756  while (pendingWriteCount > 0 && TimeUtils::GetElapsedTimeMicroseconds(start) < 1000000)
757  {
758  usleep(10000);
759  pendingWriteCount = std::accumulate(buffer_writes_pending_.begin(), buffer_writes_pending_.end(), 0, [](int a, auto& b) { return a + b.second.load(); });
760  }
761 
762  size_t initialStoreSize = GetIncompleteEventCount();
763  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
764  << " stale events from the SharedMemoryEventManager.";
765  int counter = initialStoreSize;
766  while (!active_buffers_.empty() && counter > 0)
767  {
768  complete_buffer_(*active_buffers_.begin());
769  counter--;
770  }
771  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
772  << " stale events in the SharedMemoryEventManager.";
773 
774  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
775  start = std::chrono::steady_clock::now();
776  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
777  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
778 
779  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
780 
781  // We will wait until no buffer has been read for the end-of-data wait period, or until no art processes are left.
782  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
783  {
784  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
785  if (temp != lastReadCount)
786  {
787  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
788  lastReadCount = temp;
789  start = std::chrono::steady_clock::now();
790  }
791  if (lastReadCount > 0)
792  {
793  TLOG(19) << "About to sleep " << outstanding_buffer_wait_time << " us - lastReadCount=" << lastReadCount << " size=" << size() << " end_of_data_wait_us=" << end_of_data_wait_us;
794  usleep(outstanding_buffer_wait_time);
795  }
796  }
797 
798  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
799  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
800 
801  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
802  FragmentPtrs broadcast;
803  broadcast.emplace_back(Fragment::eodFrag(GetBufferCount()));
804  bool success = broadcastFragments_(broadcast);
805  if (!success)
806  {
807  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
808  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
809  {
810  broadcasts_.MarkBufferEmpty(ii, true);
811  }
812  broadcastFragments_(broadcast);
813  }
814  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
815  while (get_art_process_count_() > 0)
816  {
817  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
818 
819  ShutdownArtProcesses(art_processes_);
820  }
821  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
822 
823  ResetAttachedCount();
824 
825  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
826  for (size_t ii = 0; ii < size(); ++ii)
827  {
828  MarkBufferEmpty(ii, true);
829  }
830  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
831  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
832  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
833  // {
834  // broadcasts_.MarkBufferEmpty(ii, true);
835  // }
836  released_events_.clear();
837  released_incomplete_events_.clear();
838 
839  TLOG(TLVL_DEBUG) << "endOfData END";
840  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
841  return true;
842 }
843 
844 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
845 {
846  running_ = true;
847  init_fragments_.clear();
848  received_init_frags_.clear();
849  statsHelper_.resetStatistics();
850  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
851  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
852  {
853  broadcasts_.MarkBufferEmpty(ii, true);
854  }
855  released_events_.clear();
856  released_incomplete_events_.clear();
857  StartArt();
858  run_id_ = runID;
859  {
860  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
861  subrun_event_map_.clear();
862  subrun_event_map_[0] = 1;
863  }
864  run_event_count_ = 0;
865  run_incomplete_event_count_ = 0;
866  requests_ = std::make_unique<RequestSender>(data_pset_);
867  if (requests_)
868  {
869  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
870  requests_->SendRoutingToken(queue_size_, run_id_);
871  }
872  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
873  << ", max queue size = "
874  << queue_size_
875  << ", queue size = "
876  << GetLockedBufferCount();
877  if (metricMan)
878  {
879  metricMan->sendMetric("Run Number", static_cast<uint64_t>(run_id_), "Run", 1, MetricMode::LastPoint | MetricMode::Persist);
880  }
881 }
882 
883 bool artdaq::SharedMemoryEventManager::endRun()
884 {
885  TLOG(TLVL_INFO) << "Ending run " << run_id_;
886  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
887  static_cast<double>(sizeof(Fragment::value_type))))));
888 
889  TLOG(TLVL_DEBUG) << "Shutting down RequestSender";
890  requests_.reset(nullptr);
891 
892  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
893  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
894  *endOfRunFrag->dataBegin() = my_rank;
895  FragmentPtrs broadcast;
896  broadcast.emplace_back(std::move(endOfRunFrag));
897  broadcastFragments_(broadcast);
898 
899  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
900  run_event_count_ = 0;
901  run_incomplete_event_count_ = 0;
902  oversize_fragment_count_ = 0;
903  {
904  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
905  subrun_event_map_.clear();
906  subrun_event_map_[0] = 1;
907  }
908  return true;
909 }
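startRun, endRun, and endOfData bracket a run from the event builder's point of view. An illustrative driver (hypothetical helper, error handling omitted):

#include <utility>

#include "artdaq/DAQrate/SharedMemoryEventManager.hh"

// Illustrative run lifecycle, not taken from artdaq code.
void runOnce(artdaq::SharedMemoryEventManager& store, artdaq::FragmentPtrs& fragments)
{
    store.startRun(101);  // starts the configured art processes and resets per-run counters

    for (auto& frag : fragments)
    {
        artdaq::FragmentPtr rejected;
        // Waits up to 1 s for buffer space; on failure the fragment is handed back in 'rejected'.
        store.AddFragment(std::move(frag), 1000000, rejected);
    }

    store.endRun();     // broadcasts an EndOfRun Fragment and shuts down the RequestSender
    store.endOfData();  // flushes stale events and shuts the art processes down
}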
910 
911 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
912 {
913  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
914  if (boundary == 0 || boundary == Fragment::InvalidSequenceID)
915  {
916  return;
917  }
918 
919  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
920 
921  // Don't re-rollover to an already-defined subrun
922  if (!subrun_event_map_.empty() && subrun_event_map_.rbegin()->second == subrun)
923  {
924  return;
925  }
926  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
927  subrun_event_map_[boundary] = subrun;
928  while (subrun_event_map_.size() > max_subrun_event_map_length_)
929  {
930  subrun_event_map_.erase(subrun_event_map_.begin());
931  }
932 }
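// Illustrative behavior: with the initial {0 -> 1} entry, a call such as
// rolloverSubrun(100, 2) makes GetSubrunForSequenceID(99) return 1 and
// GetSubrunForSequenceID(100) (or any later sequence ID) return 2, until a further
// boundary is registered.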
933 
934 void artdaq::SharedMemoryEventManager::rolloverSubrun()
935 {
936  Fragment::sequence_id_t seqID = 0;
937  subrun_id_t subrun = 0;
938  {
939  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
940  for (auto& it : subrun_event_map_)
941  {
942  if (it.first >= seqID)
943  {
944  seqID = it.first + 1;
945  }
946  if (it.second >= subrun)
947  {
948  subrun = it.second + 1;
949  }
950  }
951  }
952  rolloverSubrun(seqID, subrun);
953 }
954 
955 void artdaq::SharedMemoryEventManager::sendMetrics()
956 {
957  if (metricMan)
958  {
959  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
960  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
961  }
962 
963  if (incomplete_event_report_interval_ms_ > 0 && (GetLockedBufferCount() != 0u))
964  {
965  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
966  {
967  return;
968  }
969 
970  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
971  std::ostringstream oss;
972  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
973  for (auto& ev : active_buffers_)
974  {
975  auto hdr = getEventHeader_(ev);
976  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
977  }
978  TLOG(TLVL_DEBUG) << oss.str();
979  }
980 }
981 
982 bool artdaq::SharedMemoryEventManager::broadcastFragments_(FragmentPtrs& frags)
983 {
984  if (frags.empty())
985  {
986  TLOG(TLVL_ERROR) << "Requested broadcast but no Fragments given!";
987  return false;
988  }
989  TLOG(TLVL_DEBUG) << "Broadcasting Fragments with seqID=" << frags.front()->sequenceID()
990  << ", type " << detail::RawFragmentHeader::SystemTypeToString(frags.front()->type())
991  << ", size=" << frags.front()->sizeBytes() << "B.";
992  auto buffer = broadcasts_.GetBufferForWriting(false);
993  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer 1st buffer=" << buffer;
994  auto start_time = std::chrono::steady_clock::now();
995  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
996  {
997  usleep(10000);
998  buffer = broadcasts_.GetBufferForWriting(false);
999  }
1000  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
1001  if (buffer == -1)
1002  {
1003  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frags.front()->typeString() << " failed due to timeout waiting for buffer!";
1004  return false;
1005  }
1006 
1007  TLOG(TLVL_DEBUG) << "broadcastFragments_: Filling in RawEventHeader";
1008  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
1009  hdr->run_id = run_id_;
1010  hdr->subrun_id = GetSubrunForSequenceID(frags.front()->sequenceID());
1011  hdr->sequence_id = frags.front()->sequenceID();
1012  hdr->is_complete = true;
1013  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
1014 
1015  for (auto& frag : frags)
1016  {
1017  TLOG(TLVL_DEBUG) << "broadcastFragments_ before Write calls";
1018  if (frag->sequenceID() != hdr->sequence_id || frag->type() != frags.front()->type())
1019  {
1020  TLOG(TLVL_WARNING) << "Not sending fragment because its SequenceID or Type disagrees with leading Fragment";
1021  continue;
1022  }
1023  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
1024  }
1025 
1026  TLOG(TLVL_DEBUG) << "broadcastFragments_ Marking buffer full";
1027  broadcasts_.MarkBufferFull(buffer, -1);
1028  TLOG(TLVL_DEBUG) << "broadcastFragments_ Complete";
1029  return true;
1030 }
1031 
1032 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
1033 {
1034  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer)); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
1035 }
1036 
1037 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
1038 {
1039  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
1040 
1041  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
1042  auto it = subrun_event_map_.begin();
1043  subrun_id_t subrun = 1;
1044 
1045  while (it != subrun_event_map_.end() && it->first <= seqID)
1046  {
1047  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
1048  subrun = it->second;
1049  ++it;
1050  }
1051 
1052  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
1053  return subrun;
1054 }
1055 
1056 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
1057 {
1058  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
1059  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1060 
1061  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
1062 
1063  auto buffers = GetBuffersOwnedByManager();
1064  for (auto& buf : buffers)
1065  {
1066  auto hdr = getEventHeader_(buf);
1067  if (hdr->sequence_id == seqID)
1068  {
1069  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
1070  return buf;
1071  }
1072  }
1073 
1074 #if !ART_SUPPORTS_DUPLICATE_EVENTS
1075  if (released_incomplete_events_.count(seqID) != 0u)
1076  {
1077  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
1078  return -2;
1079  }
1080  if (released_events_.count(seqID) != 0u)
1081  {
1082  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been completed and released to art! Check configuration for inconsistent Fragment count per event!";
1083  return -2;
1084  }
1085 #endif
1086 
1087  if (!create_new)
1088  {
1089  return -1;
1090  }
1091 
1092  check_pending_buffers_(lk);
1093  int new_buffer = GetBufferForWriting(false);
1094 
1095  if (new_buffer == -1)
1096  {
1097  new_buffer = GetBufferForWriting(overwrite_mode_);
1098  }
1099 
1100  if (new_buffer == -1)
1101  {
1102  return -1;
1103  }
1104  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
1105  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_.at(new_buffer));
1106  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
1107  auto hdr = getEventHeader_(new_buffer);
1108  hdr->is_complete = false;
1109  hdr->run_id = run_id_;
1110  hdr->subrun_id = GetSubrunForSequenceID(seqID);
1111  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
1112  hdr->sequence_id = seqID;
1113  hdr->timestamp = timestamp;
1114  buffer_writes_pending_[new_buffer] = 0;
1115  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
1116  SetMFIteration("Sequence ID " + std::to_string(seqID));
1117 
1118  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
1119  active_buffers_.insert(new_buffer);
1120  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1121  << size() << ","
1122  << ReadReadyCount() << ","
1123  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1124  << WriteReadyCount(false) << ","
1125  << pending_buffers_.size() << ","
1126  << active_buffers_.size() << ")";
1127 
1128  if (requests_)
1129  {
1130  if (timestamp != Fragment::InvalidTimestamp)
1131  {
1132  requests_->AddRequest(seqID, timestamp);
1133  }
1134  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
1135  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1136  else
1137  {
1138  requests_->SendRequest();
1139  }
1140  }
1141  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1142  return new_buffer;
1143 }
1144 
1145 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1146 {
1147  if (buffer == -1)
1148  {
1149  return true;
1150  }
1151  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1152  {
1153  return true;
1154  }
1155  ResetReadPos(buffer);
1156  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1157  return MoreDataInBuffer(buffer);
1158 }
1159 
1160 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1161 {
1162  auto hdr = getEventHeader_(buffer);
1163  if (hdr->is_complete)
1164  {
1165  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1166 
1167  {
1168  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1169 
1170  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1171  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1172  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1173  active_buffers_.erase(buffer);
1174  pending_buffers_.insert(buffer);
1175  released_events_.insert(hdr->sequence_id);
1176  while (released_events_.size() > max_event_list_length_)
1177  {
1178  released_events_.erase(released_events_.begin());
1179  }
1180 
1181  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1182  << size() << ","
1183  << ReadReadyCount() << ","
1184  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1185  << WriteReadyCount(false) << ","
1186  << pending_buffers_.size() << ","
1187  << active_buffers_.size() << ")";
1188  }
1189  if (requests_)
1190  {
1191  requests_->RemoveRequest(hdr->sequence_id);
1192  }
1193  }
1194  CheckPendingBuffers();
1195 }
1196 
1197 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1198 {
1199  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1200 }
1201 
1202 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1203 {
1204  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1205  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1206  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1207  check_pending_buffers_(lk);
1208 }
1209 
1210 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1211 {
1212  TLOG(14) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1213 
1214  auto buffers = GetBuffersOwnedByManager();
1215  for (auto buf : buffers)
1216  {
1217  if (ResetBuffer(buf) && (pending_buffers_.count(buf) == 0u))
1218  {
1219  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1220  auto hdr = getEventHeader_(buf);
1221  if ((active_buffers_.count(buf) != 0u) && buffer_writes_pending_[buf].load() == 0)
1222  {
1223  if (requests_)
1224  {
1225  requests_->RemoveRequest(hdr->sequence_id);
1226  }
1227  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1228  active_buffers_.erase(buf);
1229  pending_buffers_.insert(buf);
1230  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1231  << size() << ","
1232  << ReadReadyCount() << ","
1233  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1234  << WriteReadyCount(false) << ","
1235  << pending_buffers_.size() << ","
1236  << active_buffers_.size() << ")";
1237 
1238  run_incomplete_event_count_++;
1239  if (metricMan)
1240  {
1241  metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1242  }
1243  if (released_incomplete_events_.count(hdr->sequence_id) == 0u)
1244  {
1245  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1246  }
1247  else
1248  {
1249  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1250  }
1251  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1252  }
1253  }
1254  }
1255 
1256  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1257  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1258 
1259  auto counter = 0;
1260  double eventSize = 0;
1261  for (auto buf : sorted_buffers)
1262  {
1263  auto hdr = getEventHeader_(buf);
1264  auto thisEventSize = BufferDataSize(buf);
1265 
1266  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1267  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1268  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1269 
1270  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1271  MarkBufferFull(buf);
1272  run_event_count_++;
1273  counter++;
1274  eventSize += thisEventSize;
1275  pending_buffers_.erase(buf);
1276  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1277  << size() << ","
1278  << ReadReadyCount() << ","
1279  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1280  << WriteReadyCount(false) << ","
1281  << pending_buffers_.size() << ","
1282  << active_buffers_.size() << ")";
1283  }
1284 
1285  if (requests_ && requests_->RoutingTokenSendsEnabled())
1286  {
1287  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1288  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1289  auto available_buffers = WriteReadyCount(overwrite_mode_);
1290 
1291  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1292  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1293 
1294  if (available_buffers > outstanding_tokens)
1295  {
1296  auto tokens_to_send = available_buffers - outstanding_tokens;
1297 
1298  while (tokens_to_send > 0)
1299  {
1300  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1301  requests_->SendRoutingToken(1, run_id_);
1302  tokens_to_send--;
1303  }
1304  }
1305  }
1306 
1307  if (statsHelper_.readyToReport())
1308  {
1309  std::string statString = buildStatisticsString_();
1310  TLOG(TLVL_INFO) << statString;
1311  }
1312 
1313  if (metricMan)
1314  {
1315  TLOG(14) << "check_pending_buffers_: Sending Metrics";
1316  metricMan->sendMetric("Event Rate", counter, "Events", 1, MetricMode::Rate);
1317  metricMan->sendMetric("Data Rate", eventSize, "Bytes", 1, MetricMode::Rate);
1318  if (counter > 0)
1319  {
1320  metricMan->sendMetric("Average Event Size", eventSize / counter, "Bytes", 1, MetricMode::Average);
1321  }
1322 
1323  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1324  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1325  if (requests_)
1326  {
1327  metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1328  }
1329 
1330  auto bufferReport = GetBufferReport();
1331  int full = 0, empty = 0, writing = 0, reading = 0;
1332  for (auto& buf : bufferReport)
1333  {
1334  switch (buf.second)
1335  {
1336  case BufferSemaphoreFlags::Full:
1337  full++;
1338  break;
1339  case BufferSemaphoreFlags::Empty:
1340  empty++;
1341  break;
1342  case BufferSemaphoreFlags::Writing:
1343  writing++;
1344  break;
1345  case BufferSemaphoreFlags::Reading:
1346  reading++;
1347  break;
1348  }
1349  }
1350  auto total = size();
1351  TLOG(15) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1352 
1353  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1354  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1355  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1356  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1357  if (total > 0)
1358  {
1359  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1360  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1361  }
1362  }
1363  TLOG(14) << "check_pending_buffers_ END";
1364 }
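// Buffer lifecycle summary (descriptive only): getBufferForSequenceID_ places a newly
// claimed buffer in active_buffers_; complete_buffer_ moves it to pending_buffers_ once
// all expected Fragments have arrived, and check_pending_buffers_ does the same for
// stale, timed-out buffers (counting them as incomplete).  Pending buffers are then
// released to art in sequence-ID order by marking them Full, after which routing tokens
// and metrics are issued based on the space that was freed.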
1365 
1366 void artdaq::SharedMemoryEventManager::send_init_frags_()
1367 {
1368  if (init_fragments_.size() >= init_fragment_count_ && init_fragment_count_ > 0)
1369  {
1370  TLOG(TLVL_INFO) << "Broadcasting " << init_fragments_.size() << " Init Fragment(s) to all art subprocesses...";
1371 
1372 #if 0
1373  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1374  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1375  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1376  ostream.close();
1377 #endif
1378 
1379  broadcastFragments_(init_fragments_);
1380  TLOG(TLVL_TRACE) << "Init Fragment sent";
1381  }
1382  else if (init_fragment_count_ > 0 && init_fragments_.size() == 0)
1383  {
1384  TLOG(TLVL_WARNING) << "Cannot send Init Fragment(s) because I haven't yet received them! Set send_init_fragments to false or init_fragment_count to 0 if this process does not receive serialized art events to avoid potentially lengthy timeouts!";
1385  }
1386  else if (init_fragment_count_ > 0)
1387  {
1388  TLOG(TLVL_INFO) << "Cannot send Init Fragment(s) because I haven't yet received them (have " << init_fragments_.size() << " of " << init_fragment_count_ << ")!";
1389  }
1390  else
1391  {
1392  // Send an empty Init Fragment so that ArtdaqInput knows that this is a pure-Fragment input
1393  artdaq::FragmentPtrs begin_run_fragments_;
1394  begin_run_fragments_.emplace_back(new artdaq::Fragment());
1395  begin_run_fragments_.back()->setSystemType(artdaq::Fragment::InitFragmentType);
1396  broadcastFragments_(begin_run_fragments_);
1397  }
1398 }
1399 
1400 void artdaq::SharedMemoryEventManager::AddInitFragment(FragmentPtr& frag)
1401 {
1402  static std::mutex init_fragment_mutex;
1403  std::lock_guard<std::mutex> lk(init_fragment_mutex);
1404  if (received_init_frags_.count(frag->fragmentID()) == 0)
1405  {
1406  TLOG(TLVL_DEBUG) << "Received Init Fragment from rank " << frag->fragmentID() << ". Now have " << init_fragments_.size() + 1 << " of " << init_fragment_count_;
1407  received_init_frags_.insert(frag->fragmentID());
1408  init_fragments_.push_back(std::move(frag));
1409 
1410  // Don't send until all init fragments have been received
1411  if (init_fragments_.size() >= init_fragment_count_)
1412  {
1413  send_init_frags_();
1414  }
1415  }
1416  else
1417  {
1418  TLOG(TLVL_TRACE) << "Ignoring duplicate Init Fragment from rank " << frag->fragmentID();
1419  }
1420 }
1421 
1422 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1423 {
1424  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1425  if (art_pset != current_art_pset_ || !current_art_config_file_)
1426  {
1427  current_art_pset_ = art_pset;
1428  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1429  }
1430  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1431 }
1432 
1433 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const
1434 {
1435  std::ostringstream oss;
1436  oss << app_name << " statistics:" << std::endl;
1437 
1438  artdaq::MonitoredQuantityPtr mqPtr =
1439  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1440  if (mqPtr.get() != nullptr)
1441  {
1442  artdaq::MonitoredQuantityStats stats;
1443  mqPtr->getStats(stats);
1444  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1445  << " events/sec, effective data rate = "
1446  << (stats.recentValueRate / 1024.0 / 1024.0)
1447  << " MB/sec, monitor window = " << stats.recentDuration
1448  << " sec, min::max event size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1449  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1450  if (stats.recentSampleRate > 0.0)
1451  {
1452  oss << " Average time per event: ";
1453  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1454  }
1455  }
1456 
1457  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1458  if (mqPtr.get() != nullptr)
1459  {
1460  artdaq::MonitoredQuantityStats stats;
1461  mqPtr->getStats(stats);
1462  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1463  << " fragments/sec, effective data rate = "
1464  << (stats.recentValueRate / 1024.0 / 1024.0)
1465  << " MB/sec, monitor window = " << stats.recentDuration
1466  << " sec, min::max fragment size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1467  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1468  }
1469 
1470  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1471  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1472  << std::endl;
1473  return oss.str();
1474 }