artdaq  v3_07_01
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 #include "artdaq-core/Core/StatisticsCollection.hh"
5 #include "artdaq-core/Utilities/TraceLock.hh"
6 
7 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
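// The shared memory key is built from a seed value (0xEE000000 for the event segment,
// 0xBB000000 for broadcasts), the partition number + 1 shifted left by 16 bits, and the
// low 16 bits of the process ID, so separate partitions and processes get distinct keys.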
12 #define build_key(seed) seed + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF)
13 
14 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
15 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
16 const std::string artdaq::SharedMemoryEventManager::
17  FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
18 const std::string artdaq::SharedMemoryEventManager::
19  EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
20 
21 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
22  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", build_key(0xEE000000)),
23  pset.get<size_t>("buffer_count"),
24  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
25  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
26  !pset.get<bool>("broadcast_mode", false))
27  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
28  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
29  , queue_size_(pset.get<size_t>("buffer_count"))
30  , run_id_(0)
31  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
32  , max_event_list_length_(pset.get<size_t>("max_event_list_length", 100))
33  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
34  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
35  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
36  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
37  , running_(false)
38  , buffer_writes_pending_()
39  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
40  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
41  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
42  , last_backpressure_report_time_(std::chrono::steady_clock::now())
43  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
44  , metric_data_()
45  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
46  , run_event_count_(0)
47  , run_incomplete_event_count_(0)
48  , subrun_event_count_(0)
49  , subrun_incomplete_event_count_(0)
50  , oversize_fragment_count_(0)
51  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
52  , art_processes_()
53  , restart_art_(false)
54  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
55  , manual_art_(pset.get<bool>("manual_art", false))
56  , current_art_pset_(art_pset)
57  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
58  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
59  , requests_(nullptr)
60  , data_pset_(pset)
61  , dropped_data_()
62  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", build_key(0xBB000000)),
63  pset.get<size_t>("broadcast_buffer_count", 10),
64  pset.get<size_t>("broadcast_buffer_size", 0x100000),
65  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
66 {
67  subrun_event_map_[0] = 1;
68  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
69  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
70 
71  if (pset.get<bool>("use_art", true) == false)
72  {
73  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
74  num_art_processes_ = 0;
75  }
76  else
77  {
78  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
79  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
80  }
81  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
82 
83  if (overwrite_mode_ && num_art_processes_ > 0)
84  {
 85  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
86  }
87  else if (overwrite_mode_)
88  {
89  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
90  }
91 
92  for (size_t ii = 0; ii < size(); ++ii)
93  {
94  buffer_writes_pending_[ii] = 0;
95  }
96 
97  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
98 
99  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
100  SetRank(my_rank);
101  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
102 
 103  statsHelper_.addMonitoredQuantityName(FRAGMENTS_RECEIVED_STAT_KEY);
 104  statsHelper_.addMonitoredQuantityName(EVENTS_RELEASED_STAT_KEY);
 105 
106  // fetch the monitoring parameters and create the MonitoredQuantity instances
107  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
108 
109  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
110 }
111 
 112 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
 113 {
114  TLOG(TLVL_TRACE) << "DESTRUCTOR";
115  if (running_) endOfData();
116  TLOG(TLVL_TRACE) << "Destructor END";
117 }
118 
119 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
120 {
121  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
122  << ", sequence_id=" << frag.sequence_id;
123  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
124  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
125  if (buffer == -1) return false;
126  if (buffer == -2)
127  {
128  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
129  return true;
130  }
131 
132  auto hdr = getEventHeader_(buffer);
133  if (update_run_ids_)
134  {
135  hdr->run_id = run_id_;
136  }
137  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
138 
139  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
140  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
141 
142  TLOG(TLVL_TRACE) << "Checking for complete event";
143  auto fragmentCount = GetFragmentCount(frag.sequence_id);
144  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
145  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
146  << ", fragmentCount=" << fragmentCount
147  << ", num_fragments_per_event=" << num_fragments_per_event_
148  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
149 
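 // complete_buffer_ moves this buffer from the active to the pending list if hdr->is_complete
 // was set above; CheckPendingBuffers then releases pending buffers to art in sequence order.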
150  complete_buffer_(buffer);
151  if (requests_) requests_->SendRequest(true);
152 
153  TLOG(TLVL_TRACE) << "AddFragment END";
154  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
155  return true;
156 }
157 
158 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
159 {
160  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
161  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
162  auto data = frag->headerAddress();
163  auto start = std::chrono::steady_clock::now();
164  bool sts = false;
165  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
166  {
167  sts = AddFragment(hdr, data);
168  if (!sts) usleep(1000);
169  }
170  if (!sts)
171  {
172  outfrag = std::move(frag);
173  }
174  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
175  return sts;
176 }
177 
178 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
179 {
180  TLOG(14) << "WriteFragmentHeader BEGIN";
181  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
182 
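 // getBufferForSequenceID_ returns -1 when no buffer is currently available (back-pressure,
 // unless the caller allows dropping) and -2 when this sequence ID has already been released to art.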
183  if (buffer < 0)
184  {
185  if (buffer == -1 && !dropIfNoBuffersAvailable)
186  {
187  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
188  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
189  {
190  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
191  last_backpressure_report_time_ = std::chrono::steady_clock::now();
192  }
193  return nullptr;
194  }
195  if (buffer == -2)
196  {
197  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
198  }
199  else
200  {
201  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
202  }
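 // Allocate a throwaway Fragment so the caller still receives a valid pointer to write into,
 // even though the dropped data will never reach shared memory.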
203  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
204 
205  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
206  return dropped_data_[frag.fragment_id]->dataBegin();
207  }
208 
209  last_backpressure_report_time_ = std::chrono::steady_clock::now();
210  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
211  // Increment this as soon as we know we want to use the buffer
212  buffer_writes_pending_[buffer]++;
213 
214  if (metricMan)
215  {
216  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
217  }
218 
219  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
220 
221  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
222 
223  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
224 
225  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
226  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
227  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
228 
229  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
230  if (frag.word_count - frag.num_words() > 0)
231  {
232  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
233 
234  if (!sts)
235  {
236  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
237  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
238  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
239  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
240 
241  oversize_fragment_count_++;
242 
243  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
244  {
245  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
246  }
247 
248  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
249  return dropped_data_[frag.fragment_id]->dataBegin();
250  }
251  }
252  TLOG(14) << "WriteFragmentHeader END";
253  return pos;
254 }
255 
256 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
257 {
258  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
259  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
260  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
261  if (buffer == -2) { return; }
262 
263  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
264  {
265  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
266 
267  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
268 
269  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
270 
271  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
272 
273  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
274  auto hdr = getEventHeader_(buffer);
275  if (update_run_ids_)
276  {
277  hdr->run_id = run_id_;
278  }
279  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
280 
281  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
282  TouchBuffer(buffer);
283 
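 // Only the writer that brings the pending-write count for this buffer down to zero
 // performs the completeness bookkeeping below.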
284  buffer_writes_pending_[buffer]--;
285  if (buffer_writes_pending_[buffer] != 0)
286  {
287  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
288  return;
289  }
290  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
291  auto frag_count = GetFragmentCount(frag.sequence_id);
292  hdr->is_complete = frag_count >= num_fragments_per_event_;
293 
294  if (frag_count > num_fragments_per_event_)
295  {
296  TLOG(TLVL_WARNING) << "DoneWritingFragment: This Event has more Fragments ( " << frag_count << " ) than specified in configuration ( " << num_fragments_per_event_ << " )!"
297  << " This is probably due to a misconfiguration and is *not* a reliable mode!";
298  }
299 
300  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
301 #if ART_SUPPORTS_DUPLICATE_EVENTS
302  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
303  {
304  hdr->is_complete = frag_count >= released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
305  }
306 #endif
307  }
308 
309  complete_buffer_(buffer);
310  if (requests_) requests_->SendRequest(true);
311  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
312 }
313 
314 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
315 {
316  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
317 }
318 
319 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
320 {
321  if (buffer == -1) return 0;
322  ResetReadPos(buffer);
323  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
324 
325  size_t count = 0;
326 
327  while (MoreDataInBuffer(buffer))
328  {
329  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
330  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
331  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
332  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
333  ++count;
334  }
335 
336  return count;
337 }
338 
339 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
340 {
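 // Launch an art process, wait for it to exit, and log its exit status and lifetime;
 // repeat while restart_art_ remains set (e.g. after a crash with restart enabled).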
341  do
342  {
343  auto start_time = std::chrono::steady_clock::now();
344  send_init_frag_();
345  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
346 
347  pid_t pid = 0;
348 
349  if (!manual_art_)
350  {
351  char* filename = new char[config_file->getFileName().length() + 1];
352  strcpy(filename, config_file->getFileName().c_str());
353 
354 #if DEBUG_ART
355  std::string debugArgS = "--config-out=" + app_name + "_art.out";
356  char* debugArg = new char[debugArgS.length() + 1];
357  strcpy(debugArg, debugArgS.c_str());
358 
359  std::vector<char*> args{(char*)"art", (char*)"-c", filename, debugArg, NULL};
360 #else
361  std::vector<char*> args{(char*)"art", (char*)"-c", filename, NULL};
362 #endif
363 
364  pid = fork();
365  if (pid == 0)
366  { /* child */
367  // 23-May-2018, KAB: added the setting of the partition number env var
368  // in the environment of the child art process so that Globals.hh
369  // will pick it up there and provide it to the artdaq classes that
370  // are used in data transfers, etc. within the art process.
371  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
372  std::string envVarValue = std::to_string(GetPartitionNumber());
373  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
374  {
375  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
376  << "\" in the environment of a child art process. "
377  << "This may result in incorrect TCP port number "
378  << "assignments or other issues, and data may "
379  << "not flow through the system correctly.";
380  }
381  envVarKey = "ARTDAQ_APPLICATION_NAME";
382  envVarValue = app_name;
383  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
384  {
385  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
386  << "\" in the environment of a child art process. ";
387  }
388  envVarKey = "ARTDAQ_RANK";
389  envVarValue = std::to_string(my_rank);
390  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
391  {
392  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
393  << "\" in the environment of a child art process. ";
394  }
395 
396  execvp("art", &args[0]);
397  delete[] filename;
398  exit(1);
399  }
400  delete[] filename;
401  }
402  else
403  {
404  //Using cin/cout here to ensure console is active (artdaqDriver)
405  std::cout << "Please run the following command in a separate terminal:" << std::endl
406  << "art -c " << config_file->getFileName() << std::endl
407  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
408  << "Finally, return to this window and enter the pid: " << std::endl;
409  std::cin >> pid;
410  }
411  *pid_out = pid;
412 
413  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
414  {
415  std::unique_lock<std::mutex> lk(art_process_mutex_);
416  art_processes_.insert(pid);
417  }
418  siginfo_t status;
419  auto sts = waitid(P_PID, pid, &status, WEXITED);
420  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
421  {
422  std::unique_lock<std::mutex> lk(art_process_mutex_);
423  art_processes_.erase(pid);
424  }
425  if (sts < 0)
426  {
427  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
428  }
429  else if (status.si_code == CLD_EXITED && status.si_status == 0)
430  {
431  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
432  }
433  else
434  {
435  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
436  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
437 
438  auto exit_type = "exited with status code";
439  switch (status.si_code)
440  {
441  case CLD_DUMPED:
442  case CLD_KILLED:
443  exit_type = "was killed with signal";
444  break;
445  case CLD_EXITED:
446  default:
447  break;
448  }
449 
450  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
451  << "art process " << pid << " " << exit_type << " " << status.si_status
452  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
453  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
454  << (restart_art_ ? "restarting" : "not restarting");
455  }
456  } while (restart_art_);
457 }
458 
 459 void artdaq::SharedMemoryEventManager::StartArt()
 460 {
461  restart_art_ = always_restart_art_;
462  if (num_art_processes_ == 0) return;
463  for (size_t ii = 0; ii < num_art_processes_; ++ii)
464  {
465  StartArtProcess(current_art_pset_);
466  }
467 }
468 
 469 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
 470 {
471  static std::mutex start_art_mutex;
472  std::unique_lock<std::mutex> lk(start_art_mutex);
473  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
474  restart_art_ = always_restart_art_;
475  auto initialCount = GetAttachedCount();
476  auto startTime = std::chrono::steady_clock::now();
477 
478  if (pset != current_art_pset_ || !current_art_config_file_)
479  {
480  current_art_pset_ = pset;
481  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
482  }
483  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
484  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
485  thread.detach();
486 
487  auto currentCount = GetAttachedCount() - initialCount;
488  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
489  {
490  usleep(10000);
491  currentCount = GetAttachedCount() - initialCount;
492  }
493  if ((currentCount < 1 || *pid <= 0) && manual_art_)
494  {
 495  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
496  return 0;
497  }
498  else if (currentCount < 1 || *pid <= 0)
499  {
500  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
501  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
502  return 0;
503  }
504  else
505  {
506  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
507  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
508 
509  return *pid;
510  }
511 }
512 
 513 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
 514 {
515  restart_art_ = false;
516  //current_art_config_file_ = nullptr;
517  //current_art_pset_ = fhicl::ParameterSet();
518 
519  auto check_pids = [&](bool print) {
520  std::unique_lock<std::mutex> lk(art_process_mutex_);
521  for (auto pid = pids.begin(); pid != pids.end();)
522  {
523  // 08-May-2018, KAB: protect against killing invalid PIDS
524 
525  if (*pid <= 0)
526  {
527  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
528  << ") from the shutdown list.";
529  pid = pids.erase(pid);
530  }
531  else if (kill(*pid, 0) < 0)
532  {
533  pid = pids.erase(pid);
534  }
535  else
536  {
537  if (print) std::cout << *pid << " ";
538  ++pid;
539  }
540  }
541  };
542  auto count_pids = [&]() {
543  std::unique_lock<std::mutex> lk(art_process_mutex_);
544  return pids.size();
545  };
546  check_pids(false);
547  if (count_pids() == 0)
548  {
549  TLOG(14) << "All art processes already exited, nothing to do.";
550  usleep(1000);
551  return;
552  }
553 
554  if (!manual_art_)
555  {
556  int graceful_wait_ms = art_event_processing_time_us_ * size() * 10 / 1000;
557  int gentle_wait_ms = art_event_processing_time_us_ * size() * 2 / 1000;
558  int int_wait_ms = art_event_processing_time_us_ * size() / 1000;
559  auto shutdown_start = std::chrono::steady_clock::now();
560 
561 
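 // Shutdown escalation: first wait for the art processes to exit on their own, then send
 // SIGQUIT, then SIGINT, and finally SIGKILL to any that remain.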
562  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
563  for (int ii = 0; ii < graceful_wait_ms; ++ii)
564  {
565  usleep(1000);
566 
567  check_pids(false);
568  if (count_pids() == 0)
569  {
570  TLOG(TLVL_TRACE) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
571  return;
572  }
573  }
574 
575  {
576  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
577  std::unique_lock<std::mutex> lk(art_process_mutex_);
578  for (auto pid : pids)
579  {
580  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
581  kill(pid, SIGQUIT);
582  }
583  }
584 
585 
586  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms << " ms for all art processes to exit from SIGQUIT";
587  for (int ii = 0; ii < gentle_wait_ms; ++ii)
588  {
589  usleep(1000);
590 
591  check_pids(false);
592  if (count_pids() == 0)
593  {
594  TLOG(TLVL_TRACE) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
595  return;
596  }
597  }
598 
599  {
600  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
601  std::unique_lock<std::mutex> lk(art_process_mutex_);
602  for (auto pid : pids)
603  {
604  kill(pid, SIGINT);
605  }
606  }
607 
608  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit from SIGINT";
609  for (int ii = 0; ii < int_wait_ms; ++ii)
610  {
611  usleep(1000);
612 
613  check_pids(false);
614 
615  if (count_pids() == 0)
616  {
617  TLOG(TLVL_TRACE) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
618  return;
619  }
620  }
621 
 622  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
623  while (count_pids() > 0)
624  {
625  {
626  std::unique_lock<std::mutex> lk(art_process_mutex_);
627  kill(*pids.begin(), SIGKILL);
628  usleep(1000);
629  }
630  check_pids(false);
631  }
632  }
633  else
634  {
635  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
636  while (count_pids() > 0)
637  {
638  std::cout << "The following PIDs are running: ";
639  check_pids(true);
640  std::cout << std::endl;
641  std::string ignored;
642  std::cin >> ignored;
643  }
644  }
645 }
646 
647 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
648 {
649  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
650  if (restart_art_ || !always_restart_art_) // Art is running
651  {
652  endOfData();
653  }
654  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
655  {
656  broadcasts_.MarkBufferEmpty(ii, true);
657  }
658  if (newRun == 0) newRun = run_id_ + 1;
659 
660  if (art_pset != current_art_pset_ || !current_art_config_file_)
661  {
662  current_art_pset_ = art_pset;
663  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
664  }
665 
666  if (n_art_processes != -1)
667  {
668  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
669  num_art_processes_ = n_art_processes;
670  }
671  startRun(newRun);
672  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
673 }
674 
 675 bool artdaq::SharedMemoryEventManager::endOfData()
 676 {
677  running_ = false;
678  init_fragment_.reset(nullptr);
679  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
680  restart_art_ = false;
681 
682  size_t initialStoreSize = GetIncompleteEventCount();
683  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
684  << " stale events from the SharedMemoryEventManager.";
685  int counter = initialStoreSize;
686  while (active_buffers_.size() > 0 && counter > 0)
687  {
688  complete_buffer_(*active_buffers_.begin());
689  counter--;
690  }
691  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
692  << " stale events in the SharedMemoryEventManager.";
693 
694  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
695  auto start = std::chrono::steady_clock::now();
696  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
697  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
698 
699  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
700 
 701  // Wait until no buffer has been read for end_of_data_wait_us, or until no art processes remain.
702  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
703  {
704  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
705  if (temp != lastReadCount)
706  {
707  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
708  lastReadCount = temp;
709  start = std::chrono::steady_clock::now();
710  }
711  if (lastReadCount > 0)
712  {
713  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
714  usleep(outstanding_buffer_wait_time);
715  }
716  }
717 
718  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
719  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
720 
721  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
722  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
723  bool success = broadcastFragment_(std::move(outFrag), outFrag);
724  if (!success)
725  {
726  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
727  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
728  {
729  broadcasts_.MarkBufferEmpty(ii, true);
730  }
731  broadcastFragment_(std::move(outFrag), outFrag);
732  }
733  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
734  while (get_art_process_count_() > 0)
735  {
736  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
737 
738  ShutdownArtProcesses(art_processes_);
739  }
740  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
741 
742  ResetAttachedCount();
743 
744  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
745  for (size_t ii = 0; ii < size(); ++ii)
746  {
747  MarkBufferEmpty(ii, true);
748  }
749  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
750  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
751  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
752  // {
753  // broadcasts_.MarkBufferEmpty(ii, true);
754  // }
755  released_events_.clear();
756  released_incomplete_events_.clear();
757 
758  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender";
759  requests_.reset(nullptr);
760 
761  TLOG(TLVL_DEBUG) << "endOfData END";
762  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
763  return true;
764 }
765 
 766 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
 767 {
768  running_ = true;
769  init_fragment_.reset(nullptr);
770  statsHelper_.resetStatistics();
771  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
772  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
773  {
774  broadcasts_.MarkBufferEmpty(ii, true);
775  }
776  released_events_.clear();
777  released_incomplete_events_.clear();
778  StartArt();
779  run_id_ = runID;
780  {
781  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
782  subrun_event_map_.clear();
783  subrun_event_map_[0] = 1;
784  }
785  run_event_count_ = 0;
786  run_incomplete_event_count_ = 0;
787  requests_.reset(new RequestSender(data_pset_));
788  if (requests_)
789  {
790  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
791  requests_->SendRoutingToken(queue_size_, run_id_);
792  }
793  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
794  << ", max queue size = "
795  << queue_size_
796  << ", queue size = "
797  << GetLockedBufferCount();
798  if (metricMan)
799  {
800  metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint);
801  }
802 }
803 
 804 bool artdaq::SharedMemoryEventManager::endRun()
 805 {
806  TLOG(TLVL_INFO) << "Ending run " << run_id_;
807  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
808  static_cast<double>(sizeof(Fragment::value_type))))));
809 
810  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
811  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
812  *endOfRunFrag->dataBegin() = my_rank;
813  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
814 
815  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
816  run_event_count_ = 0;
817  run_incomplete_event_count_ = 0;
818  oversize_fragment_count_ = 0;
819  {
820  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
821  subrun_event_map_.clear();
822  subrun_event_map_[0] = 1;
823  }
824  return true;
825 }
826 
 827 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
 828 {
829  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
830  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
831 
832  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
833 
834  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
835  subrun_event_map_[boundary] = subrun;
836  while (subrun_event_map_.size() > max_subrun_event_map_length_)
837  {
838  subrun_event_map_.erase(subrun_event_map_.begin());
839  }
840 }
841 
 842 void artdaq::SharedMemoryEventManager::rolloverSubrun()
 843 {
844  Fragment::sequence_id_t seqID = 0;
845  subrun_id_t subrun = 0;
846  {
847  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
848  for (auto& it : subrun_event_map_)
849  {
850  if (it.first >= seqID) seqID = it.first + 1;
851  if (it.second >= subrun) subrun = it.second + 1;
852  }
853  }
854  rolloverSubrun(seqID, subrun);
855 }
856 
 857 void artdaq::SharedMemoryEventManager::sendMetrics()
 858 {
859  if (metricMan)
860  {
861  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
862  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
863  }
864 
865  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
866  {
867  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
868  return;
869 
870  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
871  std::ostringstream oss;
872  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
873  for (auto& ev : active_buffers_)
874  {
875  auto hdr = getEventHeader_(ev);
876  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
877  }
878  TLOG(TLVL_DEBUG) << oss.str();
879  }
880 }
881 
882 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
883 {
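 // Broadcast Fragments (e.g. Init, EndOfRun, EndOfData) are written to the separate
 // broadcasts_ shared memory segment rather than the main event buffers; if no broadcast
 // buffer frees up within broadcast_timeout_ms_, the broadcast fails.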
884  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
885  auto buffer = broadcasts_.GetBufferForWriting(false);
886  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
887  auto start_time = std::chrono::steady_clock::now();
888  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
889  {
890  usleep(10000);
891  buffer = broadcasts_.GetBufferForWriting(false);
892  }
893  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
894  if (buffer == -1)
895  {
896  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
897  outFrag.swap(frag);
898  return false;
899  }
900 
901  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
902  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
903  hdr->run_id = run_id_;
904  hdr->subrun_id = GetSubrunForSequenceID(frag->sequenceID());
905  hdr->sequence_id = frag->sequenceID();
906  hdr->is_complete = true;
907  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
908 
909  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
910  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
911 
912  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
913  broadcasts_.MarkBufferFull(buffer, -1);
914  outFrag.swap(frag);
915  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
916  return true;
917 }
918 
919 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
920 {
921  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
922 }
923 
 924 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
 925 {
926  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
927 
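 // subrun_event_map_ maps the first sequence ID of each subrun to that subrun number;
 // walk the entries whose key is <= seqID to find the subrun this sequence ID falls in.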
928  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
929  auto it = subrun_event_map_.begin();
930  subrun_id_t subrun = 1;
931 
 932  while (it != subrun_event_map_.end() && it->first <= seqID)
933  {
934  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
935  subrun = it->second;
936  ++it;
937  }
938 
939  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
940  return subrun;
941 }
942 
943 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
944 {
945  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
946  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
947 
948  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
949 
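 // First look for a buffer already in use for this sequence ID; only if none exists
 // (and create_new is set) is a fresh buffer initialized below.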
950  auto buffers = GetBuffersOwnedByManager();
951  for (auto& buf : buffers)
952  {
953  auto hdr = getEventHeader_(buf);
954  if (hdr->sequence_id == seqID)
955  {
956  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
957  return buf;
958  }
959  }
960 
961 #if !ART_SUPPORTS_DUPLICATE_EVENTS
962  if (released_incomplete_events_.count(seqID))
963  {
964  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
965  return -2;
966  }
967  if (released_events_.count(seqID))
968  {
969  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been completed and released to art! Check configuration for inconsistent Fragment count per event!";
970  return -2;
971  }
972 #endif
973 
974  if (!create_new) return -1;
975 
976  check_pending_buffers_(lk);
977  int new_buffer = GetBufferForWriting(false);
978 
979  if (new_buffer == -1)
980  {
981  new_buffer = GetBufferForWriting(overwrite_mode_);
982  }
983 
984  if (new_buffer == -1) return -1;
985  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
986  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
987  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
988  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
989  auto hdr = getEventHeader_(new_buffer);
990  hdr->is_complete = false;
991  hdr->run_id = run_id_;
992  hdr->subrun_id = GetSubrunForSequenceID(seqID);
993  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
994  hdr->sequence_id = seqID;
995  buffer_writes_pending_[new_buffer] = 0;
996  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
997  SetMFIteration("Sequence ID " + std::to_string(seqID));
998 
999  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
1000  active_buffers_.insert(new_buffer);
1001  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1002  << size() << ","
1003  << ReadReadyCount() << ","
1004  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1005  << WriteReadyCount(false) << ","
1006  << pending_buffers_.size() << ","
1007  << active_buffers_.size() << ")";
1008 
1009  if (requests_)
1010  {
1011  if (timestamp != Fragment::InvalidTimestamp)
1012  {
1013  requests_->AddRequest(seqID, timestamp);
1014  }
1015  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
1016  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1017  else
1018  {
1019  requests_->SendRequest();
1020  }
1021  }
1022  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1023  return new_buffer;
1024 }
1025 
1026 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1027 {
1028  if (buffer == -1) return true;
1029  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1030  {
1031  return true;
1032  }
1033  ResetReadPos(buffer);
1034  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1035  return MoreDataInBuffer(buffer);
1036 }
1037 
1038 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1039 {
1040  auto hdr = getEventHeader_(buffer);
1041  if (hdr->is_complete)
1042  {
1043  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1044 
1045  {
1046  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1047 
1048  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1049  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1050  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1051  active_buffers_.erase(buffer);
1052  pending_buffers_.insert(buffer);
1053  released_events_.insert(hdr->sequence_id);
1054  while (released_events_.size() > max_event_list_length_)
1055  {
1056  released_events_.erase(released_events_.begin());
1057  }
1058 
1059  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1060  << size() << ","
1061  << ReadReadyCount() << ","
1062  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1063  << WriteReadyCount(false) << ","
1064  << pending_buffers_.size() << ","
1065  << active_buffers_.size() << ")";
1066  }
1067  if (requests_)
1068  {
1069  requests_->RemoveRequest(hdr->sequence_id);
1070  }
1071  }
1072  CheckPendingBuffers();
1073 }
1074 
1075 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1076 {
1077  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1078 }
1079 
 1080 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
 1081 {
1082  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1083  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1084  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1085  check_pending_buffers_(lk);
1086 }
1087 
1088 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1089 {
1090  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1091 
1092  auto buffers = GetBuffersOwnedByManager();
1093  for (auto buf : buffers)
1094  {
1095  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1096  {
1097  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1098  auto hdr = getEventHeader_(buf);
1099  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1100  {
1101  if (requests_)
1102  {
1103  requests_->RemoveRequest(hdr->sequence_id);
1104  }
1105  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1106  active_buffers_.erase(buf);
1107  pending_buffers_.insert(buf);
1108  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1109  << size() << ","
1110  << ReadReadyCount() << ","
1111  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1112  << WriteReadyCount(false) << ","
1113  << pending_buffers_.size() << ","
1114  << active_buffers_.size() << ")";
1115 
1116  run_incomplete_event_count_++;
1117  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1118  if (!released_incomplete_events_.count(hdr->sequence_id))
1119  {
1120  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1121  }
1122  else
1123  {
1124  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1125  }
1126  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1127  }
1128  }
1129  }
1130 
1131  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1132  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1133 
1134  auto counter = 0;
1135  double eventSize = 0;
1136  for (auto buf : sorted_buffers)
1137  {
1138  auto hdr = getEventHeader_(buf);
1139  auto thisEventSize = BufferDataSize(buf);
1140 
1141  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1142  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1143  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1144 
1145  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1146  MarkBufferFull(buf);
1147  run_event_count_++;
1148  counter++;
1149  eventSize += thisEventSize;
1150  pending_buffers_.erase(buf);
1151  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1152  << size() << ","
1153  << ReadReadyCount() << ","
1154  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1155  << WriteReadyCount(false) << ","
1156  << pending_buffers_.size() << ","
1157  << active_buffers_.size() << ")";
1158  }
1159 
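 // Send one Routing Token per writable buffer beyond those already covered by
 // previously-sent tokens, so token accounting tracks the buffers actually available.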
1160  if (requests_ && requests_->RoutingTokenSendsEnabled())
1161  {
1162  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1163  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1164  auto available_buffers = WriteReadyCount(overwrite_mode_);
1165 
1166  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1167  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1168 
1169  if (available_buffers > outstanding_tokens)
1170  {
1171  auto tokens_to_send = available_buffers - outstanding_tokens;
1172 
1173  while (tokens_to_send > 0)
1174  {
1175  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1176  requests_->SendRoutingToken(1, run_id_);
1177  tokens_to_send--;
1178  }
1179  }
1180  }
1181 
1182  if (statsHelper_.readyToReport())
1183  {
1184  std::string statString = buildStatisticsString_();
1185  TLOG(TLVL_INFO) << statString;
1186  }
1187 
1188  metric_data_.event_count += counter;
1189  metric_data_.event_size += eventSize;
1190 
1191  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1192  {
1193  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1194  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1195  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1196  metric_data_ = MetricData();
1197 
1198  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1199  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1200  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1201 
1202  auto bufferReport = GetBufferReport();
1203  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Full; });
1204  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Empty; });
1205  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Writing; });
1206  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Reading; });
1207  auto total = size();
1208  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1209 
1210  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1211  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1212  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1213  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1214  if (total > 0)
1215  {
1216  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1217  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1218  }
1219 
1220  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1221  }
1222  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1223 }
1224 
1225 void artdaq::SharedMemoryEventManager::send_init_frag_()
1226 {
1227  if (init_fragment_ != nullptr)
1228  {
1229  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1230 
1231 #if 0
1232  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1233  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1234  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1235  ostream.close();
1236 #endif
1237 
1238  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1239  TLOG(TLVL_TRACE) << "Init Fragment sent";
1240  }
1241  else if (send_init_fragments_)
1242  {
1243  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1244  }
1245 }
1246 
 1247 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
 1248 {
1249  if (!init_fragment_ || init_fragment_ == nullptr)
1250  {
1251  init_fragment_.swap(frag);
1252  send_init_frag_();
1253  }
1254 }
1255 
 1256 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
 1257 {
1258  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1259  if (art_pset != current_art_pset_ || !current_art_config_file_)
1260  {
1261  current_art_pset_ = art_pset;
1262  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1263  }
1264  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1265 }
1266 
1267 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const
1268 {
1269  std::ostringstream oss;
1270  oss << app_name << " statistics:" << std::endl;
1271 
1272  artdaq::MonitoredQuantityPtr mqPtr =
1273  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1274  if (mqPtr.get() != 0)
1275  {
1276  artdaq::MonitoredQuantityStats stats;
1277  mqPtr->getStats(stats);
1278  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1279  << " events/sec, effective data rate = "
1280  << (stats.recentValueRate * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1281  << " MB/sec, monitor window = " << stats.recentDuration
1282  << " sec, min::max event size = " << (stats.recentValueMin * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1283  << "::" << (stats.recentValueMax * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0) << " MB" << std::endl;
1284  if (stats.recentSampleRate > 0.0)
1285  {
1286  oss << " Average time per event: ";
1287  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1288  }
1289  }
1290 
1291  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1292  if (mqPtr.get() != 0)
1293  {
1294  artdaq::MonitoredQuantityStats stats;
1295  mqPtr->getStats(stats);
1296  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1297  << " fragments/sec, effective data rate = "
1298  << (stats.recentValueRate * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1299  << " MB/sec, monitor window = " << stats.recentDuration
1300  << " sec, min::max fragment size = " << (stats.recentValueMin * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1301  << "::" << (stats.recentValueMax * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0) << " MB" << std::endl;
1302  }
1303 
1304  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1305  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1306  << std::endl;
1307  return oss.str();
1308 }