artdaq  v3_04_01
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include <sys/wait.h>
6 #include "artdaq-core/Core/StatisticsCollection.hh"
7 #include "artdaq-core/Utilities/TraceLock.hh"
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
13 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
14 
15 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
16  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
17  pset.get<size_t>("buffer_count"),
18  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
19  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
20  !pset.get<bool>("broadcast_mode", false))
21  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
22  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
23  , queue_size_(pset.get<size_t>("buffer_count"))
24  , run_id_(0)
25  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
26  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
27  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
28  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
29  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
30  , running_(false)
31  , buffer_writes_pending_()
32  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
33  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
34  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
35  , last_backpressure_report_time_(std::chrono::steady_clock::now())
36  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
37  , metric_data_()
38  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
39  , run_event_count_(0)
40  , run_incomplete_event_count_(0)
41  , subrun_event_count_(0)
42  , subrun_incomplete_event_count_(0)
43  , oversize_fragment_count_(0)
44  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
45  , art_processes_()
46  , restart_art_(false)
47  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
48  , manual_art_(pset.get<bool>("manual_art", false))
49  , current_art_pset_(art_pset)
50  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
51  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
52  , requests_(nullptr)
53  , data_pset_(pset)
54  , dropped_data_()
55  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
56  pset.get<size_t>("broadcast_buffer_count", 10),
57  pset.get<size_t>("broadcast_buffer_size", 0x100000),
58  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
59 {
60  subrun_event_map_[0] = 1;
61  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
62  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
63 
64  if (pset.get<bool>("use_art", true) == false)
65  {
66  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
67  num_art_processes_ = 0;
68  }
69  else
70  {
71  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
72  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
73  }
74  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
75 
76  if (overwrite_mode_ && num_art_processes_ > 0)
77  {
78  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
79  }
80  else if (overwrite_mode_)
81  {
82  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
83  }
84 
85  for (size_t ii = 0; ii < size(); ++ii)
86  {
87  buffer_writes_pending_[ii] = 0;
88  }
89 
90  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
91 
92  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
93  SetRank(my_rank);
94  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
95 
96  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
97 }
98 
100 {
101  TLOG(TLVL_TRACE) << "DESTRUCTOR";
102  if (running_) endOfData();
103  TLOG(TLVL_TRACE) << "Destructor END";
104 }
105 
106 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
107 {
108  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
109  << ", sequence_id=" << frag.sequence_id;
110  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
111  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
112  if (buffer == -1) return false;
113  if (buffer == -2)
114  {
115  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
116  return true;
117  }
118 
119  auto hdr = getEventHeader_(buffer);
120  if (update_run_ids_)
121  {
122  hdr->run_id = run_id_;
123  }
124  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
125 
126  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
127  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
128 
129  TLOG(TLVL_TRACE) << "Checking for complete event";
130  auto fragmentCount = GetFragmentCount(frag.sequence_id);
131  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
132  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
133  << ", fragmentCount=" << fragmentCount
134  << ", num_fragments_per_event=" << num_fragments_per_event_
135  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
136 
137  complete_buffer_(buffer);
138  if (requests_) requests_->SendRequest(true);
139 
140  TLOG(TLVL_TRACE) << "AddFragment END";
141  return true;
142 }
143 
144 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
145 {
146  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
147  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
148  auto data = frag->headerAddress();
149  auto start = std::chrono::steady_clock::now();
150  bool sts = false;
151  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
152  {
153  sts = AddFragment(hdr, data);
154  if (!sts) usleep(1000);
155  }
156  if (!sts)
157  {
158  outfrag = std::move(frag);
159  }
160  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
161  return sts;
162 }
163 
// Reserve space in a shared-memory buffer for one Fragment: the header is
// written immediately and a pointer to the payload area is returned for the
// caller to fill.  Every non-null return must be paired with a later
// DoneWritingFragment() call for the same header.
// Returns nullptr when no buffer is available in reliable mode (caller
// should retry), or a pointer into a throw-away scratch Fragment when the
// data is being dropped.
artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
{
	TLOG(14) << "WriteFragmentHeader BEGIN";
	auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);

	// buffer == -1: no buffer free; buffer == -2: sequence ID already passed.
	if (buffer < 0)
	{
		if (buffer == -1 && !dropIfNoBuffersAvailable)
		{
			// Reliable mode: report back-pressure at most once per second and
			// make the caller retry.
			std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
			if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
			{
				TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
				last_backpressure_report_time_ = std::chrono::steady_clock::now();
			}
			return nullptr;
		}
		if (buffer == -2)
		{
			TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
		}
		else
		{
			TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
		}
		// Hand back a scratch Fragment sized for the payload so the caller's
		// write proceeds harmlessly; the data is simply discarded.
		dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));

		TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
		return dropped_data_[frag.fragment_id]->dataBegin();
	}

	last_backpressure_report_time_ = std::chrono::steady_clock::now();
	last_fragment_header_write_time_ = std::chrono::steady_clock::now();
	// Increment this as soon as we know we want to use the buffer
	buffer_writes_pending_[buffer]++;

	if (metricMan)
	{
		metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
	}

	TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;

	std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);

	TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;

	//TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
	// Remember where the header lands so it can be patched if the payload
	// turns out not to fit.
	auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
	Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));

	// pos is where the caller will write the payload.
	auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
	if (frag.word_count - frag.num_words() > 0)
	{
		auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));

		if (!sts)
		{
			// Payload does not fit: keep the header (shrunk to header-only and
			// marked Invalid) and redirect the payload write to a scratch
			// Fragment.
			// NOTE(review): buffer_writes_pending_[buffer] stays incremented
			// here -- presumably the caller still invokes DoneWritingFragment
			// for this header; confirm.
			reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
			reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
			TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
			dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));

			oversize_fragment_count_++;

			if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
			{
				// NOTE(review): cet::exception's first argument is a category
				// string; passing the whole message as the category is unusual
				// but preserved here -- confirm intended.
				throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
			}

			TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
			return dropped_data_[frag.fragment_id]->dataBegin();
		}
	}
	TLOG(14) << "WriteFragmentHeader END";
	return pos;
}
241 
// Complete a Fragment reservation made by WriteFragmentHeader: update event
// header bookkeeping and, when this was the last outstanding writer on the
// buffer, check for event completion and release the buffer.
void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
{
	TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
	// create=false: the buffer must already exist (WriteFragmentHeader made it).
	auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
	if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
	if (buffer == -2) { return; }

	// Scope block: the buffer mutex is held only for the bookkeeping below,
	// not for complete_buffer_ / SendRequest at the end.
	{
		TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;

		std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);

		TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;

		//TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");

		TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
		auto hdr = getEventHeader_(buffer);
		if (update_run_ids_)
		{
			hdr->run_id = run_id_;
		}
		hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);

		TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
		TouchBuffer(buffer);

		buffer_writes_pending_[buffer]--;
		// Only the last writer to finish performs the completion check; earlier
		// writers return here (releasing the lock via scope exit).
		if (buffer_writes_pending_[buffer] != 0)
		{
			TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
			return;
		}
		TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
		auto frag_count = GetFragmentCount(frag.sequence_id);
		hdr->is_complete = frag_count == num_fragments_per_event_;
		TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
#if ART_SUPPORTS_DUPLICATE_EVENTS
		// An event that was already released incomplete may be completed with
		// fewer Fragments than normally expected.
		if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
		{
			hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
		}
#endif
	}

	complete_buffer_(buffer);
	if (requests_) requests_->SendRequest(true);
	TLOG(TLVL_TRACE) << "DoneWritingFragment END";
}
291 
292 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
293 {
294  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
295 }
296 
297 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
298 {
299  if (buffer == -1) return 0;
300  ResetReadPos(buffer);
301  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
302 
303  size_t count = 0;
304 
305  while (MoreDataInBuffer(buffer))
306  {
307  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
308  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
309  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
310  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
311  ++count;
312  }
313 
314  return count;
315 }
316 
// Thread body that launches and babysits one art process.  Forks/execs art
// (or, in manual_art_ mode, asks the operator to start it and enter its PID),
// waits for it to exit, and restarts it while restart_art_ is set.
// pid_out communicates the child PID back to StartArtProcess.
void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
{
	do
	{
		auto start_time = std::chrono::steady_clock::now();
		// Make sure the Init Fragment broadcast is available for the new process.
		send_init_frag_();
		TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();

		pid_t pid = 0;

		if (!manual_art_)
		{
			// Build a mutable C string for execvp's argv.
			char* filename = new char[config_file->getFileName().length() + 1];
			strcpy(filename, config_file->getFileName().c_str());

			std::vector<char*> args{(char*)"art", (char*)"-c", filename, NULL};
			pid = fork();
			if (pid == 0)
			{ /* child */
				// 23-May-2018, KAB: added the setting of the partition number env var
				// in the environment of the child art process so that Globals.hh
				// will pick it up there and provide it to the artdaq classes that
				// are used in data transfers, etc. within the art process.
				std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
				std::string envVarValue = std::to_string(GetPartitionNumber());
				if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
				{
					TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
					                 << "\" in the environment of a child art process. "
					                 << "This may result in incorrect TCP port number "
					                 << "assignments or other issues, and data may "
					                 << "not flow through the system correctly.";
				}
				envVarKey = "ARTDAQ_APPLICATION_NAME";
				envVarValue = app_name;
				if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
				{
					TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
					                 << "\" in the environment of a child art process. ";
				}

				// On success execvp does not return; the cleanup below only runs
				// if the exec failed.
				execvp("art", &args[0]);
				delete[] filename;
				exit(1);
			}
			delete[] filename;
		}
		else
		{
			//Using cin/cout here to ensure console is active (artdaqDriver)
			std::cout << "Please run the following command in a separate terminal:" << std::endl
			          << "art -c " << config_file->getFileName() << std::endl
			          << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
			          << "Finally, return to this window and enter the pid: " << std::endl;
			std::cin >> pid;
		}
		*pid_out = pid;

		TLOG(TLVL_INFO) << "PID of new art process is " << pid;
		{
			std::unique_lock<std::mutex> lk(art_process_mutex_);
			art_processes_.insert(pid);
		}
		// Block until the art process exits (WEXITED), then remove it from the
		// active-process set.
		siginfo_t status;
		auto sts = waitid(P_PID, pid, &status, WEXITED);
		TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
		{
			std::unique_lock<std::mutex> lk(art_process_mutex_);
			art_processes_.erase(pid);
		}
		if (sts < 0)
		{
			TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
		}
		else if (status.si_code == CLD_EXITED && status.si_status == 0)
		{
			// Clean exit: restart only if restart_art_ is still set.
			TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
		}
		else
		{
			// Abnormal exit.  If art died almost immediately, assume a
			// configuration problem and stop the restart loop to avoid
			// fork-bombing.
			auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
			if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;

			auto exit_type = "exited with status code";
			switch (status.si_code)
			{
				case CLD_DUMPED:
				case CLD_KILLED:
					exit_type = "was killed with signal";
					break;
				case CLD_EXITED:
				default:
					break;
			}

			TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
			    << "art process " << pid << " " << exit_type << " " << status.si_status
			    << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
			    << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
			    << (restart_art_ ? "restarting" : "not restarting");
		}
	} while (restart_art_);
}
420 
422 {
423  restart_art_ = always_restart_art_;
424  if (num_art_processes_ == 0) return;
425  for (size_t ii = 0; ii < num_art_processes_; ++ii)
426  {
427  StartArtProcess(current_art_pset_);
428  }
429 }
430 
432 {
433  static std::mutex start_art_mutex;
434  std::unique_lock<std::mutex> lk(start_art_mutex);
435  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
436  restart_art_ = always_restart_art_;
437  auto initialCount = GetAttachedCount();
438  auto startTime = std::chrono::steady_clock::now();
439 
440  if (pset != current_art_pset_ || !current_art_config_file_)
441  {
442  current_art_pset_ = pset;
443  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
444  }
445  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
446  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
447  thread.detach();
448 
449  auto currentCount = GetAttachedCount() - initialCount;
450  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
451  {
452  usleep(10000);
453  currentCount = GetAttachedCount() - initialCount;
454  }
455  if ((currentCount < 1 || *pid <= 0) && manual_art_)
456  {
457  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
458  return 0;
459  }
460  else if (currentCount < 1 || *pid <= 0)
461  {
462  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
463  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
464  return 0;
465  }
466  else
467  {
468  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
469  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
470 
471  return *pid;
472  }
473 }
474 
476 {
477  restart_art_ = false;
478  //current_art_config_file_ = nullptr;
479  //current_art_pset_ = fhicl::ParameterSet();
480 
481  auto check_pids = [&](bool print) {
482  std::unique_lock<std::mutex> lk(art_process_mutex_);
483  for (auto pid = pids.begin(); pid != pids.end();)
484  {
485  // 08-May-2018, KAB: protect against killing invalid PIDS
486 
487  if (*pid <= 0)
488  {
489  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
490  << ") from the shutdown list.";
491  pid = pids.erase(pid);
492  }
493  else if (kill(*pid, 0) < 0)
494  {
495  pid = pids.erase(pid);
496  }
497  else
498  {
499  if (print) std::cout << *pid << " ";
500  ++pid;
501  }
502  }
503  };
504  auto count_pids = [&]() {
505  std::unique_lock<std::mutex> lk(art_process_mutex_);
506  return pids.size();
507  };
508  check_pids(false);
509  if (count_pids() == 0)
510  {
511  TLOG(14) << "All art processes already exited, nothing to do.";
512  usleep(1000);
513  return;
514  }
515 
516  if (!manual_art_)
517  {
518  {
519  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
520  std::unique_lock<std::mutex> lk(art_process_mutex_);
521  for (auto pid : pids)
522  {
523  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
524  kill(pid, SIGQUIT);
525  }
526  }
527 
528  int graceful_wait_ms = 5000;
529  int int_wait_ms = 1000;
530 
531  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
532  for (int ii = 0; ii < graceful_wait_ms; ++ii)
533  {
534  usleep(1000);
535 
536  check_pids(false);
537  if (count_pids() == 0)
538  {
539  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
540  return;
541  }
542  }
543 
544  {
545  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
546  std::unique_lock<std::mutex> lk(art_process_mutex_);
547  for (auto pid : pids)
548  {
549  kill(pid, SIGINT);
550  }
551  }
552 
553  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
554  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
555  {
556  usleep(1000);
557 
558  check_pids(false);
559 
560  if (count_pids() == 0)
561  {
562  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
563  return;
564  }
565  }
566 
567  TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
568  while (count_pids() > 0)
569  {
570  {
571  std::unique_lock<std::mutex> lk(art_process_mutex_);
572  kill(*pids.begin(), SIGKILL);
573  usleep(1000);
574  }
575  check_pids(false);
576  }
577  }
578  else
579  {
580  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
581  while (count_pids() > 0)
582  {
583  std::cout << "The following PIDs are running: ";
584  check_pids(true);
585  std::cout << std::endl;
586  std::string ignored;
587  std::cin >> ignored;
588  }
589  }
590 }
591 
592 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
593 {
594  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
595  if (restart_art_ || !always_restart_art_) // Art is running
596  {
597  endOfData();
598  }
599  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
600  {
601  broadcasts_.MarkBufferEmpty(ii, true);
602  }
603  if (newRun == 0) newRun = run_id_ + 1;
604 
605  if (art_pset != current_art_pset_ || !current_art_config_file_)
606  {
607  current_art_pset_ = art_pset;
608  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
609  }
610 
611  if (n_art_processes != -1)
612  {
613  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
614  num_art_processes_ = n_art_processes;
615  }
616  startRun(newRun);
617  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
618 }
619 
621 {
622  running_ = false;
623  init_fragment_.reset(nullptr);
624  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
625  restart_art_ = false;
626 
627  size_t initialStoreSize = GetIncompleteEventCount();
628  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
629  << " stale events from the SharedMemoryEventManager.";
630  int counter = initialStoreSize;
631  while (active_buffers_.size() > 0 && counter > 0)
632  {
633  complete_buffer_(*active_buffers_.begin());
634  counter--;
635  }
636  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
637  << " stale events in the SharedMemoryEventManager.";
638 
639  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
640  auto start = std::chrono::steady_clock::now();
641  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
642  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
643 
644  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
645 
646  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
647  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
648  {
649  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
650  if (temp != lastReadCount)
651  {
652  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
653  lastReadCount = temp;
654  start = std::chrono::steady_clock::now();
655  }
656  if (lastReadCount > 0)
657  {
658  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
659  usleep(outstanding_buffer_wait_time);
660  }
661  }
662 
663  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
664  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
665 
666  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
667  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
668  bool success = broadcastFragment_(std::move(outFrag), outFrag);
669  if (!success)
670  {
671  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
672  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
673  {
674  broadcasts_.MarkBufferEmpty(ii, true);
675  }
676  broadcastFragment_(std::move(outFrag), outFrag);
677  }
678  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
679 
680  if (get_art_process_count_() > 0)
681  {
682  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
683  if (end_of_data_wait_us == 0)
684  {
685  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
686  end_of_data_wait_us = 100 * 1000000;
687  }
688 
689  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
690  for (size_t ii = 0; ii < sleep_count; ++ii)
691  {
692  usleep(10000);
693  if (get_art_process_count_() == 0) break;
694  }
695  }
696 
697  while (get_art_process_count_() > 0)
698  {
699  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
700 
701  ShutdownArtProcesses(art_processes_);
702  }
703  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
704 
705  ResetAttachedCount();
706 
707  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
708  for (size_t ii = 0; ii < size(); ++ii)
709  {
710  MarkBufferEmpty(ii, true);
711  }
712  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
713  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
714  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
715  // {
716  // broadcasts_.MarkBufferEmpty(ii, true);
717  // }
718  released_incomplete_events_.clear();
719 
720  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender";
721  requests_.reset(nullptr);
722 
723  TLOG(TLVL_DEBUG) << "endOfData END";
724  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
725  return true;
726 }
727 
729 {
730  running_ = true;
731  init_fragment_.reset(nullptr);
732  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
733  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
734  {
735  broadcasts_.MarkBufferEmpty(ii, true);
736  }
737  StartArt();
738  run_id_ = runID;
739  {
740  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
741  subrun_event_map_.clear();
742  subrun_event_map_[0] = 1;
743  }
744  run_event_count_ = 0;
745  run_incomplete_event_count_ = 0;
746  requests_.reset(new RequestSender(data_pset_));
747  if (requests_)
748  {
749  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
750  requests_->SendRoutingToken(queue_size_, run_id_);
751  }
752  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
753  << ", max queue size = "
754  << queue_size_
755  << ", queue size = "
756  << GetLockedBufferCount();
757  if (metricMan)
758  {
759  metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint);
760  }
761 }
762 
764 {
765  TLOG(TLVL_INFO) << "Ending run " << run_id_;
766  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
767  static_cast<double>(sizeof(Fragment::value_type))))));
768 
769  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
770  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
771  *endOfRunFrag->dataBegin() = my_rank;
772  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
773 
774  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
775  run_event_count_ = 0;
776  run_incomplete_event_count_ = 0;
777  oversize_fragment_count_ = 0;
778  {
779  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
780  subrun_event_map_.clear();
781  subrun_event_map_[0] = 1;
782  }
783  return true;
784 }
785 
787 {
788  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
789  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
790 
791  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
792 
793  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
794  subrun_event_map_[boundary] = subrun;
795  while (subrun_event_map_.size() > max_subrun_event_map_length_)
796  {
797  subrun_event_map_.erase(subrun_event_map_.begin());
798  }
799 }
800 
802 {
803  Fragment::sequence_id_t seqID = 0;
804  subrun_id_t subrun = 0;
805  {
806  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
807  for (auto& it : subrun_event_map_)
808  {
809  if (it.first >= seqID) seqID = it.first + 1;
810  if (it.second >= subrun) subrun = it.second + 1;
811  }
812  }
813  rolloverSubrun(seqID, subrun);
814 }
815 
817 {
818  if (metricMan)
819  {
820  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
821  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
822  }
823 
824  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
825  {
826  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
827  return;
828 
829  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
830  std::ostringstream oss;
831  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
832  for (auto& ev : active_buffers_)
833  {
834  auto hdr = getEventHeader_(ev);
835  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
836  }
837  TLOG(TLVL_DEBUG) << oss.str();
838  }
839 }
840 
841 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
842 {
843  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
844  auto buffer = broadcasts_.GetBufferForWriting(false);
845  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
846  auto start_time = std::chrono::steady_clock::now();
847  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
848  {
849  usleep(10000);
850  buffer = broadcasts_.GetBufferForWriting(false);
851  }
852  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
853  if (buffer == -1)
854  {
855  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
856  outFrag.swap(frag);
857  return false;
858  }
859 
860  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
861  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
862  hdr->run_id = run_id_;
863  hdr->subrun_id = GetSubrunForSequenceID(frag->sequenceID());
864  hdr->sequence_id = frag->sequenceID();
865  hdr->is_complete = true;
866  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
867 
868  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
869  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
870 
871  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
872  broadcasts_.MarkBufferFull(buffer, -1);
873  outFrag.swap(frag);
874  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
875  return true;
876 }
877 
878 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
879 {
880  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
881 }
882 
884 {
885  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
886 
887  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
888  auto it = subrun_event_map_.begin();
889  subrun_id_t subrun = 1;
890 
891  while (it->first <= seqID && it != subrun_event_map_.end())
892  {
893  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
894  subrun = it->second;
895  ++it;
896  }
897 
898  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
899  return subrun;
900 }
901 
// Find the buffer currently collecting Fragments for the given sequence ID, or
// (when create_new is set) claim and initialize a fresh one.
// Returns the buffer number, -1 when no buffer exists/is available, or -2 when
// the event was already released to art as incomplete (duplicate-event guard).
int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
{
	TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
	std::unique_lock<std::mutex> lk(sequence_id_mutex_);

	TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;

	// First, look for an existing buffer already assigned to this sequence ID.
	auto buffers = GetBuffersOwnedByManager();
	for (auto& buf : buffers)
	{
		auto hdr = getEventHeader_(buf);
		if (hdr->sequence_id == seqID)
		{
			TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
			return buf;
		}
	}

#if !ART_SUPPORTS_DUPLICATE_EVENTS
	// Refuse to re-open an event that was already shipped to art as incomplete.
	if (released_incomplete_events_.count(seqID))
	{
		TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
		return -2;
	}
#endif

	if (!create_new) return -1;

	// Sweep completed/stale buffers first to free capacity, then try to claim a
	// writable buffer; the second attempt honors overwrite_mode_.
	check_pending_buffers_(lk);
	int new_buffer = GetBufferForWriting(false);

	if (new_buffer == -1)
	{
		new_buffer = GetBufferForWriting(overwrite_mode_);
	}

	if (new_buffer == -1) return -1;
	TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
	std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
	TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
	//TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
	// Initialize the RawEventHeader of the newly-claimed buffer. The event
	// number comes from the sequence ID or the timestamp, per configuration.
	auto hdr = getEventHeader_(new_buffer);
	hdr->is_complete = false;
	hdr->run_id = run_id_;
	hdr->subrun_id = GetSubrunForSequenceID(seqID);
	hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
	hdr->sequence_id = seqID;
	buffer_writes_pending_[new_buffer] = 0;
	IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
	SetMFIteration("Sequence ID " + std::to_string(seqID));

	TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
	active_buffers_.insert(new_buffer);
	TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
	                  << size() << ","
	                  << ReadReadyCount() << ","
	                  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
	                  << WriteReadyCount(false) << ","
	                  << pending_buffers_.size() << ","
	                  << active_buffers_.size() << ")";

	if (requests_)
	{
		if (timestamp != Fragment::InvalidTimestamp)
		{
			requests_->AddRequest(seqID, timestamp);
		}
		// 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
		// don't double-send requests, but still get the benefit of calling SendRequest 'often'.
		else
		{
			requests_->SendRequest();
		}
	}
	TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
	return new_buffer;
}
979 
980 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
981 {
982  if (buffer == -1) return true;
983  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
984  {
985  return true;
986  }
987  ResetReadPos(buffer);
988  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
989  return MoreDataInBuffer(buffer);
990 }
991 
992 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
993 {
994  auto hdr = getEventHeader_(buffer);
995  if (hdr->is_complete)
996  {
997  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
998 
999  {
1000  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1001 
1002  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1003  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1004  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1005  active_buffers_.erase(buffer);
1006  pending_buffers_.insert(buffer);
1007 
1008  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1009  << size() << ","
1010  << ReadReadyCount() << ","
1011  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1012  << WriteReadyCount(false) << ","
1013  << pending_buffers_.size() << ","
1014  << active_buffers_.size() << ")";
1015  }
1016  if (requests_)
1017  {
1018  requests_->RemoveRequest(hdr->sequence_id);
1019  }
1020  }
1021  CheckPendingBuffers();
1022 }
1023 
1024 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1025 {
1026  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1027 }
1028 
// NOTE(review): signature line missing from this excerpt; per the class
// documentation this is void SharedMemoryEventManager::CheckPendingBuffers() —
// public entry point that acquires sequence_id_mutex_ and delegates to
// check_pending_buffers_.
{
	TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
	std::unique_lock<std::mutex> lk(sequence_id_mutex_);
	TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
	check_pending_buffers_(lk);
}
1036 
// Sweep all owned buffers: release stale active buffers as incomplete events,
// hand pending buffers to art in sequence-ID order, replenish routing tokens,
// and (rate-limited) publish shared-memory occupancy metrics.
// Precondition: the caller holds sequence_id_mutex_ via 'lock'.
void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
{
	TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();

	auto buffers = GetBuffersOwnedByManager();
	for (auto buf : buffers)
	{
		// ResetBuffer flags buffers whose stale-timeout has expired; ignore ones
		// that are already queued for release.
		if (ResetBuffer(buf) && !pending_buffers_.count(buf))
		{
			TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
			auto hdr = getEventHeader_(buf);
			// Only release a stale buffer when no writes are in flight (or we are
			// shutting down).
			if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
			{
				if (requests_)
				{
					requests_->RemoveRequest(hdr->sequence_id);
				}
				TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
				active_buffers_.erase(buf);
				pending_buffers_.insert(buf);
				TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
				                  << size() << ","
				                  << ReadReadyCount() << ","
				                  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
				                  << WriteReadyCount(false) << ","
				                  << pending_buffers_.size() << ","
				                  << active_buffers_.size() << ")";

				run_incomplete_event_count_++;
				if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
				// Record how many Fragments this incomplete event is still missing.
				if (!released_incomplete_events_.count(hdr->sequence_id))
				{
					released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
				}
				else
				{
					released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
				}
				TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
			}
		}
	}

	// Release pending buffers to art, ordered by sequence ID.
	std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
	sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });

	auto counter = 0;
	double eventSize = 0;
	for (auto buf : sorted_buffers)
	{
		auto hdr = getEventHeader_(buf);

		TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
		                 << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();

		TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
		MarkBufferFull(buf);
		run_event_count_++;
		counter++;
		eventSize += BufferDataSize(buf);
		pending_buffers_.erase(buf);
		TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
		                  << size() << ","
		                  << ReadReadyCount() << ","
		                  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
		                  << WriteReadyCount(false) << ","
		                  << pending_buffers_.size() << ","
		                  << active_buffers_.size() << ")";
	}

	// Top up routing tokens: one token per writable buffer beyond those already
	// outstanding.
	if (requests_)
	{
		TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
		auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
		auto available_buffers = WriteReadyCount(overwrite_mode_);

		TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
		                 << ", tokens_to_send: " << available_buffers - outstanding_tokens;

		if (available_buffers > outstanding_tokens)
		{
			auto tokens_to_send = available_buffers - outstanding_tokens;

			while (tokens_to_send > 0)
			{
				TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
				requests_->SendRoutingToken(1, run_id_);
				tokens_to_send--;
			}
		}
	}

	metric_data_.event_count += counter;
	metric_data_.event_size += eventSize;

	if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
	{
		TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
		metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
		if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
		metric_data_ = MetricData();

		metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
		metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
		if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);

		// Classify every buffer by its semaphore state for the occupancy metrics.
		auto bufferReport = GetBufferReport();
		int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Full; });
		int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Empty; });
		int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Writing; });
		int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Reading; });
		auto total = size();
		TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;

		metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
		metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
		metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
		metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
		if (total > 0)
		{
			metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
			metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
		}

		last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
	}
	TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
}
1165 
1166 void artdaq::SharedMemoryEventManager::send_init_frag_()
1167 {
1168  if (init_fragment_ != nullptr)
1169  {
1170  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1171 
1172 #if 0
1173  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1174  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1175  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1176  ostream.close();
1177 #endif
1178 
1179  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1180  TLOG(TLVL_TRACE) << "Init Fragment sent";
1181  }
1182  else if (send_init_fragments_)
1183  {
1184  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1185  }
1186 }
1187 
// NOTE(review): signature line missing from this excerpt; per the class
// documentation this is void SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
// — stores the first Init Fragment received and broadcasts it; later calls are no-ops.
{
	// NOTE(review): "!init_fragment_ || init_fragment_ == nullptr" is redundant —
	// both operands test the same null-ness of the smart pointer.
	if (!init_fragment_ || init_fragment_ == nullptr)
	{
		// Take ownership of the caller's Fragment, then broadcast it.
		init_fragment_.swap(frag);
		send_init_frag_();
	}
}
1196 
// NOTE(review): signature line missing from this excerpt; per the class
// documentation this is void SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
// — updates the internally-stored copy of the art configuration.
{
	TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
	// Regenerate the art configuration file only when the ParameterSet actually
	// changed or no configuration file has been generated yet.
	if (art_pset != current_art_pset_ || !current_art_config_file_)
	{
		current_art_pset_ = art_pset;
		current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
	}
	TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
}
1207 
// Expose this class's allowed FHiCL configuration when the installed
// messagefacility version supports the macro.
#if MESSAGEFACILITY_HEX_VERSION >= 0x20103
FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
#endif
void RunArt(std::shared_ptr< art_config_file > config_file, std::shared_ptr< std::atomic< pid_t >> pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
void ShutdownArtProcesses(std::set< pid_t > &pids)
Shutdown a set of art processes.
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
The SharedMemoryEventManager is a SharedMemoryManger which tracks events as they are built...
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
The RequestSender contains methods used to send data requests and Routing tokens. ...
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void UpdateArtConfiguration(fhicl::ParameterSet art_pset)
Updates the internally-stored copy of the art configuration.
void StartArt()
Start all the art processes.
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
subrun_id_t GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
Get the subrun number that the given Sequence ID would be assigned to.
void rolloverSubrun()
Add a subrun transition immediately after the highest currently defined sequence ID.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
bool endOfData()
Indicate that the end of input has been reached to the art processes.
RawEvent::subrun_id_t subrun_id_t
Copy RawEvent::subrun_id_t into local scope.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.
void CheckPendingBuffers()
Check for buffers which are ready to be marked incomplete and released to art and issue tokens for an...