artdaq  v3_05_00
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include <sys/wait.h>
6 #include "artdaq-core/Core/StatisticsCollection.hh"
7 #include "artdaq-core/Utilities/TraceLock.hh"
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
13 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
14 
15 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
16  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
17  pset.get<size_t>("buffer_count"),
18  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
19  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
20  !pset.get<bool>("broadcast_mode", false))
21  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
22  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
23  , queue_size_(pset.get<size_t>("buffer_count"))
24  , run_id_(0)
25  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
26  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
27  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
28  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
29  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
30  , running_(false)
31  , buffer_writes_pending_()
32  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
33  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
34  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
35  , last_backpressure_report_time_(std::chrono::steady_clock::now())
36  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
37  , metric_data_()
38  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
39  , run_event_count_(0)
40  , run_incomplete_event_count_(0)
41  , subrun_event_count_(0)
42  , subrun_incomplete_event_count_(0)
43  , oversize_fragment_count_(0)
44  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
45  , art_processes_()
46  , restart_art_(false)
47  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
48  , manual_art_(pset.get<bool>("manual_art", false))
49  , current_art_pset_(art_pset)
50  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
51  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
52  , requests_(nullptr)
53  , data_pset_(pset)
54  , dropped_data_()
55  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
56  pset.get<size_t>("broadcast_buffer_count", 10),
57  pset.get<size_t>("broadcast_buffer_size", 0x100000),
58  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
59 {
60  subrun_event_map_[0] = 1;
61  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
62  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
63 
64  if (pset.get<bool>("use_art", true) == false)
65  {
66  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
67  num_art_processes_ = 0;
68  }
69  else
70  {
71  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
72  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
73  }
74  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
75 
76  if (overwrite_mode_ && num_art_processes_ > 0)
77  {
78  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
79  }
80  else if (overwrite_mode_)
81  {
82  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
83  }
84 
85  for (size_t ii = 0; ii < size(); ++ii)
86  {
87  buffer_writes_pending_[ii] = 0;
88  }
89 
90  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
91 
92  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
93  SetRank(my_rank);
94  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
95 
96  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
97 }
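// A minimal configuration sketch (assumed, illustrative values) matching the pset.get<> calls in
// the constructor above; "makeExampleConfig" is a hypothetical helper, not part of artdaq.
// fhicl::ParameterSet is already available through the headers included at the top of this file.
static fhicl::ParameterSet makeExampleConfig()
{
  fhicl::ParameterSet pset;
  pset.put("buffer_count", 20);                  // required: number of shared-memory buffers
  pset.put("expected_fragments_per_event", 2);   // required: completion criterion per event
  pset.put("max_fragment_size_bytes", 0x100000); // used when max_event_size_bytes is absent
  pset.put("use_art", false);                    // skip launching art processes in this sketch
  return pset;
}
// Usage: artdaq::SharedMemoryEventManager mgr(makeExampleConfig(), fhicl::ParameterSet());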
98 
99 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
100 {
101  TLOG(TLVL_TRACE) << "DESTRUCTOR";
102  if (running_) endOfData();
103  TLOG(TLVL_TRACE) << "Destructor END";
104 }
105 
106 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
107 {
108  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
109  << ", sequence_id=" << frag.sequence_id;
110  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
111  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
112  if (buffer == -1) return false;
113  if (buffer == -2)
114  {
115  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
116  return true;
117  }
118 
119  auto hdr = getEventHeader_(buffer);
120  if (update_run_ids_)
121  {
122  hdr->run_id = run_id_;
123  }
124  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
125 
126  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
127  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
128 
129  TLOG(TLVL_TRACE) << "Checking for complete event";
130  auto fragmentCount = GetFragmentCount(frag.sequence_id);
131  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
132  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
133  << ", fragmentCount=" << fragmentCount
134  << ", num_fragments_per_event=" << num_fragments_per_event_
135  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
136 
137  complete_buffer_(buffer);
138  if (requests_) requests_->SendRequest(true);
139 
140  TLOG(TLVL_TRACE) << "AddFragment END";
141  return true;
142 }
143 
144 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
145 {
146  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
147  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
148  auto data = frag->headerAddress();
149  auto start = std::chrono::steady_clock::now();
150  bool sts = false;
151  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
152  {
153  sts = AddFragment(hdr, data);
154  if (!sts) usleep(1000);
155  }
156  if (!sts)
157  {
158  outfrag = std::move(frag);
159  }
160  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
161  return sts;
162 }
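// Hedged usage sketch for the overload above: try for up to one second to stage the Fragment;
// on failure it is handed back through the third argument so the caller can retry or log it.
// The names "mgr", "fragPtr", and "rejected" are assumptions for illustration only.
//   artdaq::FragmentPtr rejected;
//   if (!mgr.AddFragment(std::move(fragPtr), 1000000, rejected))
//   {
//     // fragPtr now lives in "rejected"; the event store had no room within the timeout.
//   }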
163 
164 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
165 {
166  TLOG(14) << "WriteFragmentHeader BEGIN";
167  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
168 
169  if (buffer < 0)
170  {
171  if (buffer == -1 && !dropIfNoBuffersAvailable)
172  {
173  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
174  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
175  {
176  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
177  last_backpressure_report_time_ = std::chrono::steady_clock::now();
178  }
179  return nullptr;
180  }
181  if (buffer == -2)
182  {
183  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
184  }
185  else
186  {
187  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
188  }
189  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
190 
191  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
192  return dropped_data_[frag.fragment_id]->dataBegin();
193  }
194 
195  last_backpressure_report_time_ = std::chrono::steady_clock::now();
196  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
197  // Increment this as soon as we know we want to use the buffer
198  buffer_writes_pending_[buffer]++;
199 
200  if (metricMan)
201  {
202  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
203  }
204 
205  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
206 
207  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
208 
209  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
210 
211  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
212  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
213  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
214 
215  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
216  if (frag.word_count - frag.num_words() > 0)
217  {
218  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
219 
220  if (!sts)
221  {
222  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
223  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
224  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
225  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
226 
227  oversize_fragment_count_++;
228 
229  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
230  {
231  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
232  }
233 
234  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
235  return dropped_data_[frag.fragment_id]->dataBegin();
236  }
237  }
238  TLOG(14) << "WriteFragmentHeader END";
239  return pos;
240 }
241 
242 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
243 {
244  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
245  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
246  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
247  if (buffer == -2) { return; }
248 
249  {
250  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
251 
252  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
253 
254  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
255 
256  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
257 
258  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
259  auto hdr = getEventHeader_(buffer);
260  if (update_run_ids_)
261  {
262  hdr->run_id = run_id_;
263  }
264  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
265 
266  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
267  TouchBuffer(buffer);
268 
269  buffer_writes_pending_[buffer]--;
270  if (buffer_writes_pending_[buffer] != 0)
271  {
272  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
273  return;
274  }
275  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
276  auto frag_count = GetFragmentCount(frag.sequence_id);
277  hdr->is_complete = frag_count == num_fragments_per_event_;
278  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
279 #if ART_SUPPORTS_DUPLICATE_EVENTS
280  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
281  {
282  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
283  }
284 #endif
285  }
286 
287  complete_buffer_(buffer);
288  if (requests_) requests_->SendRequest(true);
289  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
290 }
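// A hedged sketch of the intended calling sequence for WriteFragmentHeader/DoneWritingFragment
// above: reserve space, copy the payload words that follow the header, then mark the Fragment
// done. "stageFragment" is a hypothetical helper; the dropped-data error paths are omitted.
#include <cstring>   // memcpy
#include <unistd.h>  // usleep
static void stageFragment(artdaq::SharedMemoryEventManager& mgr, artdaq::Fragment& frag)
{
  auto hdr = *reinterpret_cast<artdaq::detail::RawFragmentHeader*>(frag.headerAddress());
  artdaq::RawDataType* dest = nullptr;
  while (dest == nullptr)  // nullptr signals back-pressure: all buffers full in reliable mode
  {
    dest = mgr.WriteFragmentHeader(hdr);
    if (dest == nullptr) usleep(1000);
  }
  // Copy the payload words that follow the Fragment header into the reserved region.
  std::memcpy(dest, frag.headerAddress() + hdr.num_words(),
              (hdr.word_count - hdr.num_words()) * sizeof(artdaq::RawDataType));
  mgr.DoneWritingFragment(hdr);  // decrements the pending-write count and checks completion
}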
291 
292 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
293 {
294  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
295 }
296 
297 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
298 {
299  if (buffer == -1) return 0;
300  ResetReadPos(buffer);
301  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
302 
303  size_t count = 0;
304 
305  while (MoreDataInBuffer(buffer))
306  {
307  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
308  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
309  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
310  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
311  ++count;
312  }
313 
314  return count;
315 }
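// Layout assumed by the read loop above: each shared-memory buffer starts with a RawEventHeader,
// followed by contiguous (RawFragmentHeader + payload) blocks; each header's word_count gives
// the stride (in RawDataType words) to the next fragment header.
//
//   [RawEventHeader][RawFragmentHeader|payload][RawFragmentHeader|payload]...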
316 
317 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
318 {
319  do
320  {
321  auto start_time = std::chrono::steady_clock::now();
322  send_init_frag_();
323  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
324 
325  pid_t pid = 0;
326 
327  if (!manual_art_)
328  {
329  char* filename = new char[config_file->getFileName().length() + 1];
330  strcpy(filename, config_file->getFileName().c_str());
331 
332 #if DEBUG_ART
333  std::string debugArgS = "--config-out=" + app_name + "_art.out";
334  char* debugArg = new char[debugArgS.length() + 1];
335  strcpy(debugArg, debugArgS.c_str());
336 
337  std::vector<char*> args{(char*)"art", (char*)"-c", filename, debugArg, NULL};
338 #else
339  std::vector<char*> args{(char*)"art", (char*)"-c", filename, NULL};
340 #endif
341 
342  pid = fork();
343  if (pid == 0)
344  { /* child */
345  // 23-May-2018, KAB: added the setting of the partition number env var
346  // in the environment of the child art process so that Globals.hh
347  // will pick it up there and provide it to the artdaq classes that
348  // are used in data transfers, etc. within the art process.
349  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
350  std::string envVarValue = std::to_string(GetPartitionNumber());
351  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
352  {
353  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
354  << "\" in the environment of a child art process. "
355  << "This may result in incorrect TCP port number "
356  << "assignments or other issues, and data may "
357  << "not flow through the system correctly.";
358  }
359  envVarKey = "ARTDAQ_APPLICATION_NAME";
360  envVarValue = app_name;
361  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
362  {
363  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
364  << "\" in the environment of a child art process. ";
365  }
366  envVarKey = "ARTDAQ_RANK";
367  envVarValue = std::to_string(my_rank);
368  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
369  {
370  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
371  << "\" in the environment of a child art process. ";
372  }
373 
374  execvp("art", &args[0]);
375  delete[] filename;
376  exit(1);
377  }
378  delete[] filename;
379  }
380  else
381  {
382  //Using cin/cout here to ensure console is active (artdaqDriver)
383  std::cout << "Please run the following command in a separate terminal:" << std::endl
384  << "art -c " << config_file->getFileName() << std::endl
385  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
386  << "Finally, return to this window and enter the pid: " << std::endl;
387  std::cin >> pid;
388  }
389  *pid_out = pid;
390 
391  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
392  {
393  std::unique_lock<std::mutex> lk(art_process_mutex_);
394  art_processes_.insert(pid);
395  }
396  siginfo_t status;
397  auto sts = waitid(P_PID, pid, &status, WEXITED);
398  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
399  {
400  std::unique_lock<std::mutex> lk(art_process_mutex_);
401  art_processes_.erase(pid);
402  }
403  if (sts < 0)
404  {
405  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
406  }
407  else if (status.si_code == CLD_EXITED && status.si_status == 0)
408  {
409  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
410  }
411  else
412  {
413  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
414  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
415 
416  auto exit_type = "exited with status code";
417  switch (status.si_code)
418  {
419  case CLD_DUMPED:
420  case CLD_KILLED:
421  exit_type = "was killed with signal";
422  break;
423  case CLD_EXITED:
424  default:
425  break;
426  }
427 
428  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
429  << "art process " << pid << " " << exit_type << " " << status.si_status
430  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
431  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
432  << (restart_art_ ? "restarting" : "not restarting");
433  }
434  } while (restart_art_);
435 }
436 
437 void artdaq::SharedMemoryEventManager::StartArt()
438 {
439  restart_art_ = always_restart_art_;
440  if (num_art_processes_ == 0) return;
441  for (size_t ii = 0; ii < num_art_processes_; ++ii)
442  {
443  StartArtProcess(current_art_pset_);
444  }
445 }
446 
447 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
448 {
449  static std::mutex start_art_mutex;
450  std::unique_lock<std::mutex> lk(start_art_mutex);
451  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
452  restart_art_ = always_restart_art_;
453  auto initialCount = GetAttachedCount();
454  auto startTime = std::chrono::steady_clock::now();
455 
456  if (pset != current_art_pset_ || !current_art_config_file_)
457  {
458  current_art_pset_ = pset;
459  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
460  }
461  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
462  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
463  thread.detach();
464 
465  auto currentCount = GetAttachedCount() - initialCount;
466  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
467  {
468  usleep(10000);
469  currentCount = GetAttachedCount() - initialCount;
470  }
471  if ((currentCount < 1 || *pid <= 0) && manual_art_)
472  {
473  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
474  return 0;
475  }
476  else if (currentCount < 1 || *pid <= 0)
477  {
478  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
479  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
480  return 0;
481  }
482  else
483  {
484  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
485  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
486 
487  return *pid;
488  }
489 }
490 
491 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
492 {
493  restart_art_ = false;
494  //current_art_config_file_ = nullptr;
495  //current_art_pset_ = fhicl::ParameterSet();
496 
497  auto check_pids = [&](bool print) {
498  std::unique_lock<std::mutex> lk(art_process_mutex_);
499  for (auto pid = pids.begin(); pid != pids.end();)
500  {
501  // 08-May-2018, KAB: protect against killing invalid PIDS
502 
503  if (*pid <= 0)
504  {
505  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
506  << ") from the shutdown list.";
507  pid = pids.erase(pid);
508  }
509  else if (kill(*pid, 0) < 0)
510  {
511  pid = pids.erase(pid);
512  }
513  else
514  {
515  if (print) std::cout << *pid << " ";
516  ++pid;
517  }
518  }
519  };
520  auto count_pids = [&]() {
521  std::unique_lock<std::mutex> lk(art_process_mutex_);
522  return pids.size();
523  };
524  check_pids(false);
525  if (count_pids() == 0)
526  {
527  TLOG(14) << "All art processes already exited, nothing to do.";
528  usleep(1000);
529  return;
530  }
531 
532  if (!manual_art_)
533  {
534  {
535  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
536  std::unique_lock<std::mutex> lk(art_process_mutex_);
537  for (auto pid : pids)
538  {
539  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
540  kill(pid, SIGQUIT);
541  }
542  }
543 
544  int graceful_wait_ms = 5000;
545  int int_wait_ms = 1000;
546 
547  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
548  for (int ii = 0; ii < graceful_wait_ms; ++ii)
549  {
550  usleep(1000);
551 
552  check_pids(false);
553  if (count_pids() == 0)
554  {
555  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
556  return;
557  }
558  }
559 
560  {
561  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
562  std::unique_lock<std::mutex> lk(art_process_mutex_);
563  for (auto pid : pids)
564  {
565  kill(pid, SIGINT);
566  }
567  }
568 
569  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
570  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
571  {
572  usleep(1000);
573 
574  check_pids(false);
575 
576  if (count_pids() == 0)
577  {
578  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
579  return;
580  }
581  }
582 
583  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
584  while (count_pids() > 0)
585  {
586  {
587  std::unique_lock<std::mutex> lk(art_process_mutex_);
588  kill(*pids.begin(), SIGKILL);
589  usleep(1000);
590  }
591  check_pids(false);
592  }
593  }
594  else
595  {
596  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
597  while (count_pids() > 0)
598  {
599  std::cout << "The following PIDs are running: ";
600  check_pids(true);
601  std::cout << std::endl;
602  std::string ignored;
603  std::cin >> ignored;
604  }
605  }
606 }
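// Shutdown escalation used above when manual_art_ is false: SIGQUIT to every art process and
// wait up to graceful_wait_ms (5000 ms); then SIGINT and wait int_wait_ms (1000 ms); finally
// SIGKILL one process at a time until none remain. In manual_art_ mode the operator is asked
// to stop the processes by hand instead.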
607 
608 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
609 {
610  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
611  if (restart_art_ || !always_restart_art_) // Art is running
612  {
613  endOfData();
614  }
615  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
616  {
617  broadcasts_.MarkBufferEmpty(ii, true);
618  }
619  if (newRun == 0) newRun = run_id_ + 1;
620 
621  if (art_pset != current_art_pset_ || !current_art_config_file_)
622  {
623  current_art_pset_ = art_pset;
624  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
625  }
626 
627  if (n_art_processes != -1)
628  {
629  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
630  num_art_processes_ = n_art_processes;
631  }
632  startRun(newRun);
633  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
634 }
635 
636 bool artdaq::SharedMemoryEventManager::endOfData()
637 {
638  running_ = false;
639  init_fragment_.reset(nullptr);
640  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
641  restart_art_ = false;
642 
643  size_t initialStoreSize = GetIncompleteEventCount();
644  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
645  << " stale events from the SharedMemoryEventManager.";
646  int counter = initialStoreSize;
647  while (active_buffers_.size() > 0 && counter > 0)
648  {
649  complete_buffer_(*active_buffers_.begin());
650  counter--;
651  }
652  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
653  << " stale events in the SharedMemoryEventManager.";
654 
655  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
656  auto start = std::chrono::steady_clock::now();
657  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
658  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
659 
660  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
661 
662  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
663  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
664  {
665  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
666  if (temp != lastReadCount)
667  {
668  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
669  lastReadCount = temp;
670  start = std::chrono::steady_clock::now();
671  }
672  if (lastReadCount > 0)
673  {
674  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
675  usleep(outstanding_buffer_wait_time);
676  }
677  }
678 
679  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
680  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
681 
682  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
683  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
684  bool success = broadcastFragment_(std::move(outFrag), outFrag);
685  if (!success)
686  {
687  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
688  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
689  {
690  broadcasts_.MarkBufferEmpty(ii, true);
691  }
692  broadcastFragment_(std::move(outFrag), outFrag);
693  }
694  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
695 
696  if (get_art_process_count_() > 0)
697  {
698  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
699  if (end_of_data_wait_us == 0)
700  {
701  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
702  end_of_data_wait_us = 100 * 1000000;
703  }
704 
705  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
706  for (size_t ii = 0; ii < sleep_count; ++ii)
707  {
708  usleep(10000);
709  if (get_art_process_count_() == 0) break;
710  }
711  }
712 
713  while (get_art_process_count_() > 0)
714  {
715  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
716 
717  ShutdownArtProcesses(art_processes_);
718  }
719  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
720 
721  ResetAttachedCount();
722 
723  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
724  for (size_t ii = 0; ii < size(); ++ii)
725  {
726  MarkBufferEmpty(ii, true);
727  }
728  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
729  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
730  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
731  // {
732  // broadcasts_.MarkBufferEmpty(ii, true);
733  // }
734  released_incomplete_events_.clear();
735 
736  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender";
737  requests_.reset(nullptr);
738 
739  TLOG(TLVL_DEBUG) << "endOfData END";
740  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
741  return true;
742 }
743 
744 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
745 {
746  running_ = true;
747  init_fragment_.reset(nullptr);
748  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
749  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
750  {
751  broadcasts_.MarkBufferEmpty(ii, true);
752  }
753  StartArt();
754  run_id_ = runID;
755  {
756  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
757  subrun_event_map_.clear();
758  subrun_event_map_[0] = 1;
759  }
760  run_event_count_ = 0;
761  run_incomplete_event_count_ = 0;
762  requests_.reset(new RequestSender(data_pset_));
763  if (requests_)
764  {
765  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
766  requests_->SendRoutingToken(queue_size_, run_id_);
767  }
768  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
769  << ", max queue size = "
770  << queue_size_
771  << ", queue size = "
772  << GetLockedBufferCount();
773  if (metricMan)
774  {
775  metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint);
776  }
777 }
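// Lifecycle sketch (assumed driver code, hedged): a run is normally bracketed by startRun and
// endRun, with endOfData used at end-of-data/shutdown to flush buffers and stop art.
//   mgr.startRun(101);          // starts art, resets counters, creates the RequestSender
//   ... AddFragment / WriteFragmentHeader + DoneWritingFragment calls ...
//   mgr.endRun();               // broadcasts an EndOfRun Fragment
//   mgr.endOfData();            // flushes remaining events and shuts art processes down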
778 
779 bool artdaq::SharedMemoryEventManager::endRun()
780 {
781  TLOG(TLVL_INFO) << "Ending run " << run_id_;
782  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
783  static_cast<double>(sizeof(Fragment::value_type))))));
784 
785  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
786  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
787  *endOfRunFrag->dataBegin() = my_rank;
788  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
789 
790  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
791  run_event_count_ = 0;
792  run_incomplete_event_count_ = 0;
793  oversize_fragment_count_ = 0;
794  {
795  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
796  subrun_event_map_.clear();
797  subrun_event_map_[0] = 1;
798  }
799  return true;
800 }
801 
802 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
803 {
804  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
805  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
806 
807  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
808 
809  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
810  subrun_event_map_[boundary] = subrun;
811  while (subrun_event_map_.size() > max_subrun_event_map_length_)
812  {
813  subrun_event_map_.erase(subrun_event_map_.begin());
814  }
815 }
816 
817 void artdaq::SharedMemoryEventManager::rolloverSubrun()
818 {
819  Fragment::sequence_id_t seqID = 0;
820  subrun_id_t subrun = 0;
821  {
822  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
823  for (auto& it : subrun_event_map_)
824  {
825  if (it.first >= seqID) seqID = it.first + 1;
826  if (it.second >= subrun) subrun = it.second + 1;
827  }
828  }
829  rolloverSubrun(seqID, subrun);
830 }
831 
832 void artdaq::SharedMemoryEventManager::sendMetrics()
833 {
834  if (metricMan)
835  {
836  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
837  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
838  }
839 
840  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
841  {
842  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
843  return;
844 
845  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
846  std::ostringstream oss;
847  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
848  for (auto& ev : active_buffers_)
849  {
850  auto hdr = getEventHeader_(ev);
851  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
852  }
853  TLOG(TLVL_DEBUG) << oss.str();
854  }
855 }
856 
857 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
858 {
859  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
860  auto buffer = broadcasts_.GetBufferForWriting(false);
861  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
862  auto start_time = std::chrono::steady_clock::now();
863  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
864  {
865  usleep(10000);
866  buffer = broadcasts_.GetBufferForWriting(false);
867  }
868  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
869  if (buffer == -1)
870  {
871  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
872  outFrag.swap(frag);
873  return false;
874  }
875 
876  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
877  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
878  hdr->run_id = run_id_;
879  hdr->subrun_id = GetSubrunForSequenceID(frag->sequenceID());
880  hdr->sequence_id = frag->sequenceID();
881  hdr->is_complete = true;
882  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
883 
884  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
885  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
886 
887  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
888  broadcasts_.MarkBufferFull(buffer, -1);
889  outFrag.swap(frag);
890  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
891  return true;
892 }
893 
894 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
895 {
896  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
897 }
898 
899 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
900 {
901  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
902 
903  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
904  auto it = subrun_event_map_.begin();
905  subrun_id_t subrun = 1;
906 
907  while (it != subrun_event_map_.end() && it->first <= seqID)
908  {
909  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
910  subrun = it->second;
911  ++it;
912  }
913 
914  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
915  return subrun;
916 }
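// Worked example for the lookup above, with assumed map contents: after rolloverSubrun(1000, 2)
// the map holds { {0,1}, {1000,2} }, so GetSubrunForSequenceID(999) returns 1 and
// GetSubrunForSequenceID(1000) returns 2; the boundary sequence ID is the first event of the
// new subrun.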
917 
918 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
919 {
920  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
921  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
922 
923  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
924 
925  auto buffers = GetBuffersOwnedByManager();
926  for (auto& buf : buffers)
927  {
928  auto hdr = getEventHeader_(buf);
929  if (hdr->sequence_id == seqID)
930  {
931  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
932  return buf;
933  }
934  }
935 
936 #if !ART_SUPPORTS_DUPLICATE_EVENTS
937  if (released_incomplete_events_.count(seqID))
938  {
939  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
940  return -2;
941  }
942 #endif
943 
944  if (!create_new) return -1;
945 
946  check_pending_buffers_(lk);
947  int new_buffer = GetBufferForWriting(false);
948 
949  if (new_buffer == -1)
950  {
951  new_buffer = GetBufferForWriting(overwrite_mode_);
952  }
953 
954  if (new_buffer == -1) return -1;
955  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
956  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
957  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
958  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
959  auto hdr = getEventHeader_(new_buffer);
960  hdr->is_complete = false;
961  hdr->run_id = run_id_;
962  hdr->subrun_id = GetSubrunForSequenceID(seqID);
963  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
964  hdr->sequence_id = seqID;
965  buffer_writes_pending_[new_buffer] = 0;
966  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
967  SetMFIteration("Sequence ID " + std::to_string(seqID));
968 
969  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
970  active_buffers_.insert(new_buffer);
971  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
972  << size() << ","
973  << ReadReadyCount() << ","
974  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
975  << WriteReadyCount(false) << ","
976  << pending_buffers_.size() << ","
977  << active_buffers_.size() << ")";
978 
979  if (requests_)
980  {
981  if (timestamp != Fragment::InvalidTimestamp)
982  {
983  requests_->AddRequest(seqID, timestamp);
984  }
985  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
986  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
987  else
988  {
989  requests_->SendRequest();
990  }
991  }
992  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
993  return new_buffer;
994 }
995 
996 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
997 {
998  if (buffer == -1) return true;
999  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1000  {
1001  return true;
1002  }
1003  ResetReadPos(buffer);
1004  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1005  return MoreDataInBuffer(buffer);
1006 }
1007 
1008 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1009 {
1010  auto hdr = getEventHeader_(buffer);
1011  if (hdr->is_complete)
1012  {
1013  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1014 
1015  {
1016  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1017 
1018  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1019  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1020  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1021  active_buffers_.erase(buffer);
1022  pending_buffers_.insert(buffer);
1023 
1024  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1025  << size() << ","
1026  << ReadReadyCount() << ","
1027  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1028  << WriteReadyCount(false) << ","
1029  << pending_buffers_.size() << ","
1030  << active_buffers_.size() << ")";
1031  }
1032  if (requests_)
1033  {
1034  requests_->RemoveRequest(hdr->sequence_id);
1035  }
1036  }
1037  CheckPendingBuffers();
1038 }
1039 
1040 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1041 {
1042  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1043 }
1044 
1045 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1046 {
1047  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1048  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1049  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1050  check_pending_buffers_(lk);
1051 }
1052 
1053 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1054 {
1055  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1056 
1057  auto buffers = GetBuffersOwnedByManager();
1058  for (auto buf : buffers)
1059  {
1060  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1061  {
1062  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_buffers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1063  auto hdr = getEventHeader_(buf);
1064  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1065  {
1066  if (requests_)
1067  {
1068  requests_->RemoveRequest(hdr->sequence_id);
1069  }
1070  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1071  active_buffers_.erase(buf);
1072  pending_buffers_.insert(buf);
1073  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1074  << size() << ","
1075  << ReadReadyCount() << ","
1076  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1077  << WriteReadyCount(false) << ","
1078  << pending_buffers_.size() << ","
1079  << active_buffers_.size() << ")";
1080 
1081  run_incomplete_event_count_++;
1082  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1083  if (!released_incomplete_events_.count(hdr->sequence_id))
1084  {
1085  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1086  }
1087  else
1088  {
1089  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1090  }
1091  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1092  }
1093  }
1094  }
1095 
1096  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1097  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1098 
1099  auto counter = 0;
1100  double eventSize = 0;
1101  for (auto buf : sorted_buffers)
1102  {
1103  auto hdr = getEventHeader_(buf);
1104 
1105  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1106  << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
1107 
1108  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1109  MarkBufferFull(buf);
1110  run_event_count_++;
1111  counter++;
1112  eventSize += BufferDataSize(buf);
1113  pending_buffers_.erase(buf);
1114  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1115  << size() << ","
1116  << ReadReadyCount() << ","
1117  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1118  << WriteReadyCount(false) << ","
1119  << pending_buffers_.size() << ","
1120  << active_buffers_.size() << ")";
1121  }
1122 
1123  if (requests_)
1124  {
1125  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1126  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1127  auto available_buffers = WriteReadyCount(overwrite_mode_);
1128 
1129  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1130  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1131 
1132  if (available_buffers > outstanding_tokens)
1133  {
1134  auto tokens_to_send = available_buffers - outstanding_tokens;
1135 
1136  while (tokens_to_send > 0)
1137  {
1138  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1139  requests_->SendRoutingToken(1, run_id_);
1140  tokens_to_send--;
1141  }
1142  }
1143  }
1144 
1145  metric_data_.event_count += counter;
1146  metric_data_.event_size += eventSize;
1147 
1148  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1149  {
1150  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1151  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1152  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1153  metric_data_ = MetricData();
1154 
1155  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1156  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1157  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1158 
1159  auto bufferReport = GetBufferReport();
1160  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Full; });
1161  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Empty; });
1162  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Writing; });
1163  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Reading; });
1164  auto total = size();
1165  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1166 
1167  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1168  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1169  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1170  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1171  if (total > 0)
1172  {
1173  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1174  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1175  }
1176 
1177  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1178  }
1179  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1180 }
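// Token arithmetic used above, with assumed numbers: if 10 routing tokens have been sent and
// 7 events released this run, outstanding_tokens is 3; with 5 currently writable buffers,
// 5 - 3 = 2 new tokens are sent, so senders never hold more credit than there are free buffers.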
1181 
1182 void artdaq::SharedMemoryEventManager::send_init_frag_()
1183 {
1184  if (init_fragment_ != nullptr)
1185  {
1186  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1187 
1188 #if 0
1189  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1190  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1191  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1192  ostream.close();
1193 #endif
1194 
1195  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1196  TLOG(TLVL_TRACE) << "Init Fragment sent";
1197  }
1198  else if (send_init_fragments_)
1199  {
1200  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1201  }
1202 }
1203 
1204 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
1205 {
1206  if (!init_fragment_ || init_fragment_ == nullptr)
1207  {
1208  init_fragment_.swap(frag);
1209  send_init_frag_();
1210  }
1211 }
1212 
1213 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1214 {
1215  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1216  if (art_pset != current_art_pset_ || !current_art_config_file_)
1217  {
1218  current_art_pset_ = art_pset;
1219  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1220  }
1221  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1222 }
1223 
1224 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
1225 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
1226 #endif