artdaq  v3_03_00
SharedMemoryEventManager.cc
1 
2 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
3 
4 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
5 #include "artdaq-core/Core/StatisticsCollection.hh"
6 #include "artdaq-core/Utilities/TraceLock.hh"
7 #include <sys/wait.h>
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
13 
/// SharedMemoryEventManager constructor.
/// Attaches to (creating if necessary) the shared-memory event store and a
/// separate "broadcast" segment, and caches all configuration needed to feed
/// events to attached art processes.
/// @param pset     Event-store configuration (buffer count/size, timeouts, modes)
/// @param art_pset Configuration written to the file handed to each art process
/// Throws cet::exception if the shared-memory segment cannot be attached.
14 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
// Base-class segment: key defaults to 0xBEE70000+pid; buffer size is either
// explicit ("max_event_size_bytes") or fragments-per-event * max fragment size.
15  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
16  pset.get<size_t>("buffer_count"),
17  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
18  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
// Last argument is presumably "destructive read" mode, disabled for broadcast_mode — TODO confirm against SharedMemoryManager.
19  !pset.get<bool>("broadcast_mode", false))
20  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
21  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
22  , queue_size_(pset.get<size_t>("buffer_count"))
23  , run_id_(0)
24  , subrun_id_(0)
25  , subrun_rollover_event_(Fragment::InvalidSequenceID)
26  , last_released_event_(0)
27  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
28  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
// Overwrite mode is implied whenever art is disabled or broadcast mode is requested.
29  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
30  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
31  , running_(false)
32  , buffer_writes_pending_()
33  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
34  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
35  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
36  , metric_data_()
37  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
38  , run_event_count_(0)
39  , run_incomplete_event_count_(0)
40  , subrun_event_count_(0)
41  , subrun_incomplete_event_count_(0)
42  , oversize_fragment_count_(0)
43  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
44  , art_processes_()
45  , restart_art_(false)
46  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
47  , manual_art_(pset.get<bool>("manual_art", false))
48  , current_art_pset_(art_pset)
49  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
50  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 100000))
51  , requests_(nullptr)
52  , data_pset_(pset)
53  , dropped_data_()
// Second, smaller segment used to broadcast system Fragments (Init/EndOfRun/EndOfSubrun/EndOfData).
54  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
55  pset.get<size_t>("broadcast_buffer_count", 10),
56  pset.get<size_t>("broadcast_buffer_size", 0x100000),
57  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
58 {
// Every buffer must be able to hold at least an event header plus one fragment header.
59  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
60  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
61 
62  if (pset.get<bool>("use_art", true) == false)
63  {
64  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
65  num_art_processes_ = 0;
66  }
67  else
68  {
69  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
70  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
71  }
72  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
73 
74  if (overwrite_mode_ && num_art_processes_ > 0)
75  {
76  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
77  }
78  else if (overwrite_mode_)
79  {
80  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
81  }
82 
// Start with no pending direct-writes on any buffer.
83  for (size_t ii = 0; ii < size(); ++ii)
84  {
85  buffer_writes_pending_[ii] = 0;
86  }
87 
88  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
89 
90  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
91  SetRank(my_rank);
92  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
93 
94 
95  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
96 }
97 
/// Destructor body. NOTE(review): the signature line is absent from this
/// listing; from context this is ~SharedMemoryEventManager() — confirm.
/// If a run is still in progress, endOfData() flushes pending events and
/// shuts down the art processes before the base class detaches.
99 {
100  TLOG(TLVL_TRACE) << "DESTRUCTOR";
101  if (running_) endOfData();
102  TLOG(TLVL_TRACE) << "Destructor END";
103 }
104 
/// Copy a complete Fragment (header + payload) into the event buffer for its
/// sequence ID.
/// @param frag    Fragment header; word_count covers header plus payload
/// @param dataPtr Start of the full Fragment in memory (header included)
/// @return false only when no buffer is currently available (caller may
///         retry); true when written OR when dropped because the event has
///         already been released.
105 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
106 {
107  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
108  << ", sequence_id=" << frag.sequence_id;
// Find (or create) the buffer for this sequence ID; -1 = none free, -2 = event already past.
109  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
110  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
111  if (buffer == -1) return false;
112  if (buffer == -2)
113  {
114  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
// true: the Fragment is consumed (dropped), so the caller must not retry.
115  return true;
116  }
117 
118  auto hdr = getEventHeader_(buffer);
119  if (update_run_ids_)
120  {
121  hdr->run_id = run_id_;
122  hdr->subrun_id = subrun_id_;
123  }
124 
125  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
126  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
127 
// The event is complete once the expected number of fragments has arrived and
// no direct-writer still holds this buffer.
128  TLOG(TLVL_TRACE) << "Checking for complete event";
129  auto fragmentCount = GetFragmentCount(frag.sequence_id);
130  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
131  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
132  << ", fragmentCount=" << fragmentCount
133  << ", num_fragments_per_event=" << num_fragments_per_event_
134  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
135 
136  complete_buffer_(buffer);
// Ask for more data now that this Fragment has been absorbed.
137  if (requests_) requests_->SendRequest(true);
138 
139  TLOG(TLVL_TRACE) << "AddFragment END";
140  return true;
141 }
142 
143 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
144 {
145  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
146  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
147  auto data = frag->headerAddress();
148  auto start = std::chrono::steady_clock::now();
149  bool sts = false;
150  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
151  {
152  sts = AddFragment(hdr, data);
153  if (!sts) usleep(1000);
154  }
155  if (!sts)
156  {
157  outfrag = std::move(frag);
158  }
159  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
160  return sts;
161 }
162 
/// Reserve space in the appropriate event buffer and write only the Fragment
/// header; the caller then fills in the payload directly and must finish with
/// DoneWritingFragment().
/// @return Pointer to where the payload should be written. On failure returns
///         nullptr (no buffer available, caller may retry) or a pointer into
///         a throw-away "dropped data" Fragment so the caller's write still
///         has a valid destination.
163 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
164 {
165  TLOG(14) << "WriteFragmentHeader BEGIN";
166  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
167 
// buffer == -1: no buffer free; buffer == -2: event already released.
168  if (buffer < 0)
169  {
170  if (buffer == -1 && !dropIfNoBuffersAvailable) return nullptr;
171  if (buffer == -2)
172  {
173  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
174  }
175  else
176  {
177  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
178  }
// Scratch Fragment sized for the payload; whatever the caller writes there is discarded.
179  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
180 
181  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
182  return dropped_data_[frag.fragment_id]->dataBegin();
183  }
184 
185  // Increment this as soon as we know we want to use the buffer
186  buffer_writes_pending_[buffer]++;
187 
188  if (metricMan)
189  {
190  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
191  }
192 
193  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
194 
195  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
196 
197  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
198 
199  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
// Write the header, then advance the write pointer past the (not-yet-written) payload.
200  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
201  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
202 
203  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
204  if (frag.word_count - frag.num_words() > 0)
205  {
206  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
207 
208  if (!sts)
209  {
// Payload does not fit: shrink the already-written header to header-only,
// invalidate its type, and redirect the caller's payload to scratch space.
210  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
211  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
212  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
213  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
214 
215  oversize_fragment_count_++;
216 
// Too many over-size fragments is treated as a fatal misconfiguration.
217  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
218  {
219  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
220  }
221 
222  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
223  return dropped_data_[frag.fragment_id]->dataBegin();
224  }
225  }
226  TLOG(14) << "WriteFragmentHeader END";
227  return pos;
228 
229 }
230 
/// Mark a direct write (started via WriteFragmentHeader) as finished. When
/// this was the last outstanding writer for the buffer, run the completion
/// bookkeeping and possibly release the event.
231 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
232 {
233  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
// The buffer must already exist (created earlier by WriteFragmentHeader), so
// "create" is false here; -1 indicates a logic error severe enough to Detach.
234  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
235  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
236  if (buffer == -2) { return; }
237 
238  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
239 
240  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
241 
242  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
243 
244  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
245 
246  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
247  auto hdr = getEventHeader_(buffer);
248  if (update_run_ids_)
249  {
250  hdr->run_id = run_id_;
251  hdr->subrun_id = subrun_id_;
252  }
253 
// Only the writer that brings the pending count to zero performs bookkeeping.
254  buffer_writes_pending_[buffer]--;
255  if (buffer_writes_pending_[buffer] != 0)
256  {
257  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
258  return;
259  }
260  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
261  auto frag_count = GetFragmentCount(frag.sequence_id);
262  hdr->is_complete = frag_count == num_fragments_per_event_;
263  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
264 #if ART_SUPPORTS_DUPLICATE_EVENTS
// If part of this event was already released incomplete, complete against the
// remembered fragment count instead of the nominal expectation.
265  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
266  {
267  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
268  }
269 #endif
270 
271  complete_buffer_(buffer);
272  if (requests_) requests_->SendRequest(true);
273  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
274 }
275 
276 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
277 {
278  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
279 }
280 
281 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
282 {
283  if (buffer == -1) return 0;
284  ResetReadPos(buffer);
285  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
286 
287  size_t count = 0;
288 
289  while (MoreDataInBuffer(buffer))
290  {
291  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
292  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
293  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
294  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
295  ++count;
296  }
297 
298  return count;
299 }
300 
301 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
302 {
303  do
304  {
305  auto start_time = std::chrono::steady_clock::now();
306  send_init_frag_();
307  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
308 
309  pid_t pid = 0;
310 
311  if (!manual_art_)
312  {
313  char* filename = new char[config_file->getFileName().length() + 1];
314  strcpy(filename, config_file->getFileName().c_str());
315 
316  std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
317  pid = fork();
318  if (pid == 0)
319  { /* child */
320  // 23-May-2018, KAB: added the setting of the partition number env var
321  // in the environment of the child art process so that Globals.hh
322  // will pick it up there and provide it to the artdaq classes that
323  // are used in data transfers, etc. within the art process.
324  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
325  std::string envVarValue = std::to_string(GetPartitionNumber());
326  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
327  {
328  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
329  << "\" in the environment of a child art process. "
330  << "This may result in incorrect TCP port number "
331  << "assignments or other issues, and data may "
332  << "not flow through the system correctly.";
333  }
334 
335  execvp("art", &args[0]);
336  delete[] filename;
337  exit(1);
338  }
339  delete[] filename;
340  }
341  else
342  {
343  //Using cin/cout here to ensure console is active (artdaqDriver)
344  std::cout << "Please run the following command in a separate terminal:" << std::endl
345  << "art -c " << config_file->getFileName() << std::endl
346  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
347  << "Finally, return to this window and enter the pid: " << std::endl;
348  std::cin >> pid;
349  }
350  *pid_out = pid;
351 
352  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
353  {
354  std::unique_lock<std::mutex> lk(art_process_mutex_);
355  art_processes_.insert(pid);
356  }
357  siginfo_t status;
358  auto sts = waitid(P_PID, pid, &status, WEXITED);
359  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
360  {
361  std::unique_lock<std::mutex> lk(art_process_mutex_);
362  art_processes_.erase(pid);
363  }
364  if (sts < 0)
365  {
366  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
367  }
368  else if (status.si_code == CLD_EXITED && status.si_status == 0)
369  {
370  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
371  }
372  else
373  {
374  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
375  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
376 
377  auto exit_type = "exited with status code";
378  switch (status.si_code)
379  {
380  case CLD_DUMPED:
381  case CLD_KILLED:
382  exit_type = "was killed with signal";
383  break;
384  case CLD_EXITED:
385  default:
386  break;
387  }
388 
389  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
390  << "art process " << pid << " " << exit_type << " " << status.si_status
391  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
392  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
393  << (restart_art_ ? "restarting" : "not restarting");
394  }
395  } while (restart_art_);
396 }
397 
/// StartArt body. NOTE(review): the signature line is absent from this
/// listing; from the call at ReconfigureArt/startRun context this is
/// StartArt() — confirm. Launches num_art_processes_ art instances using the
/// current art ParameterSet.
399 {
400  restart_art_ = always_restart_art_;
401  if (num_art_processes_ == 0) return;
402  for (size_t ii = 0; ii < num_art_processes_; ++ii)
403  {
404  StartArtProcess(current_art_pset_);
405  }
406 }
407 
/// StartArtProcess body. NOTE(review): the signature line is absent from this
/// listing; from context this is StartArtProcess(fhicl::ParameterSet pset),
/// returning the new art process's PID, or 0 on failure — confirm.
409 {
// Serialize concurrent starts so the attach-count arithmetic below is valid.
410  static std::mutex start_art_mutex;
411  std::unique_lock<std::mutex> lk(start_art_mutex);
412  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
413  restart_art_ = always_restart_art_;
414  auto initialCount = GetAttachedCount();
415  auto startTime = std::chrono::steady_clock::now();
416 
// Regenerate the art config file only when the ParameterSet actually changed.
417  if (pset != current_art_pset_ || !current_art_config_file_)
418  {
419  current_art_pset_ = pset;
420  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
421  }
422  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
// NOTE(review): the lambda captures the locals by reference but the thread is
// detached; if this function returns before RunArt copies its arguments the
// references dangle. The wait loop below makes that unlikely, not impossible
// — worth confirming/fixing upstream.
423  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
424  thread.detach();
425 
// Wait (up to 5 s, or indefinitely in manual mode) for the art process to
// attach to shared memory and report a valid PID.
426  auto currentCount = GetAttachedCount() - initialCount;
427  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
428  {
429  usleep(10000);
430  currentCount = GetAttachedCount() - initialCount;
431  }
432  if ((currentCount < 1 || *pid <= 0) && manual_art_)
433  {
434  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
435  return 0;
436  }
437  else if (currentCount < 1 || *pid <= 0)
438  {
439  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
440  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
441  return 0;
442  }
443  else
444  {
445  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
446  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
447 
448  return *pid;
449  }
450 
451 }
452 
/// ShutdownArtProcesses body. NOTE(review): the signature line is absent from
/// this listing; from the call site (ShutdownArtProcesses(art_processes_))
/// this takes the set of art PIDs by reference as "pids" — confirm.
/// Escalation: SIGQUIT (up to 5 s) -> SIGINT (up to 1 s) -> SIGKILL; in
/// manual mode the operator is asked to stop the processes instead.
454 {
455  restart_art_ = false;
456  //current_art_config_file_ = nullptr;
457  //current_art_pset_ = fhicl::ParameterSet();
458 
// Prune PIDs that are invalid or no longer alive (kill(pid, 0) probes for
// existence without sending a signal); optionally print the survivors.
459  auto check_pids = [&](bool print) {
460 
461  for (auto pid = pids.begin(); pid != pids.end();)
462  {
463  // 08-May-2018, KAB: protect against killing invalid PIDS
464 
465  std::unique_lock<std::mutex> lk(art_process_mutex_);
466  if (*pid <= 0)
467  {
468  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
469  << ") from the shutdown list.";
470  pid = pids.erase(pid);
471  }
472  else if (kill(*pid, 0) < 0)
473  {
474  pid = pids.erase(pid);
475  }
476  else
477  {
478  if (print) std::cout << *pid << " ";
479  ++pid;
480  }
481  }
482  };
483  check_pids(false);
484  if (pids.size() == 0)
485  {
486  TLOG(14) << "All art processes already exited, nothing to do.";
487  usleep(1000);
488  return;
489  }
490 
491  if (!manual_art_)
492  {
493  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
494  for (auto pid : pids)
495  {
496  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
497  kill(pid, SIGQUIT);
498  }
499 
500  int graceful_wait_ms = 5000;
501  int int_wait_ms = 1000;
502 
// Poll once per millisecond for graceful exits.
503  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
504  for (int ii = 0; ii < graceful_wait_ms; ++ii)
505  {
506  usleep(1000);
507 
508  check_pids(false);
509  if (pids.size() == 0)
510  {
511  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
512  return;
513  }
514  }
515 
516  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
517  for (auto pid : pids)
518  {
519  kill(pid, SIGINT);
520  }
521 
522  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
523  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
524  {
525  usleep(1000);
526 
527  check_pids(false);
528 
529  if (pids.size() == 0)
530  {
531  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
532  return;
533  }
534  }
535 
// Last resort: SIGKILL one PID at a time until the prune loop shows none left.
536  TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
537  while (pids.size() > 0)
538  {
539  kill(*pids.begin(), SIGKILL);
540  usleep(1000);
541 
542  check_pids(false);
543  }
544  }
545  else
546  {
// Manual mode: the operator started art by hand, so they must stop it too.
547  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
548  while (pids.size() > 0)
549  {
550  std::cout << "The following PIDs are running: ";
551  check_pids(true);
552  std::cout << std::endl;
553  std::string ignored;
554  std::cin >> ignored;
555  }
556  }
557 }
558 
559 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
560 {
561  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
562  if (restart_art_ || !always_restart_art_) // Art is running
563  {
564  endOfData();
565  }
566  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
567  {
568  broadcasts_.MarkBufferEmpty(ii, true);
569  }
570  if (newRun == 0) newRun = run_id_ + 1;
571 
572  if (art_pset != current_art_pset_ || !current_art_config_file_)
573  {
574  current_art_pset_ = art_pset;
575  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
576  }
577 
578  if (n_art_processes != -1)
579  {
580  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
581  num_art_processes_ = n_art_processes;
582  }
583  startRun(newRun);
584  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
585 }
586 
/// endOfData body. NOTE(review): the signature line is absent from this
/// listing; from context this is bool endOfData() — confirm.
/// Flushes stale incomplete events, waits for art to drain outstanding
/// buffers, broadcasts an EndOfData Fragment, shuts down remaining art
/// processes, then releases all buffers and the RequestSender.
588 {
589  running_ = false;
590  init_fragment_.reset(nullptr);
591  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
592  restart_art_ = false;
593 
// Force-complete events still waiting for fragments so art can consume them.
594  size_t initialStoreSize = GetIncompleteEventCount();
595  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
596  << " stale events from the SharedMemoryEventManager.";
597  int counter = initialStoreSize;
598  while (active_buffers_.size() > 0 && counter > 0)
599  {
600  complete_buffer_(*active_buffers_.begin());
601  counter--;
602  }
603  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
604  << " stale events in the SharedMemoryEventManager.";
605 
606 
// "Outstanding" = buffers ready to read plus buffers not yet writable again.
607  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
608  auto start = std::chrono::steady_clock::now();
609  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
610  auto end_of_data_wait_us = art_event_processing_time_us_ * lastReadCount;//size();
611 
// Poll no slower than every 100 ms.
612  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
613 
614  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
615  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
616  {
617  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
618  if (temp != lastReadCount)
619  {
620  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
621  lastReadCount = temp;
// Progress was made; restart the inactivity timer.
622  start = std::chrono::steady_clock::now();
623  }
624  if (lastReadCount > 0) {
625  TRACE(19,"About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu",outstanding_buffer_wait_time,lastReadCount,size(),end_of_data_wait_us );
626  usleep(outstanding_buffer_wait_time);
627  }
628  }
629 
630  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
631  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
632 
633  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
634  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
635  bool success = broadcastFragment_(std::move(outFrag), outFrag);
636  if (!success)
637  {
// Broadcast timed out; clear the broadcast buffers and try once more.
638  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
639  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
640  {
641  broadcasts_.MarkBufferEmpty(ii, true);
642  }
643  broadcastFragment_(std::move(outFrag), outFrag);
644  }
645  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
646 
// Give art a processing-time-scaled grace period to exit on its own.
647  if (get_art_process_count_() > 0)
648  {
649  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
650  if (end_of_data_wait_us == 0)
651  {
652  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
653  end_of_data_wait_us = 100 * 1000000;
654  }
655 
656  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
657  for (size_t ii = 0; ii < sleep_count; ++ii)
658  {
659  usleep(10000);
660  if (get_art_process_count_() == 0) break;
661  }
662  }
663 
// Anything still alive gets the SIGQUIT/SIGINT/SIGKILL escalation.
664  while (get_art_process_count_() > 0)
665  {
666  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
667 
668  ShutdownArtProcesses(art_processes_);
669  }
670  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
671 
672  ResetAttachedCount();
673 
674  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
675  for (size_t ii = 0; ii < size(); ++ii)
676  {
677  MarkBufferEmpty(ii, true);
678  }
679  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
680  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
681  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
682  // {
683  // broadcasts_.MarkBufferEmpty(ii, true);
684  // }
685  released_incomplete_events_.clear();
686 
687  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestReceiver";
688  requests_.reset(nullptr);
689 
690  TLOG(TLVL_DEBUG) << "endOfData END";
691  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
692  return true;
693 }
694 
/// startRun body. NOTE(review): the signature line is absent from this
/// listing; from context this is startRun(run_id_t runID) — confirm.
/// Resets per-run state, clears broadcast buffers, (re)starts art, and
/// creates a fresh RequestSender.
696 {
697  running_ = true;
698  init_fragment_.reset(nullptr);
699  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
700  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
701  {
702  broadcasts_.MarkBufferEmpty(ii, true);
703  }
704  StartArt();
705  run_id_ = runID;
706  subrun_id_ = 1;
707  subrun_rollover_event_ = Fragment::InvalidSequenceID;
708  last_released_event_ = 0;
709  requests_.reset(new RequestSender(data_pset_));
710  if (requests_)
711  {
// Advertise the full queue depth via a routing token.
712  requests_->SendRoutingToken(queue_size_);
713  }
714  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
715  << ", max queue size = "
716  << queue_size_
717  << ", queue size = "
718  << GetLockedBufferCount();
719  if (metricMan)
720  {
// Encode run.subrun as one number, subrun in the fractional part.
721  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
722  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
723  }
724 }
725 
/// startSubrun body. NOTE(review): the signature line is absent from this
/// listing; from context this increments the subrun counter, clears the
/// rollover boundary, and reports the run/subrun metric — confirm.
727 {
728  ++subrun_id_;
729  subrun_rollover_event_ = Fragment::InvalidSequenceID;
730  if (metricMan)
731  {
// Encode run.subrun as one number, subrun in the fractional part.
732  double runSubrun = run_id_ + ((double)subrun_id_ / 10000);
733  metricMan->sendMetric("Run Number", runSubrun, "Run:Subrun", 1, MetricMode::LastPoint);
734  }
735 }
736 
/// endRun body. NOTE(review): the signature line is absent from this listing;
/// from context this is bool endRun() — confirm. Broadcasts an EndOfRun
/// Fragment carrying this process's rank and resets per-run counters.
738 {
739  TLOG(TLVL_INFO) << "Ending run " << run_id_;
// Payload is a single value (my_rank); size rounded up to whole RawDataTypes.
740  FragmentPtr endOfRunFrag(new
741  Fragment(static_cast<size_t>
742  (ceil(sizeof(my_rank) /
743  static_cast<double>(sizeof(Fragment::value_type))))));
744 
745  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
746  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
747  *endOfRunFrag->dataBegin() = my_rank;
748  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
749 
750  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
751  run_event_count_ = 0;
752  run_incomplete_event_count_ = 0;
753  oversize_fragment_count_ = 0;
754  return true;
755 }
756 
/// endSubrun body. NOTE(review): the signature line is absent from this
/// listing; from context this is bool endSubrun() — confirm. Broadcasts an
/// EndOfSubrun Fragment (sequence ID = the rollover boundary) carrying this
/// process's rank, then resets the per-subrun counters.
758 {
759  TLOG(TLVL_INFO) << "Ending subrun " << subrun_id_;
// Payload is a single value (my_rank); size rounded up to whole RawDataTypes.
760  std::unique_ptr<artdaq::Fragment>
761  endOfSubrunFrag(new
762  Fragment(static_cast<size_t>
763  (ceil(sizeof(my_rank) /
764  static_cast<double>(sizeof(Fragment::value_type))))));
765 
766  TLOG(TLVL_DEBUG) << "Broadcasting EndOfSubrun Fragment";
767  endOfSubrunFrag->setSequenceID(subrun_rollover_event_);
768  endOfSubrunFrag->setSystemType(Fragment::EndOfSubrunFragmentType);
769  *endOfSubrunFrag->dataBegin() = my_rank;
770 
771  broadcastFragment_(std::move(endOfSubrunFrag), endOfSubrunFrag);
772 
773  TLOG(TLVL_INFO) << "Subrun " << subrun_id_ << " in run " << run_id_ << " has ended. There were " << subrun_event_count_ << " events in this subrun.";
774  subrun_event_count_ = 0;
775  subrun_incomplete_event_count_ = 0;
776 
777  return true;
778 }
779 
/// rolloverSubrun body. NOTE(review): the signature line is absent from this
/// listing; from context it takes the sequence-ID "boundary" at which the
/// next subrun should begin — confirm.
781 {
782  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
783  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
784 
785  if (boundary < last_released_event_)
786  {
// A slightly-late request (< 100 events behind) is only a warning and is
// still honored; anything older is refused.
787  auto logLevel = TLVL_ERROR;
788  bool processAnyway = false;
789  if (last_released_event_ - boundary < 100)
790  {
791  logLevel = TLVL_WARNING;
792  processAnyway = true;
793  }
794  TLOG(logLevel) << "Subrun rollover requested for event that is in the past. (last_released_event="
795  << last_released_event_ << ",requested_rollover_boundary=" << boundary << ").";
796  if (!processAnyway) return;
797  }
798  TLOG(TLVL_INFO) << "Will roll over when I reach Sequence ID " << boundary;
799 
800  // JCF, May-11-2018
801 
802  // subrun_rollover_event_ is used in check_pending_buffers to
803  // trigger an endSubrun()/startSubrun(), but if the last event
804  // sent was right before the boundary we might as well switch
805  // to the new subrun here
806 
807  if (boundary == last_released_event_ + 1) {
808  TLOG(TLVL_INFO) << "rolloverSubrun: Last released event had sequence id " << last_released_event_ << \
809  ", boundary is sequence id " << boundary << ", so will start a new subrun here";
810  endSubrun();
811  startSubrun();
// Disarm the pending-buffer rollover check; the switch already happened.
812  subrun_rollover_event_ = std::numeric_limits<sequence_id_t>::max();
813  }
814  else {
815  subrun_rollover_event_ = boundary;
816  }
817 }
818 
/// Metrics/reporting body. NOTE(review): the signature line is absent from
/// this listing; from context this reports buffer-occupancy metrics and
/// periodically logs the list of incomplete events — confirm the name.
820 {
821  if (metricMan)
822  {
823  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
824  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
825  }
826 
827  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
828  {
// Rate-limit the incomplete-event report to the configured interval.
829  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
830  return;
831 
832  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
833  std::ostringstream oss;
834  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
// List each incomplete event's sequence ID and its current fragment count.
835  for (auto& ev : active_buffers_)
836  {
837  auto hdr = getEventHeader_(ev);
838  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
839  }
840  TLOG(TLVL_DEBUG) << oss.str();
841  }
842 }
843 
/// Copy the given Fragment into the next free broadcast shared-memory buffer,
/// prefixed by a RawEventHeader carrying the current run/subrun IDs.
/// Waits up to broadcast_timeout_ms_ (polling every 10 ms) for a buffer.
/// In every path the Fragment is handed back to the caller via outFrag (swap),
/// so the caller retains ownership whether or not the broadcast succeeded.
/// Returns true if the Fragment was queued for broadcast, false on timeout.
844 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
845 {
846  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
 // First non-blocking attempt to claim a write buffer in the broadcast segment.
847  auto buffer = broadcasts_.GetBufferForWriting(false);
848  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
 // Poll (10 ms sleep between attempts) until a buffer frees up or the
 // configured broadcast timeout expires.
849  auto start_time = std::chrono::steady_clock::now();
850  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
851  {
852  usleep(10000);
853  buffer = broadcasts_.GetBufferForWriting(false);
854  }
855  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
856  if (buffer == -1)
857  {
858  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
 // Return the Fragment to the caller even on failure.
859  outFrag.swap(frag);
860  return false;
861  }
862 
 // Write the event header ahead of the Fragment payload; the broadcast is
 // always marked complete since it carries exactly one Fragment.
863  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
864  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
865  hdr->run_id = run_id_;
866  hdr->subrun_id = subrun_id_;
867  hdr->sequence_id = frag->sequenceID();
868  hdr->is_complete = true;
869  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
870 
871  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
872  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
873 
 // NOTE(review): destination -1 presumably means "visible to all attached
 // readers" (broadcast) — confirm against SharedMemoryManager::MarkBufferFull.
874  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
875  broadcasts_.MarkBufferFull(buffer, -1);
876  outFrag.swap(frag);
877  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
878  return true;
879 }
880 
881 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
882 {
883  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
884 }
885 
/// Find (or, if create_new is set, allocate and initialize) the shared-memory
/// buffer holding the event with the given sequence ID.
/// Returns the buffer index, -1 if none is available, or -2 if the event was
/// already released to art as incomplete (duplicate-event guard).
/// Holds sequence_id_mutex_ for the whole lookup/creation.
886 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
887 {
888  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
889  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
890 
891  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
892 
 // Fast path: an event with this sequence ID is already being assembled.
893  auto buffers = GetBuffersOwnedByManager();
894  for (auto& buf : buffers)
895  {
896  auto hdr = getEventHeader_(buf);
897  if (hdr->sequence_id == seqID)
898  {
899  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
900  return buf;
901  }
902  }
903 
 // Reject fragments for events already shipped to art as incomplete: art
 // cannot accept a second event with the same sequence ID.
904 #if !ART_SUPPORTS_DUPLICATE_EVENTS
905  if (released_incomplete_events_.count(seqID))
906  {
907  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
908  return -2;
909  }
910 #endif
911 
912  if (!create_new) return -1;
913 
 // Try to free up space (release pending/stale buffers) before allocating;
 // lk is already held, as check_pending_buffers_ requires.
914  check_pending_buffers_(lk);
915  int new_buffer = GetBufferForWriting(false);
916 
 // Second attempt may steal the oldest buffer when overwrite_mode_ is set.
917  if (new_buffer == -1)
918  {
919  new_buffer = GetBufferForWriting(overwrite_mode_);
920  }
921 
922  if (new_buffer == -1) return -1;
923  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
924  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
925  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
926  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
 // Initialize the event header for the fresh buffer.
927  auto hdr = getEventHeader_(new_buffer);
928  hdr->is_complete = false;
929  hdr->run_id = run_id_;
930  hdr->subrun_id = subrun_id_;
 // Event number comes from either the sequence ID or the fragment timestamp,
 // depending on configuration (use_sequence_id_for_event_number).
931  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
932  hdr->sequence_id = seqID;
933  buffer_writes_pending_[new_buffer] = 0;
934  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
935  SetMFIteration("Sequence ID " + std::to_string(seqID));
936 
937  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
938  active_buffers_.insert(new_buffer);
939  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
940  << size() << ","
941  << ReadReadyCount() << ","
942  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
943  << WriteReadyCount(false) << ","
944  << pending_buffers_.size() << ","
945  << active_buffers_.size() << ")";
946 
 // Notify the RequestSender of the new event. AddRequest itself triggers a
 // send, so SendRequest is only called on the no-timestamp path.
947  if (requests_)
948  {
949  if (timestamp != Fragment::InvalidTimestamp)
950  {
951  requests_->AddRequest(seqID, timestamp);
952  }
953  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
954  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
955  else
956  {
957  requests_->SendRequest();
958  }
959  }
960  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
961  return new_buffer;
962 }
963 
/// Report whether the given buffer already contains Fragment data beyond the
/// RawEventHeader. Conservatively returns true for the sentinel buffer (-1)
/// and for buffers that are no longer in the Writing state.
/// NOTE(review): this resets the buffer's read position as a side effect —
/// callers presumably rely on the buffer being in the Writing state here.
964 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
965 {
966  if (buffer == -1) return true;
967  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
968  {
969  return true;
970  }
 // Skip past the event header, then check whether any payload follows.
971  ResetReadPos(buffer);
972  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
973  return MoreDataInBuffer(buffer);
974 }
975 
/// Called when a Fragment write finishes: if the event in the buffer is now
/// complete, move the buffer from the "active" set to the "pending" set
/// (under sequence_id_mutex_) and cancel any outstanding data request for it.
/// Always ends by running CheckPendingBuffers to release eligible events to art.
976 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
977 {
978  auto hdr = getEventHeader_(buffer);
979  if (hdr->is_complete)
980  {
981  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
982 
 // Scope limits how long sequence_id_mutex_ is held.
983  {
984  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
985 
986  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
987  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
988  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
989  active_buffers_.erase(buffer);
990  pending_buffers_.insert(buffer);
991 
992  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
993  << size() << ","
994  << ReadReadyCount() << ","
995  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
996  << WriteReadyCount(false) << ","
997  << pending_buffers_.size() << ","
998  << active_buffers_.size() << ")";
999  }
 // The event is complete, so no further data requests are needed for it.
1000  if (requests_)
1001  {
1002  requests_->RemoveRequest(hdr->sequence_id);
1003  }
1004  }
 // Re-acquires sequence_id_mutex_ internally; must not be called with it held.
1005  CheckPendingBuffers();
1006 }
1007 
1008 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1009 {
1010  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1011 }
1012 
// CheckPendingBuffers(): public entry point — acquires sequence_id_mutex_ and
// delegates to check_pending_buffers_, which requires the lock to be held.
// (NOTE(review): the signature line is not visible in this rendering; per the
// class documentation this is void artdaq::SharedMemoryEventManager::CheckPendingBuffers().)
1014 {
1015  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1016  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1017  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1018  check_pending_buffers_(lk);
1019 }
1020 
/// Core housekeeping pass (caller must hold sequence_id_mutex_, passed as `lock`):
///  1. Detect stale/incomplete active buffers and schedule them for release to art.
///  2. Release pending buffers to art in sequence-ID order, honoring subrun
///     rollover boundaries and the lowest sequence ID still being assembled.
///  3. Send Routing Tokens for newly-available buffers.
///  4. Periodically (<= 2 Hz) publish shared-memory and event-rate metrics.
1021 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1022 {
1023  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1024 
 // --- Step 1: find timed-out ("stale") buffers still marked active. ---
1025  auto buffers = GetBuffersOwnedByManager();
1026  for (auto buf : buffers)
1027  {
1028  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1029  {
1030  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1031  auto hdr = getEventHeader_(buf);
 // Only move the buffer once in-flight writes have drained (or we are stopping).
1032  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1033  {
1034  if (requests_)
1035  {
1036  requests_->RemoveRequest(hdr->sequence_id);
1037  }
1038  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1039  active_buffers_.erase(buf);
1040  pending_buffers_.insert(buf);
1041  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1042  << size() << ","
1043  << ReadReadyCount() << ","
1044  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1045  << WriteReadyCount(false) << ","
1046  << pending_buffers_.size() << ","
1047  << active_buffers_.size() << ")";
1048 
1049  subrun_incomplete_event_count_++;
1050  run_incomplete_event_count_++;
1051  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
 // Track how many Fragments each incomplete event is still missing.
 // NOTE(review): the else-branch subtraction could underflow if more
 // Fragments arrive than were originally missing — confirm the intended
 // semantics of released_incomplete_events_.
1052  if (!released_incomplete_events_.count(hdr->sequence_id))
1053  {
1054  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1055  }
1056  else
1057  {
1058  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1059  }
1060  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1061  }
1062 
1063  }
1064  }
1065 
 // --- Step 2: determine the release horizon. Pending events with sequence IDs
 // above the lowest still-active one are held back unless the segment has no
 // free write buffers (then lowestSeqId stays at InvalidSequenceID = no limit).
1066  Fragment::sequence_id_t lowestSeqId = Fragment::InvalidSequenceID;
1067 
1068  // Only use "weak ordering" when buffers are available for writing
1069  if (ReadyForWrite(false))
1070  {
1071  for (auto buf : active_buffers_)
1072  {
1073  auto hdr = getEventHeader_(buf);
1074  TLOG(TLVL_TRACE) << "Buffer: " << buf << ", SeqID: " << hdr->sequence_id << ", ACTIVE";
1075  if (hdr->sequence_id < lowestSeqId)
1076  {
1077  lowestSeqId = hdr->sequence_id;
1078  }
1079  }
1080  TLOG(TLVL_TRACE) << "Lowest SeqID held: " << lowestSeqId;
1081  }
1082 
 // Release pending buffers to art in ascending sequence-ID order.
1083  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1084  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1085 
1086  auto counter = 0;
1087  double eventSize = 0;
1088  for (auto buf : sorted_buffers)
1089  {
1090  auto hdr = getEventHeader_(buf);
 // Stop at the horizon: don't release past an event still being built.
1091  if (hdr->sequence_id > lowestSeqId) break;
1092 
 // Perform the subrun rollover scheduled by rolloverSubrun(), if reached.
1093  if (hdr->sequence_id >= subrun_rollover_event_)
1094  {
1095  TLOG(TLVL_INFO) << "Subrun rollover reached at event " << hdr->sequence_id << " (boundary=" << subrun_rollover_event_ << "), last released event is " << last_released_event_ << ".";
1096  endSubrun();
1097  startSubrun();
1098  }
1099  if (hdr->sequence_id > last_released_event_) last_released_event_ = hdr->sequence_id;
1100 
1101  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1102  << "event_size=" << BufferDataSize(buf) << ", buffer_size=" << BufferSize();
1103 
1104  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1105  MarkBufferFull(buf);
1106  subrun_event_count_++;
1107  run_event_count_++;
1108  counter++;
1109  eventSize += BufferDataSize(buf);
1110  pending_buffers_.erase(buf);
1111  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1112  << size() << ","
1113  << ReadReadyCount() << ","
1114  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1115  << WriteReadyCount(false) << ","
1116  << pending_buffers_.size() << ","
1117  << active_buffers_.size() << ")";
1118  }
1119 
 // --- Step 3: issue Routing Tokens for buffers now free for writing. ---
 // NOTE(review): if run_event_count_ ever exceeds GetSentTokenCount(), this
 // subtraction could wrap if the operands are unsigned — confirm types.
1120  if (requests_)
1121  {
1122  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1123  auto available_buffers = WriteReadyCount(overwrite_mode_);
1124 
1125  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1126  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1127 
1128  if (available_buffers > outstanding_tokens)
1129  {
1130  auto tokens_to_send = available_buffers - outstanding_tokens;
1131 
1132  while (tokens_to_send > 0)
1133  {
1134  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1135  requests_->SendRoutingToken(1);
1136  tokens_to_send--;
1137  }
1138  }
1139  }
1140 
 // --- Step 4: accumulate and (rate-limited) publish metrics. ---
1141  metric_data_.event_count += counter;
1142  metric_data_.event_size += eventSize;
1143 
1144  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1145  {
1146  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1147  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1148  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1149  metric_data_ = MetricData();
1150 
1151  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1152  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1153  metricMan->sendMetric("Events Released to art this subrun", subrun_event_count_, "Events", 2, MetricMode::LastPoint);
1154  metricMan->sendMetric("Incomplete Events Released to art this subrun", subrun_incomplete_event_count_, "Events", 2, MetricMode::LastPoint);
1155  if(requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1156 
 // Snapshot buffer-state distribution for occupancy metrics.
1157  auto bufferReport = GetBufferReport();
1158  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
1159  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
1160  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
1161  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
1162  auto total = size();
1163  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1164 
1165  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1166  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1167  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1168  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1169  if(total > 0)
1170  {
1171  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1172  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1173  }
1174 
1175  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1176  }
1177  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1178 }
1179 
/// Broadcast the stored Init Fragment (art configuration preamble) to all art
/// subprocesses via the broadcast shared-memory segment. If no Init Fragment
/// has been received yet, warns (only when sending is enabled) and does nothing.
1180 void artdaq::SharedMemoryEventManager::send_init_frag_()
1181 {
1182  if (init_fragment_ != nullptr)
1183  {
1184  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1185 
 // Debugging aid (disabled): dump the init message payload to a file.
1186 #if 0
1187  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1188  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1189  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1190  ostream.close();
1191 #endif
1192 
 // broadcastFragment_ swaps the Fragment back into its outFrag argument in
 // every path, so init_fragment_ is restored after this call (moved out,
 // then swapped back in) and remains available for future broadcasts.
1193  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1194  TLOG(TLVL_TRACE) << "Init Fragment sent";
1195  }
1196  else if (send_init_fragments_)
1197  {
1198  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1199  }
1200 }
1201 
// SetInitFragment(FragmentPtr frag): store the first Init Fragment received and
// immediately broadcast it to the art subprocesses; later calls are ignored
// (first-wins). (NOTE(review): the signature line is not visible in this
// rendering; per the class documentation this is
// void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag).
// The two operands of the condition below are redundant — `!init_fragment_`
// already covers the nullptr comparison.)
1203 {
1204  if (!init_fragment_ || init_fragment_ == nullptr)
1205  {
1206  init_fragment_.swap(frag);
1207  send_init_frag_();
1208  }
1209 }
1210 
// UpdateArtConfiguration(fhicl::ParameterSet art_pset): cache the art
// configuration and regenerate the on-disk art config file, but only when the
// ParameterSet actually changed or no config file exists yet. (NOTE(review):
// the signature line is not visible in this rendering; per the class
// documentation this is
// void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset).)
1212 {
1213  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1214  if (art_pset != current_art_pset_ || !current_art_config_file_)
1215  {
1216  current_art_pset_ = art_pset;
1217  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
1218  }
1219  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1220 }
1221 
1222 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
1223 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
1224 #endif
void RunArt(std::shared_ptr< art_config_file > config_file, std::shared_ptr< std::atomic< pid_t >> pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
void ShutdownArtProcesses(std::set< pid_t > &pids)
Shutdown a set of art processes.
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
The SharedMemoryEventManager is a SharedMemoryManager which tracks events as they are built...
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
The RequestSender contains methods used to send data requests and Routing tokens. ...
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void UpdateArtConfiguration(fhicl::ParameterSet art_pset)
Updates the internally-stored copy of the art configuration.
void StartArt()
Start all the art processes.
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
void rolloverSubrun(sequence_id_t boundary)
Rollover the subrun after the specified event.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
void startSubrun()
Start a new Subrun, incrementing the subrun number.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endSubrun()
Send an EndOfSubRunFragment to the art thread.
bool endRun()
Send an EndOfRunFragment to the art thread.
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
bool endOfData()
Indicate that the end of input has been reached to the art processes.
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.
void CheckPendingBuffers()
Check for buffers which are ready to be marked incomplete and released to art and issue tokens for an...