artdaq  v3_08_00
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 #include "artdaq-core/Core/StatisticsCollection.hh"
5 #include "artdaq-core/Utilities/TraceLock.hh"
6 
7 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
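// build_key folds the partition number and the low 16 bits of the PID into the
// given seed, so that shared memory keys from different partitions (and, via the
// PID bits, different processes) are unlikely to collide.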
12 #define build_key(seed) seed + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF)
13 
14 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
15 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
16 const std::string artdaq::SharedMemoryEventManager::
17  FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
18 const std::string artdaq::SharedMemoryEventManager::
19  EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
20 
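// The main shared memory segment is sized from "max_event_size_bytes" if given;
// otherwise from expected_fragments_per_event * max_fragment_size_bytes. A second,
// smaller segment ("broadcasts_") carries Init, EndOfRun, and EndOfData Fragments
// to the attached art processes.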
21 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
22  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", build_key(0xEE000000)),
23  pset.get<size_t>("buffer_count"),
24  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
25  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
26  !pset.get<bool>("broadcast_mode", false))
27  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
28  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
29  , queue_size_(pset.get<size_t>("buffer_count"))
30  , run_id_(0)
31  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
32  , max_event_list_length_(pset.get<size_t>("max_event_list_length", 100))
33  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
34  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
35  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
36  , init_fragment_count_(pset.get<size_t>("init_fragment_count", pset.get<bool>("send_init_fragments", true) ? 1 : 0))
37  , running_(false)
38  , buffer_writes_pending_()
39  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
40  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
41  , last_backpressure_report_time_(std::chrono::steady_clock::now())
42  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
43  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
44  , run_event_count_(0)
45  , run_incomplete_event_count_(0)
46  , subrun_event_count_(0)
47  , subrun_incomplete_event_count_(0)
48  , oversize_fragment_count_(0)
49  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
50  , art_processes_()
51  , restart_art_(false)
52  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
53  , manual_art_(pset.get<bool>("manual_art", false))
54  , current_art_pset_(art_pset)
55  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
56  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
57  , requests_(nullptr)
58  , data_pset_(pset)
59  , dropped_data_()
60  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", build_key(0xBB000000)),
61  pset.get<size_t>("broadcast_buffer_count", 10),
62  pset.get<size_t>("broadcast_buffer_size", 0x100000),
63  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
64 {
65  subrun_event_map_[0] = 1;
66  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
67  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
68 
69  if (pset.get<bool>("use_art", true) == false)
70  {
71  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
72  num_art_processes_ = 0;
73  }
74  else
75  {
76  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
77  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
78  }
79  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
80 
81  if (overwrite_mode_ && num_art_processes_ > 0)
82  {
 83  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
84  }
85  else if (overwrite_mode_)
86  {
87  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
88  }
89 
90  for (size_t ii = 0; ii < size(); ++ii)
91  {
92  buffer_writes_pending_[ii] = 0;
93  }
94 
95  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
96 
97  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
98  SetRank(my_rank);
99  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
100 
 100 
 101  statsHelper_.addMonitoredQuantityName(FRAGMENTS_RECEIVED_STAT_KEY);
 102  statsHelper_.addMonitoredQuantityName(EVENTS_RELEASED_STAT_KEY);
 103 
104  // fetch the monitoring parameters and create the MonitoredQuantity instances
105  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
106 
107  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
108 }
109 
 110 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
 111 {
112  TLOG(TLVL_TRACE) << "DESTRUCTOR";
113  if (running_) endOfData();
114  TLOG(TLVL_TRACE) << "Destructor END";
115 }
116 
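// AddFragment copies a Fragment that is already contiguous in memory into the
// shared memory buffer for its sequence ID. A return value of false means "no
// buffer available, try again"; true is returned both on success and when the
// Fragment is dropped because its event has already been released to art.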
117 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
118 {
119  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
120  << ", sequence_id=" << frag.sequence_id;
121  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
122  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
123  if (buffer == -1) return false;
124  if (buffer == -2)
125  {
126  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
127  return true;
128  }
129 
130  auto hdr = getEventHeader_(buffer);
131  if (update_run_ids_)
132  {
133  hdr->run_id = run_id_;
134  }
135  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
136 
137  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
138  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
139 
140  TLOG(TLVL_TRACE) << "Checking for complete event";
141  auto fragmentCount = GetFragmentCount(frag.sequence_id);
142  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
143  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
144  << ", fragmentCount=" << fragmentCount
145  << ", num_fragments_per_event=" << num_fragments_per_event_
146  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
147 
148  complete_buffer_(buffer);
149  if (requests_) requests_->SendRequest(true);
150 
151  TLOG(TLVL_TRACE) << "AddFragment END";
152  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
153  return true;
154 }
155 
156 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
157 {
158  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
159  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
160  auto data = frag->headerAddress();
161  auto start = std::chrono::steady_clock::now();
162  bool sts = false;
163  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
164  {
165  sts = AddFragment(hdr, data);
166  if (!sts) usleep(1000);
167  }
168  if (!sts)
169  {
170  outfrag = std::move(frag);
171  }
172  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
173  return sts;
174 }
175 
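// WriteFragmentHeader is the zero-copy counterpart to AddFragment: it writes only
// the Fragment header into shared memory and returns a pointer to the reserved
// payload region, which the caller fills and then confirms with DoneWritingFragment.
// When no buffer is available (and drops are allowed) or the payload does not fit,
// the returned pointer instead targets a scratch Fragment in dropped_data_, so the
// caller can still complete its write.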
176 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
177 {
178  TLOG(14) << "WriteFragmentHeader BEGIN";
179  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
180 
181  if (buffer < 0)
182  {
183  if (buffer == -1 && !dropIfNoBuffersAvailable)
184  {
185  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
186  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
187  {
188  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
189  last_backpressure_report_time_ = std::chrono::steady_clock::now();
190  }
191  return nullptr;
192  }
193  if (buffer == -2)
194  {
195  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
196  }
197  else
198  {
199  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
200  }
201  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
202 
203  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
204  return dropped_data_[frag.fragment_id]->dataBegin();
205  }
206 
207  last_backpressure_report_time_ = std::chrono::steady_clock::now();
208  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
209  // Increment this as soon as we know we want to use the buffer
210  buffer_writes_pending_[buffer]++;
211 
212  if (metricMan)
213  {
214  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
215  }
216 
217  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
218 
219  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
220 
221  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
222 
223  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
224  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
225  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
226 
227  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
228  if (frag.word_count - frag.num_words() > 0)
229  {
230  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
231 
232  if (!sts)
233  {
234  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
235  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
236  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
237  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
238 
239  oversize_fragment_count_++;
240 
241  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
242  {
243  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
244  }
245 
246  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
247  return dropped_data_[frag.fragment_id]->dataBegin();
248  }
249  }
250  TLOG(14) << "WriteFragmentHeader END";
251  return pos;
252 }
253 
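// DoneWritingFragment closes out a WriteFragmentHeader call: it decrements the
// pending-write count for the buffer and, once the last concurrent writer has
// finished, checks whether the event now has its full complement of Fragments.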
254 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
255 {
256  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
257  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
258  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
259  if (buffer == -2) { return; }
260 
261  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
262  {
263  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
264 
265  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
266 
267  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
268 
269  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
270 
271  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
272  auto hdr = getEventHeader_(buffer);
273  if (update_run_ids_)
274  {
275  hdr->run_id = run_id_;
276  }
277  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
278 
279  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
280  TouchBuffer(buffer);
281 
282  buffer_writes_pending_[buffer]--;
283  if (buffer_writes_pending_[buffer] != 0)
284  {
285  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
286  return;
287  }
288  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
289  auto frag_count = GetFragmentCount(frag.sequence_id);
290  hdr->is_complete = frag_count >= num_fragments_per_event_;
291 
292  if (frag_count > num_fragments_per_event_)
293  {
294  TLOG(TLVL_WARNING) << "DoneWritingFragment: This Event has more Fragments ( " << frag_count << " ) than specified in configuration ( " << num_fragments_per_event_ << " )!"
295  << " This is probably due to a misconfiguration and is *not* a reliable mode!";
296  }
297 
298  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
299 #if ART_SUPPORTS_DUPLICATE_EVENTS
300  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
301  {
302  hdr->is_complete = frag_count >= released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
303  }
304 #endif
305  }
306 
307  complete_buffer_(buffer);
308  if (requests_) requests_->SendRequest(true);
309  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
310 }
311 
312 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
313 {
314  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
315 }
316 
317 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
318 {
319  if (buffer == -1) return 0;
320  ResetReadPos(buffer);
321  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
322 
323  size_t count = 0;
324 
325  while (MoreDataInBuffer(buffer))
326  {
327  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
328  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
329  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
330  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
331  ++count;
332  }
333 
334  return count;
335 }
336 
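// RunArt runs one art process to completion, looping while restart_art_ is set.
// In normal operation it forks and execs "art -c <config>"; in manual_art_ mode it
// prompts the operator to start art by hand and enter its PID. The child's exit
// status determines the log level and whether a restart is attempted.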
337 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
338 {
339  do
340  {
341  auto start_time = std::chrono::steady_clock::now();
342  send_init_frags_();
343  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
344 
345  pid_t pid = 0;
346 
347  if (!manual_art_)
348  {
349  char* filename = new char[config_file->getFileName().length() + 1];
350  strcpy(filename, config_file->getFileName().c_str());
351 
352 #if DEBUG_ART
353  std::string debugArgS = "--config-out=" + app_name + "_art.out";
354  char* debugArg = new char[debugArgS.length() + 1];
355  strcpy(debugArg, debugArgS.c_str());
356 
357  std::vector<char*> args{(char*)"art", (char*)"-c", filename, debugArg, NULL};
358 #else
359  std::vector<char*> args{(char*)"art", (char*)"-c", filename, NULL};
360 #endif
361 
362  pid = fork();
363  if (pid == 0)
364  { /* child */
365  // 23-May-2018, KAB: added the setting of the partition number env var
366  // in the environment of the child art process so that Globals.hh
367  // will pick it up there and provide it to the artdaq classes that
368  // are used in data transfers, etc. within the art process.
369  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
370  std::string envVarValue = std::to_string(GetPartitionNumber());
371  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
372  {
373  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
374  << "\" in the environment of a child art process. "
375  << "This may result in incorrect TCP port number "
376  << "assignments or other issues, and data may "
377  << "not flow through the system correctly.";
378  }
379  envVarKey = "ARTDAQ_APPLICATION_NAME";
380  envVarValue = app_name;
381  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
382  {
383  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
384  << "\" in the environment of a child art process. ";
385  }
386  envVarKey = "ARTDAQ_RANK";
387  envVarValue = std::to_string(my_rank);
388  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
389  {
390  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
391  << "\" in the environment of a child art process. ";
392  }
393 
394  execvp("art", &args[0]);
395  delete[] filename;
396  exit(1);
397  }
398  delete[] filename;
399  }
400  else
401  {
402  //Using cin/cout here to ensure console is active (artdaqDriver)
403  std::cout << "Please run the following command in a separate terminal:" << std::endl
404  << "art -c " << config_file->getFileName() << std::endl
405  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
406  << "Finally, return to this window and enter the pid: " << std::endl;
407  std::cin >> pid;
408  }
409  *pid_out = pid;
410 
411  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
412  {
413  std::unique_lock<std::mutex> lk(art_process_mutex_);
414  art_processes_.insert(pid);
415  }
416  siginfo_t status;
417  auto sts = waitid(P_PID, pid, &status, WEXITED);
418  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
419  {
420  std::unique_lock<std::mutex> lk(art_process_mutex_);
421  art_processes_.erase(pid);
422  }
423  if (sts < 0)
424  {
425  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
426  }
427  else if (status.si_code == CLD_EXITED && status.si_status == 0)
428  {
429  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
430  }
431  else
432  {
433  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
434  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
435 
436  auto exit_type = "exited with status code";
437  switch (status.si_code)
438  {
439  case CLD_DUMPED:
440  case CLD_KILLED:
441  exit_type = "was killed with signal";
442  break;
443  case CLD_EXITED:
444  default:
445  break;
446  }
447 
448  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
449  << "art process " << pid << " " << exit_type << " " << status.si_status
450  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
451  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
452  << (restart_art_ ? "restarting" : "not restarting");
453  }
454  } while (restart_art_);
455 }
456 
 457 void artdaq::SharedMemoryEventManager::StartArt()
 458 {
459  restart_art_ = always_restart_art_;
460  if (num_art_processes_ == 0) return;
461  for (size_t ii = 0; ii < num_art_processes_; ++ii)
462  {
463  StartArtProcess(current_art_pset_);
464  }
465 }
466 
 467 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
 468 {
469  static std::mutex start_art_mutex;
470  std::unique_lock<std::mutex> lk(start_art_mutex);
471  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
472  restart_art_ = always_restart_art_;
473  auto initialCount = GetAttachedCount();
474  auto startTime = std::chrono::steady_clock::now();
475 
476  if (pset != current_art_pset_ || !current_art_config_file_)
477  {
478  current_art_pset_ = pset;
479  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
480  }
481  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
482  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
483  thread.detach();
484 
485  auto currentCount = GetAttachedCount() - initialCount;
486  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
487  {
488  usleep(10000);
489  currentCount = GetAttachedCount() - initialCount;
490  }
491  if ((currentCount < 1 || *pid <= 0) && manual_art_)
492  {
 493  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
494  return 0;
495  }
496  else if (currentCount < 1 || *pid <= 0)
497  {
498  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
499  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
500  return 0;
501  }
502  else
503  {
504  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
505  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
506 
507  return *pid;
508  }
509 }
510 
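// ShutdownArtProcesses escalates gradually: first wait for the art processes to
// drain and exit on their own, then send SIGQUIT, then SIGINT, and finally SIGKILL.
// The wait at each stage scales with expected_art_event_processing_time_us and the
// number of shared memory buffers.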
 511 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
 512 {
513  restart_art_ = false;
514  //current_art_config_file_ = nullptr;
515  //current_art_pset_ = fhicl::ParameterSet();
516 
517  auto check_pids = [&](bool print) {
518  std::unique_lock<std::mutex> lk(art_process_mutex_);
519  for (auto pid = pids.begin(); pid != pids.end();)
520  {
521  // 08-May-2018, KAB: protect against killing invalid PIDS
522 
523  if (*pid <= 0)
524  {
525  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
526  << ") from the shutdown list.";
527  pid = pids.erase(pid);
528  }
529  else if (kill(*pid, 0) < 0)
530  {
531  pid = pids.erase(pid);
532  }
533  else
534  {
535  if (print) std::cout << *pid << " ";
536  ++pid;
537  }
538  }
539  };
540  auto count_pids = [&]() {
541  std::unique_lock<std::mutex> lk(art_process_mutex_);
542  return pids.size();
543  };
544  check_pids(false);
545  if (count_pids() == 0)
546  {
547  TLOG(14) << "All art processes already exited, nothing to do.";
548  usleep(1000);
549  return;
550  }
551 
552  if (!manual_art_)
553  {
554  int graceful_wait_ms = art_event_processing_time_us_ * size() * 10 / 1000;
555  int gentle_wait_ms = art_event_processing_time_us_ * size() * 2 / 1000;
556  int int_wait_ms = art_event_processing_time_us_ * size() / 1000;
557  auto shutdown_start = std::chrono::steady_clock::now();
558 
559  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
560  for (int ii = 0; ii < graceful_wait_ms; ++ii)
561  {
562  usleep(1000);
563 
564  check_pids(false);
565  if (count_pids() == 0)
566  {
567  TLOG(TLVL_TRACE) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
568  return;
569  }
570  }
571 
572  {
573  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
574  std::unique_lock<std::mutex> lk(art_process_mutex_);
575  for (auto pid : pids)
576  {
577  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
578  kill(pid, SIGQUIT);
579  }
580  }
581 
582  TLOG(TLVL_TRACE) << "Waiting up to " << gentle_wait_ms << " ms for all art processes to exit from SIGQUIT";
583  for (int ii = 0; ii < gentle_wait_ms; ++ii)
584  {
585  usleep(1000);
586 
587  check_pids(false);
588  if (count_pids() == 0)
589  {
590  TLOG(TLVL_TRACE) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
591  return;
592  }
593  }
594 
595  {
596  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
597  std::unique_lock<std::mutex> lk(art_process_mutex_);
598  for (auto pid : pids)
599  {
600  kill(pid, SIGINT);
601  }
602  }
603 
604  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit from SIGINT";
605  for (int ii = 0; ii < int_wait_ms; ++ii)
606  {
607  usleep(1000);
608 
609  check_pids(false);
610 
611  if (count_pids() == 0)
612  {
613  TLOG(TLVL_TRACE) << "All art processes exited after " << TimeUtils::GetElapsedTimeMilliseconds(shutdown_start) << " ms.";
614  return;
615  }
616  }
617 
 618  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
619  while (count_pids() > 0)
620  {
621  {
622  std::unique_lock<std::mutex> lk(art_process_mutex_);
623  kill(*pids.begin(), SIGKILL);
624  usleep(1000);
625  }
626  check_pids(false);
627  }
628  }
629  else
630  {
631  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
632  while (count_pids() > 0)
633  {
634  std::cout << "The following PIDs are running: ";
635  check_pids(true);
636  std::cout << std::endl;
637  std::string ignored;
638  std::cin >> ignored;
639  }
640  }
641 }
642 
643 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
644 {
645  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
646  if (restart_art_ || !always_restart_art_) // Art is running
647  {
648  endOfData();
649  }
650  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
651  {
652  broadcasts_.MarkBufferEmpty(ii, true);
653  }
654  if (newRun == 0) newRun = run_id_ + 1;
655 
656  if (art_pset != current_art_pset_ || !current_art_config_file_)
657  {
658  current_art_pset_ = art_pset;
659  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
660  }
661 
662  if (n_art_processes != -1)
663  {
664  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
665  num_art_processes_ = n_art_processes;
666  }
667  startRun(newRun);
668  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
669 }
670 
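// endOfData flushes any stale incomplete events, waits for art to drain the
// remaining full buffers, broadcasts an EndOfData Fragment, shuts down the art
// processes, and finally clears the data buffers and the RequestSender.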
 671 bool artdaq::SharedMemoryEventManager::endOfData()
 672 {
673  running_ = false;
674  init_fragments_.clear();
675  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
676  restart_art_ = false;
677 
678  size_t initialStoreSize = GetIncompleteEventCount();
679  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
680  << " stale events from the SharedMemoryEventManager.";
681  int counter = initialStoreSize;
682  while (active_buffers_.size() > 0 && counter > 0)
683  {
684  complete_buffer_(*active_buffers_.begin());
685  counter--;
686  }
687  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
688  << " stale events in the SharedMemoryEventManager.";
689 
690  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
691  auto start = std::chrono::steady_clock::now();
692  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
693  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
694 
695  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
696 
697  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
698  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
699  {
700  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
701  if (temp != lastReadCount)
702  {
703  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
704  lastReadCount = temp;
705  start = std::chrono::steady_clock::now();
706  }
707  if (lastReadCount > 0)
708  {
709  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
710  usleep(outstanding_buffer_wait_time);
711  }
712  }
713 
714  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
715  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
716 
717  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
718  FragmentPtrs broadcast;
719  broadcast.emplace_back(Fragment::eodFrag(GetBufferCount()));
720  bool success = broadcastFragments_(broadcast);
721  if (!success)
722  {
723  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
724  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
725  {
726  broadcasts_.MarkBufferEmpty(ii, true);
727  }
728  broadcastFragments_(broadcast);
729  }
730  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
731  while (get_art_process_count_() > 0)
732  {
733  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
734 
735  ShutdownArtProcesses(art_processes_);
736  }
737  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
738 
739  ResetAttachedCount();
740 
741  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
742  for (size_t ii = 0; ii < size(); ++ii)
743  {
744  MarkBufferEmpty(ii, true);
745  }
746  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
747  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
748  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
749  // {
750  // broadcasts_.MarkBufferEmpty(ii, true);
751  // }
752  released_events_.clear();
753  released_incomplete_events_.clear();
754 
755  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender";
756  requests_.reset(nullptr);
757 
758  TLOG(TLVL_DEBUG) << "endOfData END";
759  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
760  return true;
761 }
762 
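// startRun resets per-run state, clears the broadcast buffers, (re)starts the art
// processes, and creates a fresh RequestSender primed with routing tokens for every
// available buffer.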
 763 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
 764 {
765  running_ = true;
766  init_fragments_.clear();
767  statsHelper_.resetStatistics();
768  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
769  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
770  {
771  broadcasts_.MarkBufferEmpty(ii, true);
772  }
773  released_events_.clear();
774  released_incomplete_events_.clear();
775  StartArt();
776  run_id_ = runID;
777  {
778  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
779  subrun_event_map_.clear();
780  subrun_event_map_[0] = 1;
781  }
782  run_event_count_ = 0;
783  run_incomplete_event_count_ = 0;
784  requests_.reset(new RequestSender(data_pset_));
785  if (requests_)
786  {
787  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
788  requests_->SendRoutingToken(queue_size_, run_id_);
789  }
790  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
791  << ", max queue size = "
792  << queue_size_
793  << ", queue size = "
794  << GetLockedBufferCount();
795  if (metricMan)
796  {
797  metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint);
798  }
799 }
800 
 801 bool artdaq::SharedMemoryEventManager::endRun()
 802 {
803  TLOG(TLVL_INFO) << "Ending run " << run_id_;
804  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
805  static_cast<double>(sizeof(Fragment::value_type))))));
806 
807  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
808  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
809  *endOfRunFrag->dataBegin() = my_rank;
810  FragmentPtrs broadcast;
811  broadcast.emplace_back(std::move(endOfRunFrag));
812  broadcastFragments_(broadcast);
813 
814  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
815  run_event_count_ = 0;
816  run_incomplete_event_count_ = 0;
817  oversize_fragment_count_ = 0;
818  {
819  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
820  subrun_event_map_.clear();
821  subrun_event_map_[0] = 1;
822  }
823  return true;
824 }
825 
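// Subrun boundaries are recorded as (first sequence ID, subrun number) pairs in
// subrun_event_map_; GetSubrunForSequenceID walks this map to assign events to
// subruns. The map is trimmed to max_subrun_lookup_table_size entries.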
 826 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
 827 {
828  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
829  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
830 
831  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
832 
833  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
834  subrun_event_map_[boundary] = subrun;
835  while (subrun_event_map_.size() > max_subrun_event_map_length_)
836  {
837  subrun_event_map_.erase(subrun_event_map_.begin());
838  }
839 }
840 
 841 void artdaq::SharedMemoryEventManager::rolloverSubrun()
 842 {
843  Fragment::sequence_id_t seqID = 0;
844  subrun_id_t subrun = 0;
845  {
846  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
847  for (auto& it : subrun_event_map_)
848  {
849  if (it.first >= seqID) seqID = it.first + 1;
850  if (it.second >= subrun) subrun = it.second + 1;
851  }
852  }
853  rolloverSubrun(seqID, subrun);
854 }
855 
 856 void artdaq::SharedMemoryEventManager::sendMetrics()
 857 {
858  if (metricMan)
859  {
860  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
861  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
862  }
863 
864  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
865  {
866  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
867  return;
868 
869  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
870  std::ostringstream oss;
871  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
872  for (auto& ev : active_buffers_)
873  {
874  auto hdr = getEventHeader_(ev);
875  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
876  }
877  TLOG(TLVL_DEBUG) << oss.str();
878  }
879 }
880 
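// broadcastFragments_ writes the given Fragments into one buffer of the dedicated
// broadcast segment (waiting up to fragment_broadcast_timeout_ms for a free buffer)
// so that they are visible to every attached art process.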
881 bool artdaq::SharedMemoryEventManager::broadcastFragments_(FragmentPtrs& frags)
882 {
883  if (frags.size() == 0)
884  {
885  TLOG(TLVL_ERROR) << "Requested broadcast but no Fragments given!";
886  return false;
887  }
888  TLOG(TLVL_DEBUG) << "Broadcasting Fragments with seqID=" << frags.front()->sequenceID()
889  << ", type " << detail::RawFragmentHeader::SystemTypeToString(frags.front()->type())
890  << ", size=" << frags.front()->sizeBytes() << "B.";
891  auto buffer = broadcasts_.GetBufferForWriting(false);
892  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer 1st buffer=" << buffer;
893  auto start_time = std::chrono::steady_clock::now();
894  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
895  {
896  usleep(10000);
897  buffer = broadcasts_.GetBufferForWriting(false);
898  }
899  TLOG(TLVL_DEBUG) << "broadcastFragments_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
900  if (buffer == -1)
901  {
902  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frags.front()->typeString() << " failed due to timeout waiting for buffer!";
903  return false;
904  }
905 
906  TLOG(TLVL_DEBUG) << "broadcastFragments_: Filling in RawEventHeader";
907  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
908  hdr->run_id = run_id_;
909  hdr->subrun_id = GetSubrunForSequenceID(frags.front()->sequenceID());
910  hdr->sequence_id = frags.front()->sequenceID();
911  hdr->is_complete = true;
912  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
913 
914  for (auto& frag : frags)
915  {
916  TLOG(TLVL_DEBUG) << "broadcastFragments_ before Write calls";
917  if (frag->sequenceID() != hdr->sequence_id || frag->type() != frags.front()->type())
918  {
919  TLOG(TLVL_WARNING) << "Not sending fragment because its SequenceID or Type disagrees with leading Fragment";
920  continue;
921  }
922  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
923  }
924 
925  TLOG(TLVL_DEBUG) << "broadcastFragments_ Marking buffer full";
926  broadcasts_.MarkBufferFull(buffer, -1);
 927  TLOG(TLVL_DEBUG) << "broadcastFragments_ Complete";
928  return true;
929 }
930 
931 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
932 {
933  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
934 }
935 
 936 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
 937 {
938  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
939 
940  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
941  auto it = subrun_event_map_.begin();
942  subrun_id_t subrun = 1;
943 
 944  while (it != subrun_event_map_.end() && it->first <= seqID)
945  {
946  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
947  subrun = it->second;
948  ++it;
949  }
950 
951  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
952  return subrun;
953 }
954 
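// getBufferForSequenceID_ returns the buffer already holding this sequence ID, or
// (if create_new is set) claims and initializes a new one. Return codes: -1 means
// no buffer is available; -2 means the event was already released to art and the
// data should be dropped.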
955 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
956 {
957  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
958  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
959 
960  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
961 
962  auto buffers = GetBuffersOwnedByManager();
963  for (auto& buf : buffers)
964  {
965  auto hdr = getEventHeader_(buf);
966  if (hdr->sequence_id == seqID)
967  {
968  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
969  return buf;
970  }
971  }
972 
973 #if !ART_SUPPORTS_DUPLICATE_EVENTS
974  if (released_incomplete_events_.count(seqID))
975  {
976  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
977  return -2;
978  }
979  if (released_events_.count(seqID))
980  {
981  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been completed and released to art! Check configuration for inconsistent Fragment count per event!";
982  return -2;
983  }
984 #endif
985 
986  if (!create_new) return -1;
987 
988  check_pending_buffers_(lk);
989  int new_buffer = GetBufferForWriting(false);
990 
991  if (new_buffer == -1)
992  {
993  new_buffer = GetBufferForWriting(overwrite_mode_);
994  }
995 
996  if (new_buffer == -1) return -1;
997  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
998  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
999  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
1000  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
1001  auto hdr = getEventHeader_(new_buffer);
1002  hdr->is_complete = false;
1003  hdr->run_id = run_id_;
1004  hdr->subrun_id = GetSubrunForSequenceID(seqID);
1005  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
1006  hdr->sequence_id = seqID;
1007  buffer_writes_pending_[new_buffer] = 0;
1008  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
1009  SetMFIteration("Sequence ID " + std::to_string(seqID));
1010 
1011  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
1012  active_buffers_.insert(new_buffer);
1013  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1014  << size() << ","
1015  << ReadReadyCount() << ","
1016  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1017  << WriteReadyCount(false) << ","
1018  << pending_buffers_.size() << ","
1019  << active_buffers_.size() << ")";
1020 
1021  if (requests_)
1022  {
1023  if (timestamp != Fragment::InvalidTimestamp)
1024  {
1025  requests_->AddRequest(seqID, timestamp);
1026  }
1027  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
1028  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1029  else
1030  {
1031  requests_->SendRequest();
1032  }
1033  }
1034  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1035  return new_buffer;
1036 }
1037 
1038 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1039 {
1040  if (buffer == -1) return true;
1041  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1042  {
1043  return true;
1044  }
1045  ResetReadPos(buffer);
1046  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1047  return MoreDataInBuffer(buffer);
1048 }
1049 
1050 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1051 {
1052  auto hdr = getEventHeader_(buffer);
1053  if (hdr->is_complete)
1054  {
1055  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1056 
1057  {
1058  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1059 
1060  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1061  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1062  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1063  active_buffers_.erase(buffer);
1064  pending_buffers_.insert(buffer);
1065  released_events_.insert(hdr->sequence_id);
1066  while (released_events_.size() > max_event_list_length_)
1067  {
1068  released_events_.erase(released_events_.begin());
1069  }
1070 
1071  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1072  << size() << ","
1073  << ReadReadyCount() << ","
1074  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1075  << WriteReadyCount(false) << ","
1076  << pending_buffers_.size() << ","
1077  << active_buffers_.size() << ")";
1078  }
1079  if (requests_)
1080  {
1081  requests_->RemoveRequest(hdr->sequence_id);
1082  }
1083  }
1084  CheckPendingBuffers();
1085 }
1086 
1087 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1088 {
1089  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1090 }
1091 
 1092 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
 1093 {
1094  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1095  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1096  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1097  check_pending_buffers_(lk);
1098 }
1099 
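// check_pending_buffers_ performs the bookkeeping pass: stale active buffers are
// promoted to pending (and recorded as released-incomplete), pending buffers are
// released to art in sequence ID order, routing tokens are topped up to match the
// number of writable buffers, and statistics/metrics are reported. The caller must
// already hold sequence_id_mutex_.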
1100 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1101 {
1102  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1103 
1104  auto buffers = GetBuffersOwnedByManager();
1105  for (auto buf : buffers)
1106  {
1107  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1108  {
1109  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1110  auto hdr = getEventHeader_(buf);
1111  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1112  {
1113  if (requests_)
1114  {
1115  requests_->RemoveRequest(hdr->sequence_id);
1116  }
1117  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1118  active_buffers_.erase(buf);
1119  pending_buffers_.insert(buf);
1120  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1121  << size() << ","
1122  << ReadReadyCount() << ","
1123  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1124  << WriteReadyCount(false) << ","
1125  << pending_buffers_.size() << ","
1126  << active_buffers_.size() << ")";
1127 
1128  run_incomplete_event_count_++;
1129  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1130  if (!released_incomplete_events_.count(hdr->sequence_id))
1131  {
1132  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1133  }
1134  else
1135  {
1136  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1137  }
1138  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1139  }
1140  }
1141  }
1142 
1143  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1144  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1145 
1146  auto counter = 0;
1147  double eventSize = 0;
1148  for (auto buf : sorted_buffers)
1149  {
1150  auto hdr = getEventHeader_(buf);
1151  auto thisEventSize = BufferDataSize(buf);
1152 
1153  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1154  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1155  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1156 
1157  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1158  MarkBufferFull(buf);
1159  run_event_count_++;
1160  counter++;
1161  eventSize += thisEventSize;
1162  pending_buffers_.erase(buf);
1163  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1164  << size() << ","
1165  << ReadReadyCount() << ","
1166  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1167  << WriteReadyCount(false) << ","
1168  << pending_buffers_.size() << ","
1169  << active_buffers_.size() << ")";
1170  }
1171 
1172  if (requests_ && requests_->RoutingTokenSendsEnabled())
1173  {
1174  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1175  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1176  auto available_buffers = WriteReadyCount(overwrite_mode_);
1177 
1178  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1179  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1180 
1181  if (available_buffers > outstanding_tokens)
1182  {
1183  auto tokens_to_send = available_buffers - outstanding_tokens;
1184 
1185  while (tokens_to_send > 0)
1186  {
1187  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1188  requests_->SendRoutingToken(1, run_id_);
1189  tokens_to_send--;
1190  }
1191  }
1192  }
1193 
1194  if (statsHelper_.readyToReport())
1195  {
1196  std::string statString = buildStatisticsString_();
1197  TLOG(TLVL_INFO) << statString;
1198  }
1199 
1200  if (metricMan)
1201  {
1202  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1203  metricMan->sendMetric("Event Rate", counter, "Events", 1, MetricMode::Rate);
1204  metricMan->sendMetric("Data Rate", eventSize, "Bytes", 1, MetricMode::Rate);
1205  if (counter > 0)
1206  {
1207  metricMan->sendMetric("Average Event Size", eventSize / counter, "Bytes", 1, MetricMode::Average);
1208  }
1209 
1210  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1211  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1212  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1213 
1214  auto bufferReport = GetBufferReport();
1215  int full = 0, empty = 0, writing = 0, reading = 0;
1216  for (auto& buf : bufferReport)
1217  {
1218  switch (buf.second)
1219  {
1220  case BufferSemaphoreFlags::Full:
1221  full++;
1222  break;
1223  case BufferSemaphoreFlags::Empty:
1224  empty++;
1225  break;
1226  case BufferSemaphoreFlags::Writing:
1227  writing++;
1228  break;
1229  case BufferSemaphoreFlags::Reading:
1230  reading++;
1231  break;
1232  }
1233  }
1234  auto total = size();
1235  TLOG(15) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1236 
1237  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1238  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1239  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1240  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1241  if (total > 0)
1242  {
1243  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1244  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1245  }
1246  }
1247  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1248 }
1249 
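// send_init_frags_ broadcasts the serialized art Init message(s) once the configured
// init_fragment_count has been collected. If no init Fragments are expected (count
// of 0), an empty InitFragment is broadcast instead so that downstream readers know
// this is a Fragment-only stream.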
1250 void artdaq::SharedMemoryEventManager::send_init_frags_()
1251 {
1252  if (init_fragments_.size() >= init_fragment_count_ && init_fragment_count_ > 0)
1253  {
1254  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1255 
1256 #if 0
1257  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1258  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1259  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1260  ostream.close();
1261 #endif
1262 
1263  broadcastFragments_(init_fragments_);
1264  TLOG(TLVL_TRACE) << "Init Fragment sent";
1265  }
1266  else if (init_fragment_count_ > 0)
1267  {
1268  TLOG(TLVL_WARNING) << "Cannot send init fragments because I haven't yet received them! Set send_init_fragments to false or init_fragment_count to 0 if this process does not receive serialized art events to avoid potentially lengthy timeouts!";
1269  }
1270  else
1271  {
1272  // Send an empty Init Fragment so that ArtdaqInput knows that this is a pure-Fragment input
1273  artdaq::FragmentPtrs begin_run_fragments_;
1274  begin_run_fragments_.emplace_back(new artdaq::Fragment());
1275  begin_run_fragments_.back()->setSystemType(artdaq::Fragment::InitFragmentType);
1276  broadcastFragments_(begin_run_fragments_);
1277  }
1278 }
1279 
 1280 void artdaq::SharedMemoryEventManager::AddInitFragment(FragmentPtr& frag)
 1281 {
1282  init_fragments_.push_back(std::move(frag));
1283  send_init_frags_();
1284 }
1285 
 1286 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
 1287 {
1288  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1289  if (art_pset != current_art_pset_ || !current_art_config_file_)
1290  {
1291  current_art_pset_ = art_pset;
1292  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1293  }
1294  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1295 }
1296 
1297 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const
1298 {
1299  std::ostringstream oss;
1300  oss << app_name << " statistics:" << std::endl;
1301 
1302  artdaq::MonitoredQuantityPtr mqPtr =
1303  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1304  if (mqPtr.get() != 0)
1305  {
1306  artdaq::MonitoredQuantityStats stats;
1307  mqPtr->getStats(stats);
1308  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1309  << " events/sec, effective data rate = "
1310  << (stats.recentValueRate / 1024.0 / 1024.0)
1311  << " MB/sec, monitor window = " << stats.recentDuration
1312  << " sec, min::max event size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1313  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1314  if (stats.recentSampleRate > 0.0)
1315  {
1316  oss << " Average time per event: ";
1317  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1318  }
1319  }
1320 
1321  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1322  if (mqPtr.get() != 0)
1323  {
1324  artdaq::MonitoredQuantityStats stats;
1325  mqPtr->getStats(stats);
1326  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1327  << " fragments/sec, effective data rate = "
1328  << (stats.recentValueRate / 1024.0 / 1024.0)
1329  << " MB/sec, monitor window = " << stats.recentDuration
1330  << " sec, min::max fragment size = " << (stats.recentValueMin / 1024.0 / 1024.0)
1331  << "::" << (stats.recentValueMax / 1024.0 / 1024.0) << " MB" << std::endl;
1332  }
1333 
1334  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1335  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1336  << std::endl;
1337  return oss.str();
1338 }