artdaq  v3_06_02
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 #include "artdaq-core/Core/StatisticsCollection.hh"
5 #include "artdaq-core/Utilities/TraceLock.hh"
6 
7 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 #define build_key(seed) ((seed) + ((GetPartitionNumber() + 1) << 16) + (getpid() & 0xFFFF))
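// build_key composes a shared memory key from three fields: the caller-supplied seed in the
// high bits, the partition number (plus one) shifted into bits 16 and up, and the low 16 bits
// of the process ID, so separate partitions and processes on one host do not collide.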
13 
14 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
15 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
16 const std::string artdaq::SharedMemoryEventManager::
17  FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
18 const std::string artdaq::SharedMemoryEventManager::
19  EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
20 
21 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
22  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", build_key(0xEE000000)),
23  pset.get<size_t>("buffer_count"),
24  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
25  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
26  !pset.get<bool>("broadcast_mode", false))
27  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
28  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
29  , queue_size_(pset.get<size_t>("buffer_count"))
30  , run_id_(0)
31  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
32  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
33  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
34  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
35  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
36  , running_(false)
37  , buffer_writes_pending_()
38  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
39  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
40  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
41  , last_backpressure_report_time_(std::chrono::steady_clock::now())
42  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
43  , metric_data_()
44  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
45  , run_event_count_(0)
46  , run_incomplete_event_count_(0)
47  , subrun_event_count_(0)
48  , subrun_incomplete_event_count_(0)
49  , oversize_fragment_count_(0)
50  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
51  , art_processes_()
52  , restart_art_(false)
53  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
54  , manual_art_(pset.get<bool>("manual_art", false))
55  , current_art_pset_(art_pset)
56  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
57  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
58  , requests_(nullptr)
59  , data_pset_(pset)
60  , dropped_data_()
61  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", build_key(0xBB000000)),
62  pset.get<size_t>("broadcast_buffer_count", 10),
63  pset.get<size_t>("broadcast_buffer_size", 0x100000),
64  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
65 {
66  subrun_event_map_[0] = 1;
67  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
68  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
69 
70  if (pset.get<bool>("use_art", true) == false)
71  {
72  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
73  num_art_processes_ = 0;
74  }
75  else
76  {
77  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
78  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
79  }
80  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
81 
82  if (overwrite_mode_ && num_art_processes_ > 0)
83  {
84  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this in unintentional!";
85  }
86  else if (overwrite_mode_)
87  {
88  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
89  }
90 
91  for (size_t ii = 0; ii < size(); ++ii)
92  {
93  buffer_writes_pending_[ii] = 0;
94  }
95 
96  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
97 
98  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
99  SetRank(my_rank);
100  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
101 
104 
105  // fetch the monitoring parameters and create the MonitoredQuantity instances
106  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
107 
108  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
109 }
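// Illustrative FHiCL sketch (not part of this file; values are placeholders): the main
// parameters read by the constructor above. Only buffer_count, expected_fragments_per_event,
// and max_fragment_size_bytes (or max_event_size_bytes) have no defaults; everything else
// falls back to the defaults visible in the pset.get() calls.
//
//   buffer_count: 20
//   expected_fragments_per_event: 2
//   max_fragment_size_bytes: 0x100000    # or specify max_event_size_bytes directly
//   stale_buffer_timeout_usec: 5000000
//   art_analyzer_count: 1
//   use_art: true
//   broadcast_mode: false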
110 
111 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
112 {
113  TLOG(TLVL_TRACE) << "DESTRUCTOR";
114  if (running_) endOfData();
115  TLOG(TLVL_TRACE) << "Destructor END";
116 }
117 
118 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
119 {
120  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
121  << ", sequence_id=" << frag.sequence_id;
122  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
123  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
124  if (buffer == -1) return false;
125  if (buffer == -2)
126  {
127  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
128  return true;
129  }
130 
131  auto hdr = getEventHeader_(buffer);
132  if (update_run_ids_)
133  {
134  hdr->run_id = run_id_;
135  }
136  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
137 
138  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
139  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
140 
141  TLOG(TLVL_TRACE) << "Checking for complete event";
142  auto fragmentCount = GetFragmentCount(frag.sequence_id);
143  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
144  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
145  << ", fragmentCount=" << fragmentCount
146  << ", num_fragments_per_event=" << num_fragments_per_event_
147  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
148 
149  complete_buffer_(buffer);
150  if (requests_) requests_->SendRequest(true);
151 
152  TLOG(TLVL_TRACE) << "AddFragment END";
153  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
154  return true;
155 }
156 
157 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
158 {
159  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
160  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
161  auto data = frag->headerAddress();
162  auto start = std::chrono::steady_clock::now();
163  bool sts = false;
164  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
165  {
166  sts = AddFragment(hdr, data);
167  if (!sts) usleep(1000);
168  }
169  if (!sts)
170  {
171  outfrag = std::move(frag);
172  }
173  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
174  return sts;
175 }
176 
177 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
178 {
179  TLOG(14) << "WriteFragmentHeader BEGIN";
180  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
181 
182  if (buffer < 0)
183  {
184  if (buffer == -1 && !dropIfNoBuffersAvailable)
185  {
186  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
187  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
188  {
189  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
190  last_backpressure_report_time_ = std::chrono::steady_clock::now();
191  }
192  return nullptr;
193  }
194  if (buffer == -2)
195  {
196  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
197  }
198  else
199  {
200  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
201  }
202  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
203 
204  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
205  return dropped_data_[frag.fragment_id]->dataBegin();
206  }
207 
208  last_backpressure_report_time_ = std::chrono::steady_clock::now();
209  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
210  // Increment this as soon as we know we want to use the buffer
211  buffer_writes_pending_[buffer]++;
212 
213  if (metricMan)
214  {
215  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
216  }
217 
218  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
219 
220  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
221 
222  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
223 
224  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
225  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
226  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
227 
228  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
229  if (frag.word_count - frag.num_words() > 0)
230  {
231  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
232 
233  if (!sts)
234  {
235  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
236  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
237  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
238  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
239 
240  oversize_fragment_count_++;
241 
242  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
243  {
244  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
245  }
246 
247  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
248  return dropped_data_[frag.fragment_id]->dataBegin();
249  }
250  }
251  TLOG(14) << "WriteFragmentHeader END";
252  return pos;
253 }
254 
255 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
256 {
257  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
258  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
259  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
260  if (buffer == -2) { return; }
261 
262  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
263  {
264  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
265 
266  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
267 
268  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
269 
270  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
271 
272  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
273  auto hdr = getEventHeader_(buffer);
274  if (update_run_ids_)
275  {
276  hdr->run_id = run_id_;
277  }
278  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
279 
280  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
281  TouchBuffer(buffer);
282 
283  buffer_writes_pending_[buffer]--;
284  if (buffer_writes_pending_[buffer] != 0)
285  {
286  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
287  return;
288  }
289  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
290  auto frag_count = GetFragmentCount(frag.sequence_id);
291  hdr->is_complete = frag_count == num_fragments_per_event_;
292  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
293 #if ART_SUPPORTS_DUPLICATE_EVENTS
294  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
295  {
296  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
297  }
298 #endif
299  }
300 
301  complete_buffer_(buffer);
302  if (requests_) requests_->SendRequest(true);
303  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
304 }
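// Illustrative caller-side sketch (an assumption, not code from this file): the two-step
// write path formed by WriteFragmentHeader() and DoneWritingFragment(). The first call
// reserves space and copies the header, returning a pointer for the payload; the caller
// fills that in and then finishes with DoneWritingFragment(). Names such as mgr and
// payload_src are placeholders.
//
//   auto hdr = *reinterpret_cast<artdaq::detail::RawFragmentHeader*>(frag->headerAddress());
//   artdaq::RawDataType* dest = mgr.WriteFragmentHeader(hdr);
//   if (dest != nullptr)
//   {
//     // Payload length is the total word count minus the header words
//     memcpy(dest, payload_src, (hdr.word_count - hdr.num_words()) * sizeof(artdaq::RawDataType));
//     mgr.DoneWritingFragment(hdr);
//   }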
305 
306 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
307 {
308  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
309 }
310 
311 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
312 {
313  if (buffer == -1) return 0;
314  ResetReadPos(buffer);
315  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
316 
317  size_t count = 0;
318 
319  while (MoreDataInBuffer(buffer))
320  {
321  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
322  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
323  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
324  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
325  ++count;
326  }
327 
328  return count;
329 }
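// Buffer layout implied by the read loop above: a RawEventHeader followed by a contiguous
// sequence of Fragments, each a RawFragmentHeader (whose word_count covers header plus
// payload) immediately followed by its payload words.
//
//   [ RawEventHeader | RawFragmentHeader + payload | RawFragmentHeader + payload | ... ]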
330 
331 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
332 {
333  do
334  {
335  auto start_time = std::chrono::steady_clock::now();
336  send_init_frag_();
337  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
338 
339  pid_t pid = 0;
340 
341  if (!manual_art_)
342  {
343  char* filename = new char[config_file->getFileName().length() + 1];
344  strcpy(filename, config_file->getFileName().c_str());
345 
346 #if DEBUG_ART
347  std::string debugArgS = "--config-out=" + app_name + "_art.out";
348  char* debugArg = new char[debugArgS.length() + 1];
349  strcpy(debugArg, debugArgS.c_str());
350 
351  std::vector<char*> args{(char*)"art", (char*)"-c", filename, debugArg, NULL};
352 #else
353  std::vector<char*> args{(char*)"art", (char*)"-c", filename, NULL};
354 #endif
355 
356  pid = fork();
357  if (pid == 0)
358  { /* child */
359  // 23-May-2018, KAB: added the setting of the partition number env var
360  // in the environment of the child art process so that Globals.hh
361  // will pick it up there and provide it to the artdaq classes that
362  // are used in data transfers, etc. within the art process.
363  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
364  std::string envVarValue = std::to_string(GetPartitionNumber());
365  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
366  {
367  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
368  << "\" in the environment of a child art process. "
369  << "This may result in incorrect TCP port number "
370  << "assignments or other issues, and data may "
371  << "not flow through the system correctly.";
372  }
373  envVarKey = "ARTDAQ_APPLICATION_NAME";
374  envVarValue = app_name;
375  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
376  {
377  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
378  << "\" in the environment of a child art process. ";
379  }
380  envVarKey = "ARTDAQ_RANK";
381  envVarValue = std::to_string(my_rank);
382  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
383  {
384  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
385  << "\" in the environment of a child art process. ";
386  }
387 
388  execvp("art", &args[0]);
389  delete[] filename;
390  exit(1);
391  }
392  delete[] filename;
393  }
394  else
395  {
396  //Using cin/cout here to ensure console is active (artdaqDriver)
397  std::cout << "Please run the following command in a separate terminal:" << std::endl
398  << "art -c " << config_file->getFileName() << std::endl
399  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
400  << "Finally, return to this window and enter the pid: " << std::endl;
401  std::cin >> pid;
402  }
403  *pid_out = pid;
404 
405  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
406  {
407  std::unique_lock<std::mutex> lk(art_process_mutex_);
408  art_processes_.insert(pid);
409  }
410  siginfo_t status;
411  auto sts = waitid(P_PID, pid, &status, WEXITED);
412  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
413  {
414  std::unique_lock<std::mutex> lk(art_process_mutex_);
415  art_processes_.erase(pid);
416  }
417  if (sts < 0)
418  {
419  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
420  }
421  else if (status.si_code == CLD_EXITED && status.si_status == 0)
422  {
423  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
424  }
425  else
426  {
427  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
428  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
429 
430  auto exit_type = "exited with status code";
431  switch (status.si_code)
432  {
433  case CLD_DUMPED:
434  case CLD_KILLED:
435  exit_type = "was killed with signal";
436  break;
437  case CLD_EXITED:
438  default:
439  break;
440  }
441 
442  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
443  << "art process " << pid << " " << exit_type << " " << status.si_status
444  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
445  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
446  << (restart_art_ ? "restarting" : "not restarting");
447  }
448  } while (restart_art_);
449 }
450 
451 void artdaq::SharedMemoryEventManager::StartArt()
452 {
453  restart_art_ = always_restart_art_;
454  if (num_art_processes_ == 0) return;
455  for (size_t ii = 0; ii < num_art_processes_; ++ii)
456  {
457  StartArtProcess(current_art_pset_);
458  }
459 }
460 
461 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
462 {
463  static std::mutex start_art_mutex;
464  std::unique_lock<std::mutex> lk(start_art_mutex);
465  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
466  restart_art_ = always_restart_art_;
467  auto initialCount = GetAttachedCount();
468  auto startTime = std::chrono::steady_clock::now();
469 
470  if (pset != current_art_pset_ || !current_art_config_file_)
471  {
472  current_art_pset_ = pset;
473  current_art_config_file_ = std::make_shared<art_config_file>(pset /*, GetKey(), GetBroadcastKey()*/);
474  }
475  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
476  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
477  thread.detach();
478 
479  auto currentCount = GetAttachedCount() - initialCount;
480  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
481  {
482  usleep(10000);
483  currentCount = GetAttachedCount() - initialCount;
484  }
485  if ((currentCount < 1 || *pid <= 0) && manual_art_)
486  {
487  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << pid;
488  return 0;
489  }
490  else if (currentCount < 1 || *pid <= 0)
491  {
492  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
493  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
494  return 0;
495  }
496  else
497  {
498  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
499  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
500 
501  return *pid;
502  }
503 }
504 
505 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
506 {
507  restart_art_ = false;
508  //current_art_config_file_ = nullptr;
509  //current_art_pset_ = fhicl::ParameterSet();
510 
511  auto check_pids = [&](bool print) {
512  std::unique_lock<std::mutex> lk(art_process_mutex_);
513  for (auto pid = pids.begin(); pid != pids.end();)
514  {
515  // 08-May-2018, KAB: protect against killing invalid PIDS
516 
517  if (*pid <= 0)
518  {
519  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
520  << ") from the shutdown list.";
521  pid = pids.erase(pid);
522  }
523  else if (kill(*pid, 0) < 0)
524  {
525  pid = pids.erase(pid);
526  }
527  else
528  {
529  if (print) std::cout << *pid << " ";
530  ++pid;
531  }
532  }
533  };
534  auto count_pids = [&]() {
535  std::unique_lock<std::mutex> lk(art_process_mutex_);
536  return pids.size();
537  };
538  check_pids(false);
539  if (count_pids() == 0)
540  {
541  TLOG(14) << "All art processes already exited, nothing to do.";
542  usleep(1000);
543  return;
544  }
545 
546  if (!manual_art_)
547  {
548  {
549  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
550  std::unique_lock<std::mutex> lk(art_process_mutex_);
551  for (auto pid : pids)
552  {
553  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
554  kill(pid, SIGQUIT);
555  }
556  }
557 
558  int graceful_wait_ms = 5000;
559  int int_wait_ms = 1000;
560 
561  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
562  for (int ii = 0; ii < graceful_wait_ms; ++ii)
563  {
564  usleep(1000);
565 
566  check_pids(false);
567  if (count_pids() == 0)
568  {
569  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
570  return;
571  }
572  }
573 
574  {
575  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
576  std::unique_lock<std::mutex> lk(art_process_mutex_);
577  for (auto pid : pids)
578  {
579  kill(pid, SIGINT);
580  }
581  }
582 
583  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
584  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
585  {
586  usleep(1000);
587 
588  check_pids(false);
589 
590  if (count_pids() == 0)
591  {
592  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
593  return;
594  }
595  }
596 
597  TLOG(TLVL_TRACE) << "Killing remaning art processes with extreme prejudice";
598  while (count_pids() > 0)
599  {
600  {
601  std::unique_lock<std::mutex> lk(art_process_mutex_);
602  kill(*pids.begin(), SIGKILL);
603  usleep(1000);
604  }
605  check_pids(false);
606  }
607  }
608  else
609  {
610  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
611  while (count_pids() > 0)
612  {
613  std::cout << "The following PIDs are running: ";
614  check_pids(true);
615  std::cout << std::endl;
616  std::string ignored;
617  std::cin >> ignored;
618  }
619  }
620 }
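// Shutdown escalation used above when manual_art_ is false: SIGQUIT first with up to
// graceful_wait_ms (5000 ms) to exit, then SIGINT with a further int_wait_ms (1000 ms),
// and finally SIGKILL in a loop until no art PIDs remain.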
621 
622 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
623 {
624  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
625  if (restart_art_ || !always_restart_art_) // Art is running
626  {
627  endOfData();
628  }
629  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
630  {
631  broadcasts_.MarkBufferEmpty(ii, true);
632  }
633  if (newRun == 0) newRun = run_id_ + 1;
634 
635  if (art_pset != current_art_pset_ || !current_art_config_file_)
636  {
637  current_art_pset_ = art_pset;
638  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
639  }
640 
641  if (n_art_processes != -1)
642  {
643  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
644  num_art_processes_ = n_art_processes;
645  }
646  startRun(newRun);
647  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
648 }
649 
650 bool artdaq::SharedMemoryEventManager::endOfData()
651 {
652  running_ = false;
653  init_fragment_.reset(nullptr);
654  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
655  restart_art_ = false;
656 
657  size_t initialStoreSize = GetIncompleteEventCount();
658  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
659  << " stale events from the SharedMemoryEventManager.";
660  int counter = initialStoreSize;
661  while (active_buffers_.size() > 0 && counter > 0)
662  {
663  complete_buffer_(*active_buffers_.begin());
664  counter--;
665  }
666  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
667  << " stale events in the SharedMemoryEventManager.";
668 
669  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
670  auto start = std::chrono::steady_clock::now();
671  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
672  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1); //size();
673 
674  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
675 
676  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
677  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
678  {
679  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
680  if (temp != lastReadCount)
681  {
682  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
683  lastReadCount = temp;
684  start = std::chrono::steady_clock::now();
685  }
686  if (lastReadCount > 0)
687  {
688  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
689  usleep(outstanding_buffer_wait_time);
690  }
691  }
692 
693  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
694  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
695 
696  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
697  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
698  bool success = broadcastFragment_(std::move(outFrag), outFrag);
699  if (!success)
700  {
701  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
702  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
703  {
704  broadcasts_.MarkBufferEmpty(ii, true);
705  }
706  broadcastFragment_(std::move(outFrag), outFrag);
707  }
708  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
709 
710  if (get_art_process_count_() > 0)
711  {
712  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
713  if (end_of_data_wait_us == 0)
714  {
715  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
716  end_of_data_wait_us = 100 * 1000000;
717  }
718 
719  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
720  for (size_t ii = 0; ii < sleep_count; ++ii)
721  {
722  usleep(10000);
723  if (get_art_process_count_() == 0) break;
724  }
725  }
726 
727  while (get_art_process_count_() > 0)
728  {
729  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
730 
731  ShutdownArtProcesses(art_processes_);
732  }
733  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
734 
735  ResetAttachedCount();
736 
737  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
738  for (size_t ii = 0; ii < size(); ++ii)
739  {
740  MarkBufferEmpty(ii, true);
741  }
742  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
743  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
744  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
745  // {
746  // broadcasts_.MarkBufferEmpty(ii, true);
747  // }
748  released_incomplete_events_.clear();
749 
750  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender";
751  requests_.reset(nullptr);
752 
753  TLOG(TLVL_DEBUG) << "endOfData END";
754  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
755  return true;
756 }
757 
758 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
759 {
760  running_ = true;
761  init_fragment_.reset(nullptr);
762  statsHelper_.resetStatistics();
763  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
764  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
765  {
766  broadcasts_.MarkBufferEmpty(ii, true);
767  }
768  StartArt();
769  run_id_ = runID;
770  {
771  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
772  subrun_event_map_.clear();
773  subrun_event_map_[0] = 1;
774  }
775  run_event_count_ = 0;
776  run_incomplete_event_count_ = 0;
777  requests_.reset(new RequestSender(data_pset_));
778  if (requests_)
779  {
780  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
781  requests_->SendRoutingToken(queue_size_, run_id_);
782  }
783  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
784  << ", max queue size = "
785  << queue_size_
786  << ", queue size = "
787  << GetLockedBufferCount();
788  if (metricMan)
789  {
790  metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint);
791  }
792 }
793 
794 bool artdaq::SharedMemoryEventManager::endRun()
795 {
796  TLOG(TLVL_INFO) << "Ending run " << run_id_;
797  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
798  static_cast<double>(sizeof(Fragment::value_type))))));
799 
800  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
801  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
802  *endOfRunFrag->dataBegin() = my_rank;
803  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
804 
805  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
806  run_event_count_ = 0;
807  run_incomplete_event_count_ = 0;
808  oversize_fragment_count_ = 0;
809  {
810  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
811  subrun_event_map_.clear();
812  subrun_event_map_[0] = 1;
813  }
814  return true;
815 }
816 
817 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
818 {
819  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
820  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
821 
822  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
823 
824  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
825  subrun_event_map_[boundary] = subrun;
826  while (subrun_event_map_.size() > max_subrun_event_map_length_)
827  {
828  subrun_event_map_.erase(subrun_event_map_.begin());
829  }
830 }
831 
832 void artdaq::SharedMemoryEventManager::rolloverSubrun()
833 {
834  Fragment::sequence_id_t seqID = 0;
835  subrun_id_t subrun = 0;
836  {
837  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
838  for (auto& it : subrun_event_map_)
839  {
840  if (it.first >= seqID) seqID = it.first + 1;
841  if (it.second >= subrun) subrun = it.second + 1;
842  }
843  }
844  rolloverSubrun(seqID, subrun);
845 }
846 
847 void artdaq::SharedMemoryEventManager::sendMetrics()
848 {
849  if (metricMan)
850  {
851  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
852  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
853  }
854 
855  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
856  {
857  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
858  return;
859 
860  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
861  std::ostringstream oss;
862  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
863  for (auto& ev : active_buffers_)
864  {
865  auto hdr = getEventHeader_(ev);
866  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
867  }
868  TLOG(TLVL_DEBUG) << oss.str();
869  }
870 }
871 
872 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
873 {
874  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
875  auto buffer = broadcasts_.GetBufferForWriting(false);
876  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
877  auto start_time = std::chrono::steady_clock::now();
878  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
879  {
880  usleep(10000);
881  buffer = broadcasts_.GetBufferForWriting(false);
882  }
883  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
884  if (buffer == -1)
885  {
886  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
887  outFrag.swap(frag);
888  return false;
889  }
890 
891  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
892  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
893  hdr->run_id = run_id_;
894  hdr->subrun_id = GetSubrunForSequenceID(frag->sequenceID());
895  hdr->sequence_id = frag->sequenceID();
896  hdr->is_complete = true;
897  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
898 
899  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
900  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
901 
902  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
903  broadcasts_.MarkBufferFull(buffer, -1);
904  outFrag.swap(frag);
905  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
906  return true;
907 }
908 
909 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
910 {
911  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
912 }
913 
914 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
915 {
916  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
917 
918  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
919  auto it = subrun_event_map_.begin();
920  subrun_id_t subrun = 1;
921 
922  while (it != subrun_event_map_.end() && it->first <= seqID)
923  {
924  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
925  subrun = it->second;
926  ++it;
927  }
928 
929  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
930  return subrun;
931 }
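// Worked example (illustrative values): with subrun_event_map_ = { {0,1}, {1000,2} },
// any sequence ID below 1000 resolves to subrun 1 and any ID of 1000 or above resolves to
// subrun 2, because the loop keeps the subrun of the last entry whose boundary is <= seqID.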
932 
933 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
934 {
935  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
936  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
937 
938  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
939 
940  auto buffers = GetBuffersOwnedByManager();
941  for (auto& buf : buffers)
942  {
943  auto hdr = getEventHeader_(buf);
944  if (hdr->sequence_id == seqID)
945  {
946  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
947  return buf;
948  }
949  }
950 
951 #if !ART_SUPPORTS_DUPLICATE_EVENTS
952  if (released_incomplete_events_.count(seqID))
953  {
954  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
955  return -2;
956  }
957 #endif
958 
959  if (!create_new) return -1;
960 
961  check_pending_buffers_(lk);
962  int new_buffer = GetBufferForWriting(false);
963 
964  if (new_buffer == -1)
965  {
966  new_buffer = GetBufferForWriting(overwrite_mode_);
967  }
968 
969  if (new_buffer == -1) return -1;
970  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
971  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
972  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
973  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
974  auto hdr = getEventHeader_(new_buffer);
975  hdr->is_complete = false;
976  hdr->run_id = run_id_;
977  hdr->subrun_id = GetSubrunForSequenceID(seqID);
978  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
979  hdr->sequence_id = seqID;
980  buffer_writes_pending_[new_buffer] = 0;
981  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
982  SetMFIteration("Sequence ID " + std::to_string(seqID));
983 
984  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
985  active_buffers_.insert(new_buffer);
986  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
987  << size() << ","
988  << ReadReadyCount() << ","
989  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
990  << WriteReadyCount(false) << ","
991  << pending_buffers_.size() << ","
992  << active_buffers_.size() << ")";
993 
994  if (requests_)
995  {
996  if (timestamp != Fragment::InvalidTimestamp)
997  {
998  requests_->AddRequest(seqID, timestamp);
999  }
1000  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
1001  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1002  else
1003  {
1004  requests_->SendRequest();
1005  }
1006  }
1007  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1008  return new_buffer;
1009 }
1010 
1011 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1012 {
1013  if (buffer == -1) return true;
1014  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1015  {
1016  return true;
1017  }
1018  ResetReadPos(buffer);
1019  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1020  return MoreDataInBuffer(buffer);
1021 }
1022 
1023 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1024 {
1025  auto hdr = getEventHeader_(buffer);
1026  if (hdr->is_complete)
1027  {
1028  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1029 
1030  {
1031  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1032 
1033  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1034  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1035  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1036  active_buffers_.erase(buffer);
1037  pending_buffers_.insert(buffer);
1038 
1039  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1040  << size() << ","
1041  << ReadReadyCount() << ","
1042  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1043  << WriteReadyCount(false) << ","
1044  << pending_buffers_.size() << ","
1045  << active_buffers_.size() << ")";
1046  }
1047  if (requests_)
1048  {
1049  requests_->RemoveRequest(hdr->sequence_id);
1050  }
1051  }
1052  CheckPendingBuffers();
1053 }
1054 
1055 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1056 {
1057  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1058 }
1059 
1060 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1061 {
1062  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1063  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1064  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1065  check_pending_buffers_(lk);
1066 }
1067 
1068 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1069 {
1070  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1071 
1072  auto buffers = GetBuffersOwnedByManager();
1073  for (auto buf : buffers)
1074  {
1075  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1076  {
1077  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_bufers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1078  auto hdr = getEventHeader_(buf);
1079  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1080  {
1081  if (requests_)
1082  {
1083  requests_->RemoveRequest(hdr->sequence_id);
1084  }
1085  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1086  active_buffers_.erase(buf);
1087  pending_buffers_.insert(buf);
1088  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1089  << size() << ","
1090  << ReadReadyCount() << ","
1091  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1092  << WriteReadyCount(false) << ","
1093  << pending_buffers_.size() << ","
1094  << active_buffers_.size() << ")";
1095 
1096  run_incomplete_event_count_++;
1097  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1098  if (!released_incomplete_events_.count(hdr->sequence_id))
1099  {
1100  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1101  }
1102  else
1103  {
1104  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1105  }
1106  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1107  }
1108  }
1109  }
1110 
1111  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1112  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1113 
1114  auto counter = 0;
1115  double eventSize = 0;
1116  for (auto buf : sorted_buffers)
1117  {
1118  auto hdr = getEventHeader_(buf);
1119  auto thisEventSize = BufferDataSize(buf);
1120 
1121  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1122  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1123  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1124 
1125  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1126  MarkBufferFull(buf);
1127  run_event_count_++;
1128  counter++;
1129  eventSize += thisEventSize;
1130  pending_buffers_.erase(buf);
1131  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1132  << size() << ","
1133  << ReadReadyCount() << ","
1134  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1135  << WriteReadyCount(false) << ","
1136  << pending_buffers_.size() << ","
1137  << active_buffers_.size() << ")";
1138  }
1139 
1140  if (requests_)
1141  {
1142  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1143  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1144  auto available_buffers = WriteReadyCount(overwrite_mode_);
1145 
1146  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1147  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1148 
1149  if (available_buffers > outstanding_tokens)
1150  {
1151  auto tokens_to_send = available_buffers - outstanding_tokens;
1152 
1153  while (tokens_to_send > 0)
1154  {
1155  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1156  requests_->SendRoutingToken(1, run_id_);
1157  tokens_to_send--;
1158  }
1159  }
1160  }
1161 
1162  if (statsHelper_.readyToReport())
1163  {
1164  std::string statString = buildStatisticsString_();
1165  TLOG(TLVL_INFO) << statString;
1166  }
1167 
1168  metric_data_.event_count += counter;
1169  metric_data_.event_size += eventSize;
1170 
1171  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1172  {
1173  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1174  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1175  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1176  metric_data_ = MetricData();
1177 
1178  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1179  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1180  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1181 
1182  auto bufferReport = GetBufferReport();
1183  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Full; });
1184  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Empty; });
1185  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Writing; });
1186  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) { return p.second == BufferSemaphoreFlags::Reading; });
1187  auto total = size();
1188  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1189 
1190  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1191  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1192  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1193  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1194  if (total > 0)
1195  {
1196  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1197  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1198  }
1199 
1200  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1201  }
1202  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1203 }
1204 
1205 void artdaq::SharedMemoryEventManager::send_init_frag_()
1206 {
1207  if (init_fragment_ != nullptr)
1208  {
1209  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1210 
1211 #if 0
1212  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1213  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1214  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1215  ostream.close();
1216 #endif
1217 
1218  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1219  TLOG(TLVL_TRACE) << "Init Fragment sent";
1220  }
1221  else if (send_init_fragments_)
1222  {
1223  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1224  }
1225 }
1226 
1227 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
1228 {
1229  if (init_fragment_ == nullptr)
1230  {
1231  init_fragment_.swap(frag);
1232  send_init_frag_();
1233  }
1234 }
1235 
1236 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1237 {
1238  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1239  if (art_pset != current_art_pset_ || !current_art_config_file_)
1240  {
1241  current_art_pset_ = art_pset;
1242  current_art_config_file_ = std::make_shared<art_config_file>(art_pset /*, GetKey(), GetBroadcastKey()*/);
1243  }
1244  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1245 }
1246 
1247 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const
1248 {
1249  std::ostringstream oss;
1250  oss << app_name << " statistics:" << std::endl;
1251 
1252  artdaq::MonitoredQuantityPtr mqPtr =
1253  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1254  if (mqPtr.get() != 0)
1255  {
1256  artdaq::MonitoredQuantityStats stats;
1257  mqPtr->getStats(stats);
1258  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1259  << " events/sec, effective data rate = "
1260  << (stats.recentValueRate * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1261  << " MB/sec, monitor window = " << stats.recentDuration
1262  << " sec, min::max event size = " << (stats.recentValueMin * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1263  << "::" << (stats.recentValueMax * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0) << " MB" << std::endl;
1264  if (stats.recentSampleRate > 0.0)
1265  {
1266  oss << " Average time per event: ";
1267  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1268  }
1269  }
1270 
1271  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1272  if (mqPtr.get() != 0)
1273  {
1274  artdaq::MonitoredQuantityStats stats;
1275  mqPtr->getStats(stats);
1276  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1277  << " fragments/sec, effective data rate = "
1278  << (stats.recentValueRate * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1279  << " MB/sec, monitor window = " << stats.recentDuration
1280  << " sec, min::max fragment size = " << (stats.recentValueMin * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1281  << "::" << (stats.recentValueMax * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0) << " MB" << std::endl;
1282  }
1283 
1284  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1285  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1286  << std::endl;
1287  return oss.str();
1288 }
1289 
1290 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
1291 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
1292 #endif