artdaq  v3_06_00
SharedMemoryEventManager.cc
1 
2 #include "artdaq/DAQrate/SharedMemoryEventManager.hh"
3 #include <sys/wait.h>
4 #include "artdaq-core/Core/StatisticsCollection.hh"
5 #include "artdaq-core/Utilities/TraceLock.hh"
6 
7 #define TRACE_NAME (app_name + "_SharedMemoryEventManager").c_str()
8 
9 #define TLVL_BUFFER 40
10 #define TLVL_BUFLCK 41
11 
12 std::mutex artdaq::SharedMemoryEventManager::sequence_id_mutex_;
13 std::mutex artdaq::SharedMemoryEventManager::subrun_event_map_mutex_;
14 const std::string artdaq::SharedMemoryEventManager::
15 FRAGMENTS_RECEIVED_STAT_KEY("SharedMemoryEventManagerFragmentsReceived");
16 const std::string artdaq::SharedMemoryEventManager::
17 EVENTS_RELEASED_STAT_KEY("SharedMemoryEventManagerEventsReleased");
18 
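// Illustrative FHiCL fragment showing the main parameters read by this constructor.
// The parameter names come from the pset.get<> calls below; the values are examples only:
//   buffer_count: 20                      # number of shared memory buffers (required)
//   expected_fragments_per_event: 5       # required
//   max_event_size_bytes: 0x200000        # optional; default is expected_fragments_per_event * max_fragment_size_bytes
//   stale_buffer_timeout_usec: 5000000    # optional; default is event_queue_wait_time (seconds) * 1000000
//   art_analyzer_count: 1                 # number of art processes to start
//   use_art: true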
19 artdaq::SharedMemoryEventManager::SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
20  : SharedMemoryManager(pset.get<uint32_t>("shared_memory_key", 0xBEE70000 + getpid()),
21  pset.get<size_t>("buffer_count"),
22  pset.has_key("max_event_size_bytes") ? pset.get<size_t>("max_event_size_bytes") : pset.get<size_t>("expected_fragments_per_event") * pset.get<size_t>("max_fragment_size_bytes"),
23  pset.get<size_t>("stale_buffer_timeout_usec", pset.get<size_t>("event_queue_wait_time", 5) * 1000000),
24  !pset.get<bool>("broadcast_mode", false))
25  , num_art_processes_(pset.get<size_t>("art_analyzer_count", 1))
26  , num_fragments_per_event_(pset.get<size_t>("expected_fragments_per_event"))
27  , queue_size_(pset.get<size_t>("buffer_count"))
28  , run_id_(0)
29  , max_subrun_event_map_length_(pset.get<size_t>("max_subrun_lookup_table_size", 100))
30  , update_run_ids_(pset.get<bool>("update_run_ids_on_new_fragment", true))
31  , use_sequence_id_for_event_number_(pset.get<bool>("use_sequence_id_for_event_number", true))
32  , overwrite_mode_(!pset.get<bool>("use_art", true) || pset.get<bool>("overwrite_mode", false) || pset.get<bool>("broadcast_mode", false))
33  , send_init_fragments_(pset.get<bool>("send_init_fragments", true))
34  , running_(false)
35  , buffer_writes_pending_()
36  , incomplete_event_report_interval_ms_(pset.get<int>("incomplete_event_report_interval_ms", -1))
37  , last_incomplete_event_report_time_(std::chrono::steady_clock::now())
38  , last_shmem_buffer_metric_update_(std::chrono::steady_clock::now())
39  , last_backpressure_report_time_(std::chrono::steady_clock::now())
40  , last_fragment_header_write_time_(std::chrono::steady_clock::now())
41  , metric_data_()
42  , broadcast_timeout_ms_(pset.get<int>("fragment_broadcast_timeout_ms", 3000))
43  , run_event_count_(0)
44  , run_incomplete_event_count_(0)
45  , subrun_event_count_(0)
46  , subrun_incomplete_event_count_(0)
47  , oversize_fragment_count_(0)
48  , maximum_oversize_fragment_count_(pset.get<int>("maximum_oversize_fragment_count", 1))
49  , art_processes_()
50  , restart_art_(false)
51  , always_restart_art_(pset.get<bool>("restart_crashed_art_processes", true))
52  , manual_art_(pset.get<bool>("manual_art", false))
53  , current_art_pset_(art_pset)
54  , minimum_art_lifetime_s_(pset.get<double>("minimum_art_lifetime_s", 2.0))
55  , art_event_processing_time_us_(pset.get<size_t>("expected_art_event_processing_time_us", 1000000))
56  , requests_(nullptr)
57  , data_pset_(pset)
58  , dropped_data_()
59  , broadcasts_(pset.get<uint32_t>("broadcast_shared_memory_key", 0xCEE70000 + getpid()),
60  pset.get<size_t>("broadcast_buffer_count", 10),
61  pset.get<size_t>("broadcast_buffer_size", 0x100000),
62  pset.get<int>("expected_art_event_processing_time_us", 100000) * pset.get<size_t>("buffer_count"), false)
63 {
64  subrun_event_map_[0] = 1;
65  SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
66  broadcasts_.SetMinWriteSize(sizeof(detail::RawEventHeader) + sizeof(detail::RawFragmentHeader));
67 
68  if (pset.get<bool>("use_art", true) == false)
69  {
70  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:false";
71  num_art_processes_ = 0;
72  }
73  else
74  {
75  TLOG(TLVL_INFO) << "BEGIN SharedMemoryEventManager CONSTRUCTOR with use_art:true";
76  TLOG(TLVL_TRACE) << "art_pset is " << art_pset.to_string();
77  }
78  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
79 
80  if (overwrite_mode_ && num_art_processes_ > 0)
81  {
82  TLOG(TLVL_WARNING) << "Art is configured to run, but overwrite mode is enabled! Check your configuration if this is unintentional!";
83  }
84  else if (overwrite_mode_)
85  {
86  TLOG(TLVL_INFO) << "Overwrite Mode enabled, no configured art processes at startup";
87  }
88 
89  for (size_t ii = 0; ii < size(); ++ii)
90  {
91  buffer_writes_pending_[ii] = 0;
92  }
93 
94  if (!IsValid()) throw cet::exception(app_name + "_SharedMemoryEventManager") << "Unable to attach to Shared Memory!";
95 
96  TLOG(TLVL_TRACE) << "Setting Writer rank to " << my_rank;
97  SetRank(my_rank);
98  TLOG(TLVL_DEBUG) << "Writer Rank is " << GetRank();
99 
100  statsHelper_.addMonitoredQuantityName(FRAGMENTS_RECEIVED_STAT_KEY);
101  statsHelper_.addMonitoredQuantityName(EVENTS_RELEASED_STAT_KEY);
102 
103  // fetch the monitoring parameters and create the MonitoredQuantity instances
104  statsHelper_.createCollectors(pset, 100, 30.0, 60.0, EVENTS_RELEASED_STAT_KEY);
105 
106  TLOG(TLVL_TRACE) << "END CONSTRUCTOR";
107 }
108 
109 artdaq::SharedMemoryEventManager::~SharedMemoryEventManager()
110 {
111  TLOG(TLVL_TRACE) << "DESTRUCTOR";
112  if (running_) endOfData();
113  TLOG(TLVL_TRACE) << "Destructor END";
114 }
115 
116 bool artdaq::SharedMemoryEventManager::AddFragment(detail::RawFragmentHeader frag, void* dataPtr)
117 {
118  TLOG(TLVL_TRACE) << "AddFragment(Header, ptr) BEGIN frag.word_count=" << frag.word_count
119  << ", sequence_id=" << frag.sequence_id;
120  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
121  TLOG(TLVL_TRACE) << "Using buffer " << buffer << " for seqid=" << frag.sequence_id;
122  if (buffer == -1) return false;
123  if (buffer == -2)
124  {
125  TLOG(TLVL_ERROR) << "Dropping event because data taking has already passed this event number: " << frag.sequence_id;
126  return true;
127  }
128 
129  auto hdr = getEventHeader_(buffer);
130  if (update_run_ids_)
131  {
132  hdr->run_id = run_id_;
133  }
134  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
135 
136  TLOG(TLVL_TRACE) << "AddFragment before Write calls";
137  Write(buffer, dataPtr, frag.word_count * sizeof(RawDataType));
138 
139  TLOG(TLVL_TRACE) << "Checking for complete event";
140  auto fragmentCount = GetFragmentCount(frag.sequence_id);
141  hdr->is_complete = fragmentCount == num_fragments_per_event_ && buffer_writes_pending_[buffer] == 0;
142  TLOG(TLVL_TRACE) << "hdr->is_complete=" << std::boolalpha << hdr->is_complete
143  << ", fragmentCount=" << fragmentCount
144  << ", num_fragments_per_event=" << num_fragments_per_event_
145  << ", buffer_writes_pending_[buffer]=" << buffer_writes_pending_[buffer];
146 
147  complete_buffer_(buffer);
148  if (requests_) requests_->SendRequest(true);
149 
150  TLOG(TLVL_TRACE) << "AddFragment END";
151  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
152  return true;
153 }
154 
155 bool artdaq::SharedMemoryEventManager::AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag)
156 {
157  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) BEGIN";
158  auto hdr = *reinterpret_cast<detail::RawFragmentHeader*>(frag->headerAddress());
159  auto data = frag->headerAddress();
160  auto start = std::chrono::steady_clock::now();
161  bool sts = false;
162  while (!sts && TimeUtils::GetElapsedTimeMicroseconds(start) < timeout_usec)
163  {
164  sts = AddFragment(hdr, data);
165  if (!sts) usleep(1000);
166  }
167  if (!sts)
168  {
169  outfrag = std::move(frag);
170  }
171  TLOG(TLVL_TRACE) << "AddFragment(FragmentPtr) RETURN " << std::boolalpha << sts;
172  return sts;
173 }
174 
175 artdaq::RawDataType* artdaq::SharedMemoryEventManager::WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable)
176 {
177  TLOG(14) << "WriteFragmentHeader BEGIN";
178  auto buffer = getBufferForSequenceID_(frag.sequence_id, true, frag.timestamp);
179 
180  if (buffer < 0)
181  {
182  if (buffer == -1 && !dropIfNoBuffersAvailable)
183  {
184  std::unique_lock<std::mutex> bp_lk(sequence_id_mutex_);
185  if (TimeUtils::GetElapsedTime(last_backpressure_report_time_) > 1.0)
186  {
187  TLOG(TLVL_WARNING) << app_name << ": Back-pressure condition: All Shared Memory buffers have been full for " << TimeUtils::GetElapsedTime(last_fragment_header_write_time_) << " s!";
188  last_backpressure_report_time_ = std::chrono::steady_clock::now();
189  }
190  return nullptr;
191  }
192  if (buffer == -2)
193  {
194  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because data taking has already passed this event.";
195  }
196  else
197  {
198  TLOG(TLVL_ERROR) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the queue and reliable mode is off.";
199  }
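 // Hand the caller a pointer into a throw-away Fragment (dropped_data_) sized for the payload,
 // so its write can complete normally; the data is simply discarded instead of entering shared memory.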
200  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
201 
202  TLOG(6) << "Dropping fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin() << " sz=" << dropped_data_[frag.fragment_id]->dataSizeBytes();
203  return dropped_data_[frag.fragment_id]->dataBegin();
204  }
205 
206  last_backpressure_report_time_ = std::chrono::steady_clock::now();
207  last_fragment_header_write_time_ = std::chrono::steady_clock::now();
208  // Increment this as soon as we know we want to use the buffer
209  buffer_writes_pending_[buffer]++;
210 
211  if (metricMan)
212  {
213  metricMan->sendMetric("Input Fragment Rate", 1, "Fragments/s", 1, MetricMode::Rate);
214  }
215 
216  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtaining buffer_mutexes lock for buffer " << buffer;
217 
218  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
219 
220  TLOG(TLVL_BUFLCK) << "WriteFragmentHeader: obtained buffer_mutexes lock for buffer " << buffer;
221 
222  //TraceLock lk(buffer_mutexes_[buffer], 50, "WriteFragmentHeader");
223  auto hdrpos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
224  Write(buffer, &frag, frag.num_words() * sizeof(RawDataType));
225 
226  auto pos = reinterpret_cast<RawDataType*>(GetWritePos(buffer));
227  if (frag.word_count - frag.num_words() > 0)
228  {
229  auto sts = IncrementWritePos(buffer, (frag.word_count - frag.num_words()) * sizeof(RawDataType));
230 
231  if (!sts)
232  {
233  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->word_count = frag.num_words();
234  reinterpret_cast<detail::RawFragmentHeader*>(hdrpos)->type = Fragment::InvalidFragmentType;
235  TLOG(TLVL_ERROR) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " because there is no room in the current buffer for this Fragment! (Keeping header)";
236  dropped_data_[frag.fragment_id].reset(new Fragment(frag.word_count - frag.num_words()));
237 
238  oversize_fragment_count_++;
239 
240  if (maximum_oversize_fragment_count_ > 0 && oversize_fragment_count_ >= maximum_oversize_fragment_count_)
241  {
242  throw cet::exception("Too many over-size Fragments received! Please adjust max_event_size_bytes or max_fragment_size_bytes!");
243  }
244 
245  TLOG(6) << "Dropping over-size fragment with sequence id " << frag.sequence_id << " and fragment id " << frag.fragment_id << " into " << (void*)dropped_data_[frag.fragment_id]->dataBegin();
246  return dropped_data_[frag.fragment_id]->dataBegin();
247  }
248  }
249  TLOG(14) << "WriteFragmentHeader END";
250  return pos;
251 }
252 
253 void artdaq::SharedMemoryEventManager::DoneWritingFragment(detail::RawFragmentHeader frag)
254 {
255  TLOG(TLVL_TRACE) << "DoneWritingFragment BEGIN";
256  auto buffer = getBufferForSequenceID_(frag.sequence_id, false, frag.timestamp);
257  if (buffer == -1) Detach(true, "SharedMemoryEventManager", "getBufferForSequenceID_ returned -1 when it REALLY shouldn't have! Check program logic!");
258  if (buffer == -2) { return; }
259 
260  statsHelper_.addSample(FRAGMENTS_RECEIVED_STAT_KEY, frag.word_count * sizeof(RawDataType));
261  {
262  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtaining buffer_mutexes lock for buffer " << buffer;
263 
264  std::unique_lock<std::mutex> lk(buffer_mutexes_[buffer]);
265 
266  TLOG(TLVL_BUFLCK) << "DoneWritingFragment: obtained buffer_mutexes lock for buffer " << buffer;
267 
268  //TraceLock lk(buffer_mutexes_[buffer], 50, "DoneWritingFragment");
269 
270  TLOG(TLVL_DEBUG) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << " (type " << (int)frag.type << ")";
271  auto hdr = getEventHeader_(buffer);
272  if (update_run_ids_)
273  {
274  hdr->run_id = run_id_;
275  }
276  hdr->subrun_id = GetSubrunForSequenceID(frag.sequence_id);
277 
278  TLOG(TLVL_TRACE) << "DoneWritingFragment: Updating buffer touch time";
279  TouchBuffer(buffer);
280 
281  buffer_writes_pending_[buffer]--;
282  if (buffer_writes_pending_[buffer] != 0)
283  {
284  TLOG(TLVL_TRACE) << "Done writing fragment, but there's another writer. Not doing bookkeeping steps.";
285  return;
286  }
287  TLOG(TLVL_TRACE) << "Done writing fragment, and no other writer. Doing bookkeeping steps.";
288  auto frag_count = GetFragmentCount(frag.sequence_id);
289  hdr->is_complete = frag_count == num_fragments_per_event_;
290  TLOG(TLVL_TRACE) << "DoneWritingFragment: Received Fragment with sequence ID " << frag.sequence_id << " and fragment id " << frag.fragment_id << ", count/expected = " << frag_count << "/" << num_fragments_per_event_;
291 #if ART_SUPPORTS_DUPLICATE_EVENTS
292  if (!hdr->is_complete && released_incomplete_events_.count(frag.sequence_id))
293  {
294  hdr->is_complete = frag_count == released_incomplete_events_[frag.sequence_id] && buffer_writes_pending_[buffer] == 0;
295  }
296 #endif
297  }
298 
299  complete_buffer_(buffer);
300  if (requests_) requests_->SendRequest(true);
301  TLOG(TLVL_TRACE) << "DoneWritingFragment END";
302 }
303 
304 size_t artdaq::SharedMemoryEventManager::GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type)
305 {
306  return GetFragmentCountInBuffer(getBufferForSequenceID_(seqID, false), type);
307 }
308 
309 size_t artdaq::SharedMemoryEventManager::GetFragmentCountInBuffer(int buffer, Fragment::type_t type)
310 {
311  if (buffer == -1) return 0;
312  ResetReadPos(buffer);
313  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
314 
315  size_t count = 0;
316 
317  while (MoreDataInBuffer(buffer))
318  {
319  auto fragHdr = reinterpret_cast<artdaq::detail::RawFragmentHeader*>(GetReadPos(buffer));
320  IncrementReadPos(buffer, fragHdr->word_count * sizeof(RawDataType));
321  if (type != Fragment::InvalidFragmentType && fragHdr->type != type) continue;
322  TLOG(TLVL_TRACE) << "Adding Fragment with size=" << fragHdr->word_count << " to Fragment count";
323  ++count;
324  }
325 
326  return count;
327 }
328 
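// RunArt forks/execs a single art process and then blocks in waitid() until it exits. The
// do/while loop below re-launches art for as long as restart_art_ remains set; restarts are
// suppressed when the process dies in less than minimum_art_lifetime_s_ or when endOfData()
// clears the flag.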
329 void artdaq::SharedMemoryEventManager::RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out)
330 {
331  do
332  {
333  auto start_time = std::chrono::steady_clock::now();
334  send_init_frag_();
335  TLOG(TLVL_INFO) << "Starting art process with config file " << config_file->getFileName();
336 
337  pid_t pid = 0;
338 
339  if (!manual_art_)
340  {
341  char* filename = new char[config_file->getFileName().length() + 1];
342  strcpy(filename, config_file->getFileName().c_str());
343 
344 #if DEBUG_ART
345  std::string debugArgS = "--config-out=" + app_name + "_art.out";
346  char* debugArg = new char[debugArgS.length() + 1];
347  strcpy(debugArg, debugArgS.c_str());
348 
349  std::vector<char*> args{(char*)"art", (char*)"-c", filename, debugArg, NULL};
350 #else
351  std::vector<char*> args{ (char*)"art", (char*)"-c", filename, NULL };
352 #endif
353 
354  pid = fork();
355  if (pid == 0)
356  { /* child */
357  // 23-May-2018, KAB: added the setting of the partition number env var
358  // in the environment of the child art process so that Globals.hh
359  // will pick it up there and provide it to the artdaq classes that
360  // are used in data transfers, etc. within the art process.
361  std::string envVarKey = "ARTDAQ_PARTITION_NUMBER";
362  std::string envVarValue = std::to_string(GetPartitionNumber());
363  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
364  {
365  TLOG(TLVL_ERROR) << "Error setting environment variable \"" << envVarKey
366  << "\" in the environment of a child art process. "
367  << "This may result in incorrect TCP port number "
368  << "assignments or other issues, and data may "
369  << "not flow through the system correctly.";
370  }
371  envVarKey = "ARTDAQ_APPLICATION_NAME";
372  envVarValue = app_name;
373  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
374  {
375  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
376  << "\" in the environment of a child art process. ";
377  }
378  envVarKey = "ARTDAQ_RANK";
379  envVarValue = std::to_string(my_rank);
380  if (setenv(envVarKey.c_str(), envVarValue.c_str(), 1) != 0)
381  {
382  TLOG(TLVL_DEBUG) << "Error setting environment variable \"" << envVarKey
383  << "\" in the environment of a child art process. ";
384  }
385 
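 // execvp only returns on failure; clean up and exit the child with a nonzero status in that case.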
386  execvp("art", &args[0]);
387  delete[] filename;
388  exit(1);
389  }
390  delete[] filename;
391  }
392  else
393  {
394  //Using cin/cout here to ensure console is active (artdaqDriver)
395  std::cout << "Please run the following command in a separate terminal:" << std::endl
396  << "art -c " << config_file->getFileName() << std::endl
397  << "Then, in a third terminal, execute: \"ps aux|grep [a]rt -c " << config_file->getFileName() << "\" and note the PID of the art process." << std::endl
398  << "Finally, return to this window and enter the pid: " << std::endl;
399  std::cin >> pid;
400  }
401  *pid_out = pid;
402 
403  TLOG(TLVL_INFO) << "PID of new art process is " << pid;
404  {
405  std::unique_lock<std::mutex> lk(art_process_mutex_);
406  art_processes_.insert(pid);
407  }
408  siginfo_t status;
409  auto sts = waitid(P_PID, pid, &status, WEXITED);
410  TLOG(TLVL_INFO) << "Removing PID " << pid << " from process list";
411  {
412  std::unique_lock<std::mutex> lk(art_process_mutex_);
413  art_processes_.erase(pid);
414  }
415  if (sts < 0)
416  {
417  TLOG(TLVL_WARNING) << "Error occurred in waitid for art process " << pid << ": " << errno << " (" << strerror(errno) << ").";
418  }
419  else if (status.si_code == CLD_EXITED && status.si_status == 0)
420  {
421  TLOG(TLVL_INFO) << "art process " << pid << " exited normally, " << (restart_art_ ? "restarting" : "not restarting");
422  }
423  else
424  {
425  auto art_lifetime = TimeUtils::GetElapsedTime(start_time);
426  if (art_lifetime < minimum_art_lifetime_s_) restart_art_ = false;
427 
428  auto exit_type = "exited with status code";
429  switch (status.si_code)
430  {
431  case CLD_DUMPED:
432  case CLD_KILLED:
433  exit_type = "was killed with signal";
434  break;
435  case CLD_EXITED:
436  default:
437  break;
438  }
439 
440  TLOG((restart_art_ ? TLVL_WARNING : TLVL_ERROR))
441  << "art process " << pid << " " << exit_type << " " << status.si_status
442  << (status.si_code == CLD_DUMPED ? " (core dumped)" : "")
443  << " after running for " << std::setprecision(2) << std::fixed << art_lifetime << " seconds, "
444  << (restart_art_ ? "restarting" : "not restarting");
445  }
446  } while (restart_art_);
447 }
448 
449 void artdaq::SharedMemoryEventManager::StartArt()
450 {
451  restart_art_ = always_restart_art_;
452  if (num_art_processes_ == 0) return;
453  for (size_t ii = 0; ii < num_art_processes_; ++ii)
454  {
455  StartArtProcess(current_art_pset_);
456  }
457 }
458 
459 pid_t artdaq::SharedMemoryEventManager::StartArtProcess(fhicl::ParameterSet pset)
460 {
461  static std::mutex start_art_mutex;
462  std::unique_lock<std::mutex> lk(start_art_mutex);
463  //TraceLock lk(start_art_mutex, 15, "StartArtLock");
464  restart_art_ = always_restart_art_;
465  auto initialCount = GetAttachedCount();
466  auto startTime = std::chrono::steady_clock::now();
467 
468  if (pset != current_art_pset_ || !current_art_config_file_)
469  {
470  current_art_pset_ = pset;
471  current_art_config_file_ = std::make_shared<art_config_file>(pset/*, GetKey(), GetBroadcastKey()*/);
472  }
473  std::shared_ptr<std::atomic<pid_t>> pid(new std::atomic<pid_t>(-1));
474  boost::thread thread([&] { RunArt(current_art_config_file_, pid); });
475  thread.detach();
476 
477  auto currentCount = GetAttachedCount() - initialCount;
478  while ((currentCount < 1 || *pid <= 0) && (TimeUtils::GetElapsedTime(startTime) < 5 || manual_art_))
479  {
480  usleep(10000);
481  currentCount = GetAttachedCount() - initialCount;
482  }
483  if ((currentCount < 1 || *pid <= 0) && manual_art_)
484  {
485  TLOG(TLVL_WARNING) << "Manually-started art process has not connected to shared memory or has bad PID: connected:" << currentCount << ", PID:" << *pid;
486  return 0;
487  }
488  else if (currentCount < 1 || *pid <= 0)
489  {
490  TLOG(TLVL_WARNING) << "art process has not started after 5s. Check art configuration!"
491  << " (pid=" << *pid << ", attachedCount=" << currentCount << ")";
492  return 0;
493  }
494  else
495  {
496  TLOG(TLVL_INFO) << std::setw(4) << std::fixed << "art initialization took "
497  << TimeUtils::GetElapsedTime(startTime) << " seconds.";
498 
499  return *pid;
500  }
501 }
502 
503 void artdaq::SharedMemoryEventManager::ShutdownArtProcesses(std::set<pid_t>& pids)
504 {
505  restart_art_ = false;
506  //current_art_config_file_ = nullptr;
507  //current_art_pset_ = fhicl::ParameterSet();
508 
509  auto check_pids = [&](bool print) {
510  std::unique_lock<std::mutex> lk(art_process_mutex_);
511  for (auto pid = pids.begin(); pid != pids.end();)
512  {
513  // 08-May-2018, KAB: protect against killing invalid PIDS
514 
515  if (*pid <= 0)
516  {
517  TLOG(TLVL_WARNING) << "Removing an invalid PID (" << *pid
518  << ") from the shutdown list.";
519  pid = pids.erase(pid);
520  }
521  else if (kill(*pid, 0) < 0)
522  {
523  pid = pids.erase(pid);
524  }
525  else
526  {
527  if (print) std::cout << *pid << " ";
528  ++pid;
529  }
530  }
531  };
532  auto count_pids = [&]() {
533  std::unique_lock<std::mutex> lk(art_process_mutex_);
534  return pids.size();
535  };
536  check_pids(false);
537  if (count_pids() == 0)
538  {
539  TLOG(14) << "All art processes already exited, nothing to do.";
540  usleep(1000);
541  return;
542  }
543 
544  if (!manual_art_)
545  {
546  {
547  TLOG(TLVL_TRACE) << "Gently informing art processes that it is time to shut down";
548  std::unique_lock<std::mutex> lk(art_process_mutex_);
549  for (auto pid : pids)
550  {
551  TLOG(TLVL_TRACE) << "Sending SIGQUIT to pid " << pid;
552  kill(pid, SIGQUIT);
553  }
554  }
555 
556  int graceful_wait_ms = 5000;
557  int int_wait_ms = 1000;
558 
559  TLOG(TLVL_TRACE) << "Waiting up to " << graceful_wait_ms << " ms for all art processes to exit gracefully";
560  for (int ii = 0; ii < graceful_wait_ms; ++ii)
561  {
562  usleep(1000);
563 
564  check_pids(false);
565  if (count_pids() == 0)
566  {
567  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
568  return;
569  }
570  }
571 
572  {
573  TLOG(TLVL_TRACE) << "Insisting that the art processes shut down";
574  std::unique_lock<std::mutex> lk(art_process_mutex_);
575  for (auto pid : pids)
576  {
577  kill(pid, SIGINT);
578  }
579  }
580 
581  TLOG(TLVL_TRACE) << "Waiting up to " << int_wait_ms << " ms for all art processes to exit";
582  for (int ii = graceful_wait_ms; ii < graceful_wait_ms + int_wait_ms; ++ii)
583  {
584  usleep(1000);
585 
586  check_pids(false);
587 
588  if (count_pids() == 0)
589  {
590  TLOG(TLVL_TRACE) << "All art processes exited after " << ii << " ms.";
591  return;
592  }
593  }
594 
595  TLOG(TLVL_TRACE) << "Killing remaining art processes with extreme prejudice";
596  while (count_pids() > 0)
597  {
598  {
599  std::unique_lock<std::mutex> lk(art_process_mutex_);
600  kill(*pids.begin(), SIGKILL);
601  usleep(1000);
602  }
603  check_pids(false);
604  }
605  }
606  else
607  {
608  std::cout << "Please shut down all art processes, then hit return/enter" << std::endl;
609  while (count_pids() > 0)
610  {
611  std::cout << "The following PIDs are running: ";
612  check_pids(true);
613  std::cout << std::endl;
614  std::string ignored;
615  std::cin >> ignored;
616  }
617  }
618 }
619 
620 void artdaq::SharedMemoryEventManager::ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun, int n_art_processes)
621 {
622  TLOG(TLVL_DEBUG) << "ReconfigureArt BEGIN";
623  if (restart_art_ || !always_restart_art_) // Art is running
624  {
625  endOfData();
626  }
627  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
628  {
629  broadcasts_.MarkBufferEmpty(ii, true);
630  }
631  if (newRun == 0) newRun = run_id_ + 1;
632 
633  if (art_pset != current_art_pset_ || !current_art_config_file_)
634  {
635  current_art_pset_ = art_pset;
636  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
637  }
638 
639  if (n_art_processes != -1)
640  {
641  TLOG(TLVL_INFO) << "Setting number of art processes to " << n_art_processes;
642  num_art_processes_ = n_art_processes;
643  }
644  startRun(newRun);
645  TLOG(TLVL_DEBUG) << "ReconfigureArt END";
646 }
647 
648 bool artdaq::SharedMemoryEventManager::endOfData()
649 {
650  running_ = false;
651  init_fragment_.reset(nullptr);
652  TLOG(TLVL_DEBUG) << "SharedMemoryEventManager::endOfData";
653  restart_art_ = false;
654 
655  size_t initialStoreSize = GetIncompleteEventCount();
656  TLOG(TLVL_DEBUG) << "endOfData: Flushing " << initialStoreSize
657  << " stale events from the SharedMemoryEventManager.";
658  int counter = initialStoreSize;
659  while (active_buffers_.size() > 0 && counter > 0)
660  {
661  complete_buffer_(*active_buffers_.begin());
662  counter--;
663  }
664  TLOG(TLVL_DEBUG) << "endOfData: Done flushing, there are now " << GetIncompleteEventCount()
665  << " stale events in the SharedMemoryEventManager.";
666 
667  TLOG(TLVL_DEBUG) << "Waiting for " << (ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_))) << " outstanding buffers...";
668  auto start = std::chrono::steady_clock::now();
669  auto lastReadCount = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
670  auto end_of_data_wait_us = art_event_processing_time_us_ * (lastReadCount > 0 ? lastReadCount : 1);//size();
671 
672  auto outstanding_buffer_wait_time = art_event_processing_time_us_ > 100000 ? 100000 : art_event_processing_time_us_;
673 
674  // We will wait until no buffer has been read for the end of data wait seconds, or no art processes are left.
675  while (lastReadCount > 0 && (end_of_data_wait_us == 0 || TimeUtils::GetElapsedTimeMicroseconds(start) < end_of_data_wait_us) && get_art_process_count_() > 0)
676  {
677  auto temp = ReadReadyCount() + (size() - WriteReadyCount(overwrite_mode_));
678  if (temp != lastReadCount)
679  {
680  TLOG(TLVL_TRACE) << "Waiting for " << temp << " outstanding buffers...";
681  lastReadCount = temp;
682  start = std::chrono::steady_clock::now();
683  }
684  if (lastReadCount > 0)
685  {
686  TRACE(19, "About to sleep %lu us - lastReadCount=%lu size=%lu end_of_data_wait_us=%lu", outstanding_buffer_wait_time, lastReadCount, size(), end_of_data_wait_us);
687  usleep(outstanding_buffer_wait_time);
688  }
689  }
690 
691  TLOG(TLVL_DEBUG) << "endOfData: After wait for outstanding buffers. Still outstanding: " << lastReadCount << ", time waited: "
692  << TimeUtils::GetElapsedTime(start) << " s / " << (end_of_data_wait_us / 1000000.0) << " s, art process count: " << get_art_process_count_();
693 
694  TLOG(TLVL_DEBUG) << "endOfData: Broadcasting EndOfData Fragment";
695  FragmentPtr outFrag = Fragment::eodFrag(GetBufferCount());
696  bool success = broadcastFragment_(std::move(outFrag), outFrag);
697  if (!success)
698  {
699  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers to make room for EndOfData Fragment";
700  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
701  {
702  broadcasts_.MarkBufferEmpty(ii, true);
703  }
704  broadcastFragment_(std::move(outFrag), outFrag);
705  }
706  auto endOfDataProcessingStart = std::chrono::steady_clock::now();
707 
708  if (get_art_process_count_() > 0)
709  {
710  TLOG(TLVL_DEBUG) << "Allowing " << get_art_process_count_() << " art processes the chance to end gracefully";
711  if (end_of_data_wait_us == 0)
712  {
713  TLOG(TLVL_DEBUG) << "Expected art event processing time not specified. Waiting up to 100s for art to end gracefully.";
714  end_of_data_wait_us = 100 * 1000000;
715  }
716 
717  auto sleep_count = (end_of_data_wait_us / 10000) + 1;
718  for (size_t ii = 0; ii < sleep_count; ++ii)
719  {
720  usleep(10000);
721  if (get_art_process_count_() == 0) break;
722  }
723  }
724 
725  while (get_art_process_count_() > 0)
726  {
727  TLOG(TLVL_DEBUG) << "There are " << get_art_process_count_() << " art processes remaining. Proceeding to shutdown.";
728 
729  ShutdownArtProcesses(art_processes_);
730  }
731  TLOG(TLVL_DEBUG) << "It took " << TimeUtils::GetElapsedTime(endOfDataProcessingStart) << " s for all art processes to close after sending EndOfData Fragment";
732 
733  ResetAttachedCount();
734 
735  TLOG(TLVL_DEBUG) << "endOfData: Clearing buffers";
736  for (size_t ii = 0; ii < size(); ++ii)
737  {
738  MarkBufferEmpty(ii, true);
739  }
740  // ELF 06/04/2018: Cannot clear broadcasts here, we want the EndOfDataFragment to persist until it's time to start art again...
741  // TLOG(TLVL_TRACE) << "endOfData: Clearing broadcast buffers";
742  // for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
743  // {
744  // broadcasts_.MarkBufferEmpty(ii, true);
745  // }
746  released_incomplete_events_.clear();
747 
748  TLOG(TLVL_DEBUG) << "endOfData: Shutting down RequestSender";
749  requests_.reset(nullptr);
750 
751  TLOG(TLVL_DEBUG) << "endOfData END";
752  TLOG(TLVL_INFO) << "EndOfData Complete. There were " << GetLastSeenBufferID() << " buffers processed.";
753  return true;
754 }
755 
756 void artdaq::SharedMemoryEventManager::startRun(run_id_t runID)
757 {
758  running_ = true;
759  init_fragment_.reset(nullptr);
760  statsHelper_.resetStatistics();
761  TLOG(TLVL_TRACE) << "startRun: Clearing broadcast buffers";
762  for (size_t ii = 0; ii < broadcasts_.size(); ++ii)
763  {
764  broadcasts_.MarkBufferEmpty(ii, true);
765  }
766  StartArt();
767  run_id_ = runID;
768  {
769  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
770  subrun_event_map_.clear();
771  subrun_event_map_[0] = 1;
772  }
773  run_event_count_ = 0;
774  run_incomplete_event_count_ = 0;
775  requests_.reset(new RequestSender(data_pset_));
776  if (requests_)
777  {
778  requests_->SetRunNumber(static_cast<uint32_t>(run_id_));
779  requests_->SendRoutingToken(queue_size_, run_id_);
780  }
781  TLOG(TLVL_DEBUG) << "Starting run " << run_id_
782  << ", max queue size = "
783  << queue_size_
784  << ", queue size = "
785  << GetLockedBufferCount();
786  if (metricMan)
787  {
788  metricMan->sendMetric("Run Number", static_cast<unsigned long>(run_id_), "Run", 1, MetricMode::LastPoint);
789  }
790 }
791 
792 bool artdaq::SharedMemoryEventManager::endRun()
793 {
794  TLOG(TLVL_INFO) << "Ending run " << run_id_;
795  FragmentPtr endOfRunFrag(new Fragment(static_cast<size_t>(ceil(sizeof(my_rank) /
796  static_cast<double>(sizeof(Fragment::value_type))))));
797 
798  TLOG(TLVL_DEBUG) << "Broadcasting EndOfRun Fragment";
799  endOfRunFrag->setSystemType(Fragment::EndOfRunFragmentType);
800  *endOfRunFrag->dataBegin() = my_rank;
801  broadcastFragment_(std::move(endOfRunFrag), endOfRunFrag);
802 
803  TLOG(TLVL_INFO) << "Run " << run_id_ << " has ended. There were " << run_event_count_ << " events in this run.";
804  run_event_count_ = 0;
805  run_incomplete_event_count_ = 0;
806  oversize_fragment_count_ = 0;
807  {
808  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
809  subrun_event_map_.clear();
810  subrun_event_map_[0] = 1;
811  }
812  return true;
813 }
814 
815 void artdaq::SharedMemoryEventManager::rolloverSubrun(Fragment::sequence_id_t boundary, subrun_id_t subrun)
816 {
817  // Generated EndOfSubrun Fragments have Sequence ID 0 and should be ignored
818  if (boundary == 0 || boundary == Fragment::InvalidSequenceID) return;
819 
820  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
821 
822  TLOG(TLVL_INFO) << "Will roll over to subrun " << subrun << " when I reach Sequence ID " << boundary;
823  subrun_event_map_[boundary] = subrun;
824  while (subrun_event_map_.size() > max_subrun_event_map_length_)
825  {
826  subrun_event_map_.erase(subrun_event_map_.begin());
827  }
828  }
829 
830 void artdaq::SharedMemoryEventManager::rolloverSubrun()
831 {
832  Fragment::sequence_id_t seqID = 0;
833  subrun_id_t subrun = 0;
834  {
835  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
836  for (auto& it : subrun_event_map_)
837  {
838  if (it.first >= seqID) seqID = it.first + 1;
839  if (it.second >= subrun) subrun = it.second + 1;
840  }
841  }
842  rolloverSubrun(seqID, subrun);
843 }
844 
845 void artdaq::SharedMemoryEventManager::sendMetrics()
846 {
847  if (metricMan)
848  {
849  metricMan->sendMetric("Incomplete Event Count", GetIncompleteEventCount(), "events", 1, MetricMode::LastPoint);
850  metricMan->sendMetric("Pending Event Count", GetPendingEventCount(), "events", 1, MetricMode::LastPoint);
851  }
852 
853  if (incomplete_event_report_interval_ms_ > 0 && GetLockedBufferCount())
854  {
855  if (TimeUtils::GetElapsedTimeMilliseconds(last_incomplete_event_report_time_) < static_cast<size_t>(incomplete_event_report_interval_ms_))
856  return;
857 
858  last_incomplete_event_report_time_ = std::chrono::steady_clock::now();
859  std::ostringstream oss;
860  oss << "Incomplete Events (" << num_fragments_per_event_ << "): ";
861  for (auto& ev : active_buffers_)
862  {
863  auto hdr = getEventHeader_(ev);
864  oss << hdr->sequence_id << " (" << GetFragmentCount(hdr->sequence_id) << "), ";
865  }
866  TLOG(TLVL_DEBUG) << oss.str();
867  }
868 }
869 
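// broadcastFragment_ places system-level Fragments (Init, EndOfRun, EndOfData) into the separate
// broadcasts_ shared memory segment so that every attached art process can see them, independent
// of the normal event buffers.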
870 bool artdaq::SharedMemoryEventManager::broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag)
871 {
872  TLOG(TLVL_DEBUG) << "Broadcasting Fragment with seqID=" << frag->sequenceID() << ", type " << detail::RawFragmentHeader::SystemTypeToString(frag->type()) << ", size=" << frag->sizeBytes() << "B.";
873  auto buffer = broadcasts_.GetBufferForWriting(false);
874  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer 1st buffer=" << buffer;
875  auto start_time = std::chrono::steady_clock::now();
876  while (buffer == -1 && TimeUtils::GetElapsedTimeMilliseconds(start_time) < static_cast<size_t>(broadcast_timeout_ms_))
877  {
878  usleep(10000);
879  buffer = broadcasts_.GetBufferForWriting(false);
880  }
881  TLOG(TLVL_DEBUG) << "broadcastFragment_: after getting buffer w/timeout, buffer=" << buffer << ", elapsed time=" << TimeUtils::GetElapsedTime(start_time) << " s.";
882  if (buffer == -1)
883  {
884  TLOG(TLVL_ERROR) << "Broadcast of fragment type " << frag->typeString() << " failed due to timeout waiting for buffer!";
885  outFrag.swap(frag);
886  return false;
887  }
888 
889  TLOG(TLVL_DEBUG) << "broadcastFragment_: Filling in RawEventHeader";
890  auto hdr = reinterpret_cast<detail::RawEventHeader*>(broadcasts_.GetBufferStart(buffer));
891  hdr->run_id = run_id_;
892  hdr->subrun_id = GetSubrunForSequenceID(frag->sequenceID());
893  hdr->sequence_id = frag->sequenceID();
894  hdr->is_complete = true;
895  broadcasts_.IncrementWritePos(buffer, sizeof(detail::RawEventHeader));
896 
897  TLOG(TLVL_DEBUG) << "broadcastFragment_ before Write calls";
898  broadcasts_.Write(buffer, frag->headerAddress(), frag->size() * sizeof(RawDataType));
899 
900  TLOG(TLVL_DEBUG) << "broadcastFragment_ Marking buffer full";
901  broadcasts_.MarkBufferFull(buffer, -1);
902  outFrag.swap(frag);
903  TLOG(TLVL_DEBUG) << "broadcastFragment_ Complete";
904  return true;
905 }
906 
907 artdaq::detail::RawEventHeader* artdaq::SharedMemoryEventManager::getEventHeader_(int buffer)
908 {
909  return reinterpret_cast<detail::RawEventHeader*>(GetBufferStart(buffer));
910 }
911 
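// subrun_event_map_ maps a starting sequence ID (inclusive) to a subrun number. For example,
// entries {0 -> 1, 1000 -> 2} mean sequence IDs 0-999 belong to subrun 1 and IDs >= 1000 to
// subrun 2 (boundaries are added by rolloverSubrun).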
912 artdaq::SharedMemoryEventManager::subrun_id_t artdaq::SharedMemoryEventManager::GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
913 {
914  std::unique_lock<std::mutex> lk(subrun_event_map_mutex_);
915 
916  TLOG(TLVL_TRACE) << "GetSubrunForSequenceID BEGIN map size = " << subrun_event_map_.size();
917  auto it = subrun_event_map_.begin();
918  subrun_id_t subrun = 1;
919 
920  while (it != subrun_event_map_.end() && it->first <= seqID)
921  {
922  TLOG(TLVL_TRACE) << "Map has sequence ID " << it->first << ", subrun " << it->second << " (looking for <= " << seqID << ")";
923  subrun = it->second;
924  ++it;
925  }
926 
927  TLOG(TLVL_DEBUG) << "GetSubrunForSequenceID returning subrun " << subrun << " for sequence ID " << seqID;
928  return subrun;
929 }
930 
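// Return convention used throughout this file: a value >= 0 is a shared memory buffer number;
// -1 means no buffer is available (or none exists and create_new is false); -2 means this
// sequence ID was already released to art as an incomplete event and must not be reopened.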
931 int artdaq::SharedMemoryEventManager::getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp)
932 {
933  TLOG(14) << "getBufferForSequenceID " << seqID << " BEGIN";
934  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
935 
936  TLOG(14) << "getBufferForSequenceID obtained sequence_id_mutex for seqid=" << seqID;
937 
938  auto buffers = GetBuffersOwnedByManager();
939  for (auto& buf : buffers)
940  {
941  auto hdr = getEventHeader_(buf);
942  if (hdr->sequence_id == seqID)
943  {
944  TLOG(14) << "getBufferForSequenceID " << seqID << " returning " << buf;
945  return buf;
946  }
947  }
948 
949 #if !ART_SUPPORTS_DUPLICATE_EVENTS
950  if (released_incomplete_events_.count(seqID))
951  {
952  TLOG(TLVL_ERROR) << "Event " << seqID << " has already been marked \"Incomplete\" and sent to art!";
953  return -2;
954  }
955 #endif
956 
957  if (!create_new) return -1;
958 
959  check_pending_buffers_(lk);
960  int new_buffer = GetBufferForWriting(false);
961 
962  if (new_buffer == -1)
963  {
964  new_buffer = GetBufferForWriting(overwrite_mode_);
965  }
966 
967  if (new_buffer == -1) return -1;
968  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtaining buffer_mutexes lock for buffer " << new_buffer;
969  std::unique_lock<std::mutex> buffer_lk(buffer_mutexes_[new_buffer]);
970  TLOG(TLVL_BUFLCK) << "getBufferForSequenceID_: obtained buffer_mutexes lock for buffer " << new_buffer;
971  //TraceLock(buffer_mutexes_[new_buffer], 34, "getBufferForSequenceID");
972  auto hdr = getEventHeader_(new_buffer);
973  hdr->is_complete = false;
974  hdr->run_id = run_id_;
975  hdr->subrun_id = GetSubrunForSequenceID(seqID);
976  hdr->event_id = use_sequence_id_for_event_number_ ? static_cast<uint32_t>(seqID) : static_cast<uint32_t>(timestamp);
977  hdr->sequence_id = seqID;
978  buffer_writes_pending_[new_buffer] = 0;
979  IncrementWritePos(new_buffer, sizeof(detail::RawEventHeader));
980  SetMFIteration("Sequence ID " + std::to_string(seqID));
981 
982  TLOG(TLVL_BUFFER) << "getBufferForSequenceID placing " << new_buffer << " to active.";
983  active_buffers_.insert(new_buffer);
984  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
985  << size() << ","
986  << ReadReadyCount() << ","
987  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
988  << WriteReadyCount(false) << ","
989  << pending_buffers_.size() << ","
990  << active_buffers_.size() << ")";
991 
992  if (requests_)
993  {
994  if (timestamp != Fragment::InvalidTimestamp)
995  {
996  requests_->AddRequest(seqID, timestamp);
997  }
998  // 17-Aug-2018, KAB: only call SendRequest if AddRequest was *not* called so that we
999  // don't double-send requests, but still get the benefit of calling SendRequest 'often'.
1000  else
1001  {
1002  requests_->SendRequest();
1003  }
1004  }
1005  TLOG(14) << "getBufferForSequenceID " << seqID << " returning newly initialized buffer " << new_buffer;
1006  return new_buffer;
1007 }
1008 
1009 bool artdaq::SharedMemoryEventManager::hasFragments_(int buffer)
1010 {
1011  if (buffer == -1) return true;
1012  if (!CheckBuffer(buffer, BufferSemaphoreFlags::Writing))
1013  {
1014  return true;
1015  }
1016  ResetReadPos(buffer);
1017  IncrementReadPos(buffer, sizeof(detail::RawEventHeader));
1018  return MoreDataInBuffer(buffer);
1019 }
1020 
1021 void artdaq::SharedMemoryEventManager::complete_buffer_(int buffer)
1022 {
1023  auto hdr = getEventHeader_(buffer);
1024  if (hdr->is_complete)
1025  {
1026  TLOG(TLVL_DEBUG) << "complete_buffer_: This fragment completes event " << hdr->sequence_id << ".";
1027 
1028  {
1029  TLOG(TLVL_BUFFER) << "complete_buffer_ moving " << buffer << " from active to pending.";
1030 
1031  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtaining sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1032  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1033  TLOG(TLVL_BUFLCK) << "complete_buffer_: obtained sequence_id_mutex lock for seqid=" << hdr->sequence_id;
1034  active_buffers_.erase(buffer);
1035  pending_buffers_.insert(buffer);
1036 
1037  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1038  << size() << ","
1039  << ReadReadyCount() << ","
1040  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1041  << WriteReadyCount(false) << ","
1042  << pending_buffers_.size() << ","
1043  << active_buffers_.size() << ")";
1044  }
1045  if (requests_)
1046  {
1047  requests_->RemoveRequest(hdr->sequence_id);
1048  }
1049  }
1050  CheckPendingBuffers();
1051 }
1052 
1053 bool artdaq::SharedMemoryEventManager::bufferComparator(int bufA, int bufB)
1054 {
1055  return getEventHeader_(bufA)->sequence_id < getEventHeader_(bufB)->sequence_id;
1056 }
1057 
1058 void artdaq::SharedMemoryEventManager::CheckPendingBuffers()
1059 {
1060  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtaining sequence_id_mutex_";
1061  std::unique_lock<std::mutex> lk(sequence_id_mutex_);
1062  TLOG(TLVL_BUFLCK) << "CheckPendingBuffers: Obtained sequence_id_mutex_";
1063  check_pending_buffers_(lk);
1064 }
1065 
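// Buffer bookkeeping: buffers move from active_buffers_ (still being filled) to pending_buffers_
// (complete, or stale and force-released), and are then marked Full in sequence-ID order so art
// reads events in order. Routing tokens are replenished here as buffers become available again.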
1066 void artdaq::SharedMemoryEventManager::check_pending_buffers_(std::unique_lock<std::mutex> const& lock)
1067 {
1068  TLOG(TLVL_TRACE) << "check_pending_buffers_ BEGIN Locked=" << std::boolalpha << lock.owns_lock();
1069 
1070  auto buffers = GetBuffersOwnedByManager();
1071  for (auto buf : buffers)
1072  {
1073  if (ResetBuffer(buf) && !pending_buffers_.count(buf))
1074  {
1075  TLOG(15) << "check_pending_buffers_ Incomplete buffer detected, buf=" << buf << " active_buffers_.count(buf)=" << active_buffers_.count(buf) << " buffer_writes_pending_[buf]=" << buffer_writes_pending_[buf].load();
1076  auto hdr = getEventHeader_(buf);
1077  if (active_buffers_.count(buf) && (buffer_writes_pending_[buf].load() == 0 || !running_))
1078  {
1079  if (requests_)
1080  {
1081  requests_->RemoveRequest(hdr->sequence_id);
1082  }
1083  TLOG(TLVL_BUFFER) << "check_pending_buffers_ moving buffer " << buf << " from active to pending";
1084  active_buffers_.erase(buf);
1085  pending_buffers_.insert(buf);
1086  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1087  << size() << ","
1088  << ReadReadyCount() << ","
1089  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1090  << WriteReadyCount(false) << ","
1091  << pending_buffers_.size() << ","
1092  << active_buffers_.size() << ")";
1093 
1094  run_incomplete_event_count_++;
1095  if (metricMan) metricMan->sendMetric("Incomplete Event Rate", 1, "events/s", 3, MetricMode::Rate);
1096  if (!released_incomplete_events_.count(hdr->sequence_id))
1097  {
1098  released_incomplete_events_[hdr->sequence_id] = num_fragments_per_event_ - GetFragmentCountInBuffer(buf);
1099  }
1100  else
1101  {
1102  released_incomplete_events_[hdr->sequence_id] -= GetFragmentCountInBuffer(buf);
1103  }
1104  TLOG(TLVL_WARNING) << "Active event " << hdr->sequence_id << " is stale. Scheduling release of incomplete event (missing " << released_incomplete_events_[hdr->sequence_id] << " Fragments) to art.";
1105  }
1106  }
1107  }
1108 
1109  std::list<int> sorted_buffers(pending_buffers_.begin(), pending_buffers_.end());
1110  sorted_buffers.sort([this](int a, int b) { return bufferComparator(a, b); });
1111 
1112  auto counter = 0;
1113  double eventSize = 0;
1114  for (auto buf : sorted_buffers)
1115  {
1116  auto hdr = getEventHeader_(buf);
1117  auto thisEventSize = BufferDataSize(buf);
1118 
1119  TLOG(TLVL_DEBUG) << "Releasing event " << std::to_string(hdr->sequence_id) << " in buffer " << buf << " to art, "
1120  << "event_size=" << thisEventSize << ", buffer_size=" << BufferSize();
1121  statsHelper_.addSample(EVENTS_RELEASED_STAT_KEY, thisEventSize);
1122 
1123  TLOG(TLVL_BUFFER) << "check_pending_buffers_ removing buffer " << buf << " moving from pending to full";
1124  MarkBufferFull(buf);
1125  run_event_count_++;
1126  counter++;
1127  eventSize += thisEventSize;
1128  pending_buffers_.erase(buf);
1129  TLOG(TLVL_BUFFER) << "Buffer occupancy now (total,full,reading,empty,pending,active)=("
1130  << size() << ","
1131  << ReadReadyCount() << ","
1132  << WriteReadyCount(true) - WriteReadyCount(false) - ReadReadyCount() << ","
1133  << WriteReadyCount(false) << ","
1134  << pending_buffers_.size() << ","
1135  << active_buffers_.size() << ")";
1136  }
1137 
1138  if (requests_)
1139  {
1140  TLOG(TLVL_TRACE) << "Sent tokens: " << requests_->GetSentTokenCount() << ", Event count: " << run_event_count_;
1141  auto outstanding_tokens = requests_->GetSentTokenCount() - run_event_count_;
1142  auto available_buffers = WriteReadyCount(overwrite_mode_);
1143 
1144  TLOG(TLVL_TRACE) << "check_pending_buffers_: outstanding_tokens: " << outstanding_tokens << ", available_buffers: " << available_buffers
1145  << ", tokens_to_send: " << available_buffers - outstanding_tokens;
1146 
1147  if (available_buffers > outstanding_tokens)
1148  {
1149  auto tokens_to_send = available_buffers - outstanding_tokens;
1150 
1151  while (tokens_to_send > 0)
1152  {
1153  TLOG(35) << "check_pending_buffers_: Sending a Routing Token";
1154  requests_->SendRoutingToken(1, run_id_);
1155  tokens_to_send--;
1156  }
1157  }
1158  }
1159 
1160  if (statsHelper_.readyToReport()) {
1161  std::string statString = buildStatisticsString_();
1162  TLOG(TLVL_INFO) << statString;
1163  }
1164 
1165  metric_data_.event_count += counter;
1166  metric_data_.event_size += eventSize;
1167 
1168  if (metricMan && TimeUtils::GetElapsedTimeMilliseconds(last_shmem_buffer_metric_update_) > 500) // Limit to 2 Hz updates
1169  {
1170  TLOG(TLVL_TRACE) << "check_pending_buffers_: Sending Metrics";
1171  metricMan->sendMetric("Event Rate", metric_data_.event_count, "Events/s", 1, MetricMode::Rate);
1172  if (metric_data_.event_count > 0) metricMan->sendMetric("Average Event Size", metric_data_.event_size / metric_data_.event_count, "Bytes", 1, MetricMode::Average);
1173  metric_data_ = MetricData();
1174 
1175  metricMan->sendMetric("Events Released to art this run", run_event_count_, "Events", 1, MetricMode::LastPoint);
1176  metricMan->sendMetric("Incomplete Events Released to art this run", run_incomplete_event_count_, "Events", 1, MetricMode::LastPoint);
1177  if (requests_) metricMan->sendMetric("Tokens sent", requests_->GetSentTokenCount(), "Tokens", 2, MetricMode::LastPoint);
1178 
1179  auto bufferReport = GetBufferReport();
1180  int full = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Full; });
1181  int empty = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Empty; });
1182  int writing = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Writing; });
1183  int reading = std::count_if(bufferReport.begin(), bufferReport.end(), [](std::pair<int, BufferSemaphoreFlags> p) {return p.second == BufferSemaphoreFlags::Reading; });
1184  auto total = size();
1185  TLOG(TLVL_DEBUG) << "Buffer usage: full=" << full << ", empty=" << empty << ", writing=" << writing << ", reading=" << reading << ", total=" << total;
1186 
1187  metricMan->sendMetric("Shared Memory Full Buffers", full, "buffers", 2, MetricMode::LastPoint);
1188  metricMan->sendMetric("Shared Memory Available Buffers", empty, "buffers", 2, MetricMode::LastPoint);
1189  metricMan->sendMetric("Shared Memory Pending Buffers", writing, "buffers", 2, MetricMode::LastPoint);
1190  metricMan->sendMetric("Shared Memory Reading Buffers", reading, "buffers", 2, MetricMode::LastPoint);
1191  if (total > 0)
1192  {
1193  metricMan->sendMetric("Shared Memory Full %", full * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1194  metricMan->sendMetric("Shared Memory Available %", empty * 100 / static_cast<double>(total), "%", 2, MetricMode::LastPoint);
1195  }
1196 
1197  last_shmem_buffer_metric_update_ = std::chrono::steady_clock::now();
1198  }
1199  TLOG(TLVL_TRACE) << "check_pending_buffers_ END";
1200 }
1201 
1202 void artdaq::SharedMemoryEventManager::send_init_frag_()
1203 {
1204  if (init_fragment_ != nullptr)
1205  {
1206  TLOG(TLVL_INFO) << "Broadcasting init fragment to all art subprocesses...";
1207 
1208 #if 0
1209  std::string fileName = "receiveInitMessage_" + std::to_string(my_rank) + ".bin";
1210  std::fstream ostream(fileName.c_str(), std::ios::out | std::ios::binary);
1211  ostream.write(reinterpret_cast<char*>(init_fragment_->dataBeginBytes()), init_fragment_->dataSizeBytes());
1212  ostream.close();
1213 #endif
1214 
1215  broadcastFragment_(std::move(init_fragment_), init_fragment_);
1216  TLOG(TLVL_TRACE) << "Init Fragment sent";
1217  }
1218  else if (send_init_fragments_)
1219  {
1220  TLOG(TLVL_WARNING) << "Cannot send init fragment because I haven't yet received one!";
1221  }
1222 }
1223 
1224 void artdaq::SharedMemoryEventManager::SetInitFragment(FragmentPtr frag)
1225 {
1226  if (!init_fragment_ || init_fragment_ == nullptr)
1227  {
1228  init_fragment_.swap(frag);
1229  send_init_frag_();
1230  }
1231 }
1232 
1233 void artdaq::SharedMemoryEventManager::UpdateArtConfiguration(fhicl::ParameterSet art_pset)
1234 {
1235  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration BEGIN";
1236  if (art_pset != current_art_pset_ || !current_art_config_file_)
1237  {
1238  current_art_pset_ = art_pset;
1239  current_art_config_file_ = std::make_shared<art_config_file>(art_pset/*, GetKey(), GetBroadcastKey()*/);
1240  }
1241  TLOG(TLVL_DEBUG) << "UpdateArtConfiguration END";
1242 }
1243 
1244 std::string artdaq::SharedMemoryEventManager::buildStatisticsString_() const {
1245  std::ostringstream oss;
1246  oss << app_name << " statistics:" << std::endl;
1247 
1248  artdaq::MonitoredQuantityPtr mqPtr =
1249  artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(EVENTS_RELEASED_STAT_KEY);
1250  if (mqPtr.get() != 0) {
1251  artdaq::MonitoredQuantityStats stats;
1252  mqPtr->getStats(stats);
1253  oss << " Event statistics: " << stats.recentSampleCount << " events released at " << stats.recentSampleRate
1254  << " events/sec, effective data rate = "
1255  << (stats.recentValueRate * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1256  << " MB/sec, monitor window = " << stats.recentDuration
1257  << " sec, min::max event size = " << (stats.recentValueMin * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1258  << "::" << (stats.recentValueMax * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0) << " MB" << std::endl;
1259  if (stats.recentSampleRate > 0.0) {
1260  oss << " Average time per event: ";
1261  oss << " elapsed time = " << (1.0 / stats.recentSampleRate) << " sec" << std::endl;
1262  }
1263  }
1264 
1265  mqPtr = artdaq::StatisticsCollection::getInstance().getMonitoredQuantity(FRAGMENTS_RECEIVED_STAT_KEY);
1266  if (mqPtr.get() != 0) {
1267  artdaq::MonitoredQuantityStats stats;
1268  mqPtr->getStats(stats);
1269  oss << " Fragment statistics: " << stats.recentSampleCount << " fragments received at " << stats.recentSampleRate
1270  << " fragments/sec, effective data rate = "
1271  << (stats.recentValueRate * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1272  << " MB/sec, monitor window = " << stats.recentDuration
1273  << " sec, min::max fragment size = " << (stats.recentValueMin * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0)
1274  << "::" << (stats.recentValueMax * sizeof(artdaq::RawDataType) / 1024.0 / 1024.0) << " MB" << std::endl;
1275  }
1276 
1277  oss << " Event counts: Run -- " << run_event_count_ << " Total, " << run_incomplete_event_count_ << " Incomplete."
1278  << " Subrun -- " << subrun_event_count_ << " Total, " << subrun_incomplete_event_count_ << " Incomplete. "
1279  << std::endl;
1280  return oss.str();
1281 }
1282 
1283 #if MESSAGEFACILITY_HEX_VERSION >= 0x20103
1284 FHICL_PROVIDE_ALLOWED_CONFIGURATION(artdaq::SharedMemoryEventManager)
1285 #endif