artdaq  v3_06_00
SharedMemoryEventManager.hh
1 #ifndef ARTDAQ_DAQRATE_SHAREDMEMORYEVENTMANAGER_HH
2 #define ARTDAQ_DAQRATE_SHAREDMEMORYEVENTMANAGER_HH
3 
4 #include "artdaq/DAQdata/Globals.hh" // Before trace.h gets included in ConcurrentQueue (from GlobalQueue)
5 
6 #include <sys/stat.h>
7 #include <deque>
8 #include <fstream>
9 #include <iomanip>
10 #include "artdaq/DAQrate/StatisticsHelper.hh"
11 #include <set>
12 #include "artdaq-core/Core/SharedMemoryManager.hh"
13 #include "artdaq-core/Data/RawEvent.hh"
14 #include "artdaq/DAQrate/RequestSender.hh"
15 #include "artdaq/DAQrate/detail/ArtConfig.hh"
16 #include "fhiclcpp/fwd.h"
17 #define ART_SUPPORTS_DUPLICATE_EVENTS 0
18 
19 namespace artdaq {
20 
25  {
26  public:
/// art_config_file Constructor.
///
/// Serializes the given ParameterSet into a temporary FHiCL file whose name embeds my_rank and a
/// microsecond timestamp, inside a per-partition directory under /tmp. If the file cannot be
/// opened there, a second directory whose name additionally embeds getuid() is tried. If that
/// also fails, an error is logged and the process exits with status 46.
///
/// \param ps ParameterSet whose to_string() output is written to the file. When it has no
///           "services.message" key, a services.message table built by
///           generateMessageFacilityConfiguration("art") is appended.
///
// NOTE(review): ps is taken by value although it is only read here; a const reference would avoid the copy.
// NOTE(review): exit(46) terminates the whole process from inside a constructor; callers cannot recover.
// NOTE(review): the TLOG text below misspells "attempts" as "attemps" (runtime string, left untouched here).
31  art_config_file(fhicl::ParameterSet ps/*, uint32_t shm_key, uint32_t broadcast_key*/)
32  : dir_name_("/tmp/partition_" + std::to_string(GetPartitionNumber()))
33  , file_name_(dir_name_ + "/artConfig_" + std::to_string(my_rank) + "_" + std::to_string(artdaq::TimeUtils::gettimeofday_us()) + ".fcl")
34  {
// The return value of mkdir is deliberately ignored; a real failure shows up when the file is opened.
35  mkdir(dir_name_.c_str(), 0777); // Allowed to fail if directory already exists
36 
37  std::ofstream of(file_name_, std::ofstream::trunc);
38  if (of.fail())
39  {
40  // Probably a permissions error...
// Second attempt: same scheme, but the directory name also carries the uid of the calling user.
41  dir_name_ = "/tmp/partition_" + std::to_string(GetPartitionNumber()) + "_" + std::to_string(getuid());
42  mkdir(dir_name_.c_str(), 0777); // Allowed to fail if directory already exists
43  file_name_ = dir_name_ + "/artConfig_" + std::to_string(my_rank) + "_" + std::to_string(artdaq::TimeUtils::gettimeofday_us()) + ".fcl";
44 
45  of.open(file_name_, std::ofstream::trunc);
46  if (of.fail())
47  {
48  TLOG(TLVL_ERROR) << "Failed to open configuration file after two attemps! ABORTING!";
49  exit(46);
50  }
51  }
// Main payload: the full art configuration as FHiCL text.
52  of << ps.to_string();
53 
54  //if (ps.has_key("services.NetMonTransportServiceInterface"))
55  //{
56  // of << " services.NetMonTransportServiceInterface.shared_memory_key: 0x" << std::hex << shm_key;
57  // of << " services.NetMonTransportServiceInterface.broadcast_shared_memory_key: 0x" << std::hex << broadcast_key;
58  // of << " services.NetMonTransportServiceInterface.rank: " << std::dec << my_rank;
59  //}
// Only supply a message-facility configuration when the caller has not provided one.
60  if (!ps.has_key("services.message"))
61  {
62  of << " services.message: { " << generateMessageFacilityConfiguration("art") << "} ";
63  }
64  //of << " source.shared_memory_key: 0x" << std::hex << shm_key;
65  //of << " source.broadcast_shared_memory_key: 0x" << std::hex << broadcast_key;
66  //of << " source.rank: " << std::dec << my_rank;
67  of.close();
68  }
70  {
71  remove(file_name_.c_str());
72  rmdir(dir_name_.c_str()); // Will only delete directory if no config files are left over
73  }
/// Get the path of the temporary file.
/// \return Copy of file_name_, i.e. the path that was last assigned in the constructor
78  std::string getFileName() const { return file_name_; }
79 
80  private:
81  std::string dir_name_;
82  std::string file_name_;
83  };
84 
88  class SharedMemoryEventManager : public SharedMemoryManager
89  {
90  public:
91  static const std::string FRAGMENTS_RECEIVED_STAT_KEY;
92  static const std::string EVENTS_RELEASED_STAT_KEY;
93 
94  typedef RawEvent::run_id_t run_id_t;
95  typedef RawEvent::subrun_id_t subrun_id_t;
96  typedef Fragment::sequence_id_t sequence_id_t;
97  typedef std::map<sequence_id_t, RawEvent_ptr> EventMap;
98 
/// Configuration of the SharedMemoryEventManager. May be used for parameter validation.
///
/// Each member declares one FHiCL parameter: its name, a description string and, where a trailing
/// argument is present, its default value. max_event_size_bytes, buffer_count,
/// max_fragment_size_bytes and expected_fragments_per_event are declared without a default
/// (presumably making them mandatory under validation; confirm against fhiclcpp).
102  struct Config
103  {
106  fhicl::Atom<size_t> max_event_size_bytes{ fhicl::Name{ "max_event_size_bytes"}, fhicl::Comment{"Maximum event size (all Fragments), in bytes"} };
// NOTE(review): the default here is the literal 5000000; the description of event_queue_wait_time says it
// feeds this default, but no such derivation is visible in this struct.
108  fhicl::Atom<size_t> stale_buffer_timeout_usec{ fhicl::Name{ "stale_buffer_timeout_usec"}, fhicl::Comment{"Maximum amount of time elapsed before a buffer is marked as abandoned. Time is reset each time an operation is performed on the buffer."}, 5000000 };
110  fhicl::Atom<bool> overwrite_mode{ fhicl::Name{ "overwrite_mode"}, fhicl::Comment{"Whether buffers are allowed to be overwritten when safe (state == Full or Reading)"}, false };
112  fhicl::Atom<bool> restart_crashed_art_processes{ fhicl::Name{"restart_crashed_art_processes"}, fhicl::Comment{"Whether to automatically restart art processes that fail for any reason"}, true };
// Default is computed from getpid(), so it differs from process to process.
114  fhicl::Atom<uint32_t> shared_memory_key{ fhicl::Name{ "shared_memory_key"}, fhicl::Comment{"Key to use for shared memory access"}, 0xBEE70000 + getpid() };
116  fhicl::Atom<size_t> buffer_count{ fhicl::Name{ "buffer_count"}, fhicl::Comment{"Number of events in the Shared Memory (incomplete + pending art)"} };
118  fhicl::Atom<size_t> max_subrun_lookup_table_size{fhicl::Name{"max_subrun_lookup_table_size"}, fhicl::Comment{"Maximum number of entries in the subrun rollover history"}, 100};
121  fhicl::Atom<size_t> max_fragment_size_bytes{ fhicl::Name{ "max_fragment_size_bytes"}, fhicl::Comment{" Maximum Fragment size, in bytes"} };
123  fhicl::Atom<size_t> event_queue_wait_time{ fhicl::Name{ "event_queue_wait_time"}, fhicl::Comment{"Amount of time (in seconds) an event can exist in shared memory before being released to art. Used as input to default parameter of \"stale_buffer_timeout_usec\"."}, 5 };
125  fhicl::Atom<bool> broadcast_mode{ fhicl::Name{ "broadcast_mode"}, fhicl::Comment{"When true, buffers are not marked Empty when read, but return to Full state. Buffers are overwritten in order received."}, false };
// NOTE(review): the description string misspells "processes" as "procceses".
127  fhicl::Atom<size_t> art_analyzer_count{ fhicl::Name{ "art_analyzer_count"}, fhicl::Comment{"Number of art procceses to start"}, 1 };
129  fhicl::Atom<size_t> expected_fragments_per_event{ fhicl::Name{ "expected_fragments_per_event"}, fhicl::Comment{"Number of Fragments to expect per event"} };
131  fhicl::Atom<int> maximum_oversize_fragment_count{ fhicl::Name{"maximum_oversize_fragment_count"}, fhicl::Comment{"Maximum number of over-size Fragments to drop before throwing an exception. Default is 1, which means to throw an exception if any over-size Fragments are dropped. Set to 0 to disable."},1 };
133  fhicl::Atom<bool> update_run_ids_on_new_fragment{ fhicl::Name{ "update_run_ids_on_new_fragment"}, fhicl::Comment{"Whether the run and subrun ID of an event should be updated whenever a Fragment is added."}, true };
135  fhicl::Atom<bool> use_sequence_id_for_event_number{ fhicl::Name{"use_sequence_id_for_event_number"}, fhicl::Comment{"Whether to use the artdaq Sequence ID (true) or the Timestamp (false) for art Event numbers"}, true };
137  fhicl::Atom<size_t> max_subrun_event_map_length{fhicl::Name{"max_subrun_event_map_length"}, fhicl::Comment{"The maximum number of entries to store in the sequence ID-SubRun ID lookup table"}, 100};
139  fhicl::Atom<bool> send_init_fragments{ fhicl::Name{ "send_init_fragments"}, fhicl::Comment{"Whether Init Fragments are expected to be sent to art. If true, a Warning message is printed when an Init Fragment is requested but none are available."}, true };
// A default of -1 is supplied; how a negative interval is interpreted is decided outside this struct.
141  fhicl::Atom<int> incomplete_event_report_interval_ms{ fhicl::Name{ "incomplete_event_report_interval_ms"}, fhicl::Comment{"Interval at which an incomplete event report should be written"}, -1 };
144  fhicl::Atom<int> fragment_broadcast_timeout_ms{ fhicl::Name{ "fragment_broadcast_timeout_ms"}, fhicl::Comment{"Amount of time broadcast fragments should live in the broadcast shared memory segment"}, 3000 };
146  fhicl::Atom<double> minimum_art_lifetime_s{ fhicl::Name{ "minimum_art_lifetime_s"}, fhicl::Comment{"Amount of time that an art process should run to not be considered \"DOA\""}, 2.0 };
149  fhicl::Atom<size_t> expected_art_event_processing_time_us{ fhicl::Name{ "expected_art_event_processing_time_us"}, fhicl::Comment{"During shutdown, SMEM will wait for this amount of time while it is checking that the art threads are done reading buffers."}, 100000 };
// NOTE(review): the description string is empty, so generated parameter help says nothing about this key.
// Default is computed from getpid(), mirroring shared_memory_key with a different base value.
151  fhicl::Atom<uint32_t> broadcast_shared_memory_key{ fhicl::Name{ "broadcast_shared_memory_key"}, fhicl::Comment{""}, 0xCEE70000 + getpid() };
153  fhicl::Atom<size_t> broadcast_buffer_count{ fhicl::Name{ "broadcast_buffer_count"}, fhicl::Comment{"Buffers in the broadcast shared memory segment"}, 10 };
155  fhicl::Atom<size_t> broadcast_buffer_size{ fhicl::Name{ "broadcast_buffer_size"}, fhicl::Comment{"Size of the buffers in the broadcast shared memory segment"}, 0x100000 };
157  fhicl::Atom<bool> use_art{ fhicl::Name{ "use_art"}, fhicl::Comment{"Whether to start and manage art threads (Sets art_analyzer count to 0 and overwrite_mode to true when false)"}, true };
159  fhicl::Atom<bool> manual_art{ fhicl::Name{"manual_art"}, fhicl::Comment{"Prints the startup command line for the art process so that the user may (for example) run it in GDB or valgrind"}, false };
160 
// Configuration of the RequestSender; its parameters are merged into this table as a fragment.
161  fhicl::TableFragment<artdaq::RequestSender::Config> requestSenderConfig;
162  };
164  using Parameters = fhicl::WrappedTable<Config>;
165 
171  SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset);
175  virtual ~SharedMemoryEventManager();
176 
177  private:
184  bool AddFragment(detail::RawFragmentHeader frag, void* dataPtr);
185 
186  public:
194  bool AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag);
195 
202  RawDataType* WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable = false);
203 
208  void DoneWritingFragment(detail::RawFragmentHeader frag);
209 
/// Returns the number of buffers which contain data but are not yet complete.
/// \return Current size of active_buffers_
// NOTE(review): the set is read without taking any lock in this accessor; confirm callers synchronize.
214  size_t GetIncompleteEventCount() { return active_buffers_.size(); }
215 
/// Returns the number of events which are complete but waiting on lower sequenced events to finish.
/// \return Current size of pending_buffers_
// NOTE(review): the set is read without taking any lock in this accessor; confirm callers synchronize.
220  size_t GetPendingEventCount() { return pending_buffers_.size(); }
221 
/// Returns the number of buffers currently owned by this manager.
/// \return Size of the container returned by GetBuffersOwnedByManager() (declared outside this listing,
///         presumably inherited from SharedMemoryManager)
226  size_t GetLockedBufferCount() { return GetBuffersOwnedByManager().size(); }
227 
/// Returns the number of events sent to art this run.
/// \return Current value of run_event_count_
// NOTE(review): run_event_count_ is std::atomic<int> but is returned as size_t; a negative counter value
// would convert to a very large unsigned number.
232  size_t GetArtEventCount() { return run_event_count_; }
233 
240  size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type = Fragment::InvalidFragmentType);
241 
248  size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type = Fragment::InvalidFragmentType);
249 
253  void RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out);
257  void StartArt();
258 
264  pid_t StartArtProcess(fhicl::ParameterSet pset);
265 
270  void ShutdownArtProcesses(std::set<pid_t>& pids);
271 
278  void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun = 0, int n_art_processes = -1);
279 
289  bool endOfData();
290 
295  void startRun(run_id_t runID);
296 
/// Get the current Run number.
/// \return Value of run_id_
301  run_id_t runID() const { return run_id_; }
302 
307  bool endRun();
308 
314  void rolloverSubrun(sequence_id_t boundary, subrun_id_t subrun);
315 
319  void rolloverSubrun();
320 
324  void sendMetrics();
325 
331  {
332  if (requests_) requests_->SetRequestMode(mode);
333  }
334 
/// Set the overwrite flag (non-reliable data transfer) for the Shared Memory.
/// \param overwrite New value stored in overwrite_mode_
339  void setOverwrite(bool overwrite) { overwrite_mode_ = overwrite; }
340 
344  void SetInitFragment(FragmentPtr frag);
345 
/// Gets the shared memory key of the broadcast SharedMemoryManager.
/// \return Result of GetKey() on the broadcasts_ member
350  uint32_t GetBroadcastKey() { return broadcasts_.GetKey(); }
351 
/// Gets the address of the "dropped data" fragment. Used for testing.
/// \param frag Fragment ID used as the key into dropped_data_
/// \return Result of dataBegin() on the Fragment stored for that ID
// NOTE(review): dropped_data_ is an unordered_map, so dropped_data_[frag] inserts a default-constructed
// FragmentPtr (presumably null) when frag is absent, and the -> below then dereferences it. Callers must
// only pass IDs that already have an entry, or this should look the key up with find() first.
357  RawDataType* GetDroppedDataAddress(Fragment::fragment_id_t frag) { return dropped_data_[frag]->dataBegin(); }
358 
368  void UpdateArtConfiguration(fhicl::ParameterSet art_pset);
369 
373  void CheckPendingBuffers();
374 
380  subrun_id_t GetSubrunForSequenceID(Fragment::sequence_id_t seqID);
381 
/// Get the current subrun number (Gets the last defined subrun).
/// \return Result of GetSubrunForSequenceID() when queried with Fragment::InvalidSequenceID
386  subrun_id_t GetCurrentSubrun() { return GetSubrunForSequenceID(Fragment::InvalidSequenceID); }
387 
388  private:
/// Get the number of entries in art_processes_.
/// \return Size of the art_processes_ set, read while holding art_process_mutex_
389  size_t get_art_process_count_()
390  {
// The lock is held until the function returns, so the size is read under the mutex.
391  std::unique_lock<std::mutex> lk(art_process_mutex_);
392  return art_processes_.size();
393  }
394 
395  std::string buildStatisticsString_() const;
396 
397  private:
398  size_t num_art_processes_;
399  size_t const num_fragments_per_event_;
400  size_t const queue_size_;
401  run_id_t run_id_;
402 
403  std::map<sequence_id_t, subrun_id_t> subrun_event_map_;
404  size_t max_subrun_event_map_length_;
405  static std::mutex subrun_event_map_mutex_;
406 
407  std::set<int> active_buffers_;
408  std::set<int> pending_buffers_;
409  std::unordered_map<Fragment::sequence_id_t, size_t> released_incomplete_events_;
410 
411  bool update_run_ids_;
412  bool use_sequence_id_for_event_number_;
413  bool overwrite_mode_;
414  bool send_init_fragments_;
415  bool running_;
416 
417  std::unordered_map<int, std::atomic<int>> buffer_writes_pending_;
418  std::unordered_map<int, std::mutex> buffer_mutexes_;
419  static std::mutex sequence_id_mutex_;
420 
421  int incomplete_event_report_interval_ms_;
422  std::chrono::steady_clock::time_point last_incomplete_event_report_time_;
423  std::chrono::steady_clock::time_point last_shmem_buffer_metric_update_;
424  std::chrono::steady_clock::time_point last_backpressure_report_time_;
425  std::chrono::steady_clock::time_point last_fragment_header_write_time_;
426 
427 
/// Pair of accumulators kept in the metric_data_ member.
/// How and when they are updated or reset is not visible in this header (presumably by sendMetrics
/// and the event-release path; confirm in the implementation file).
428  struct MetricData
429  {
/// Start both accumulators at zero.
430  MetricData()
431  : event_count(0), event_size(0) {}
432  size_t event_count; // presumably the number of events counted since the last report; confirm
433  size_t event_size; // presumably the summed size of those events; units not stated here
434  };
435  MetricData metric_data_;
436  StatisticsHelper statsHelper_;
437 
438  int broadcast_timeout_ms_;
439 
440  std::atomic<int> run_event_count_;
441  std::atomic<int> run_incomplete_event_count_;
442  std::atomic<int> subrun_event_count_;
443  std::atomic<int> subrun_incomplete_event_count_;
444  std::atomic<int> oversize_fragment_count_;
445  int maximum_oversize_fragment_count_;
446 
447  mutable std::mutex art_process_mutex_;
448  std::set<pid_t> art_processes_;
449  std::atomic<bool> restart_art_;
450  bool always_restart_art_;
451  std::atomic<bool> manual_art_;
452  fhicl::ParameterSet current_art_pset_;
453  std::shared_ptr<art_config_file> current_art_config_file_;
454  double minimum_art_lifetime_s_;
455  size_t art_event_processing_time_us_;
456 
457  std::unique_ptr<RequestSender> requests_;
458  fhicl::ParameterSet data_pset_;
459 
460  FragmentPtr init_fragment_;
461  std::unordered_map<Fragment::fragment_id_t, FragmentPtr> dropped_data_;
462 
463  bool broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag);
464 
465  detail::RawEventHeader* getEventHeader_(int buffer);
466 
467  int getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp = Fragment::InvalidTimestamp);
468  bool hasFragments_(int buffer);
469  void complete_buffer_(int buffer);
470  bool bufferComparator(int bufA, int bufB);
471  void check_pending_buffers_(std::unique_lock<std::mutex> const& lock);
472 
473  void send_init_frag_();
474  SharedMemoryManager broadcasts_;
475  };
476 } // namespace artdaq
477 
478 #endif //ARTDAQ_DAQRATE_SHAREDMEMORYEVENTMANAGER_HH
void RunArt(std::shared_ptr< art_config_file > config_file, std::shared_ptr< std::atomic< pid_t >> pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
fhicl::Atom< size_t > event_queue_wait_time
"event_queue_wait_time" (Default: 5): Amount of time (in seconds) an event can exist in shared memory...
fhicl::Atom< bool > overwrite_mode
"overwrite_mode" (Default: false): Whether new data is allowed to overwrite buffers in the "Full" stat...
art_config_file wraps a temporary file used to configure art
void ShutdownArtProcesses(std::set< pid_t > &pids)
Shutdown a set of art processes.
fhicl::Atom< size_t > max_subrun_event_map_length
"max_subrun_event_map_length" (Default: 100): The maximum number of entries to store in the sequence ...
size_t GetLockedBufferCount()
Returns the number of buffers currently owned by this manager.
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
The SharedMemoryEventManager is a SharedMemoryManager which tracks events as they are built...
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
fhicl::Atom< size_t > broadcast_buffer_size
"broadcast_buffer_size" (Default: 0x100000): Size of the buffers in the broadcast shared memory segme...
subrun_id_t GetCurrentSubrun()
Get the current subrun number (Gets the last defined subrun)
fhicl::Atom< bool > restart_crashed_art_processes
"restart_crashed_art_processes" (Default: true): Whether to automatically restart art processes that...
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
fhicl::Atom< double > minimum_art_lifetime_s
"minimum_art_lifetime_s" (Default: 2 seconds): Amount of time that an art process should run to not b...
fhicl::Atom< int > maximum_oversize_fragment_count
"maximum_oversize_fragment_count" (Default: 1): Maximum number of over-size Fragments to drop before ...
fhicl::Atom< bool > update_run_ids_on_new_fragment
"update_run_ids_on_new_fragment" (Default: true): Whether the run and subrun ID of an event should b...
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
void setRequestMode(detail::RequestMessageMode mode)
Set the RequestMessageMode for all outgoing data requests.
size_t GetArtEventCount()
Returns the number of events sent to art this run.
fhicl::Atom< size_t > art_analyzer_count
"art_analyzer_count" (Default: 1): Number of art processes to start
Configuration of the SharedMemoryEventManager. May be used for parameter validation ...
fhicl::Atom< bool > use_sequence_id_for_event_number
"use_sequence_id_for_event_number" (Default: true): Whether to use the artdaq Sequence ID (true) or t...
fhicl::Atom< uint32_t > broadcast_shared_memory_key
"broadcast_shared_memory_key" (Default: 0xCEE70000 + PID): Key to use for broadcast shared memory acce...
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
fhicl::WrappedTable< Config > Parameters
Used for ParameterSet validation (if desired)
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void UpdateArtConfiguration(fhicl::ParameterSet art_pset)
Updates the internally-stored copy of the art configuration.
RawDataType * GetDroppedDataAddress(Fragment::fragment_id_t frag)
Gets the address of the "dropped data" fragment. Used for testing.
fhicl::Atom< size_t > max_subrun_lookup_table_size
"max_subrun_lookup_table_size" (Default: 100): Maximum number of entries in the subrun rollover histo...
size_t GetPendingEventCount()
Returns the number of events which are complete but waiting on lower sequenced events to finish...
void StartArt()
Start all the art processes.
run_id_t runID() const
Get the current Run number.
fhicl::Atom< size_t > buffer_count
"buffer_count" (REQUIRED): Number of events in the Shared Memory (incomplete + pending art) ...
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
subrun_id_t GetSubrunForSequenceID(Fragment::sequence_id_t seqID)
Get the subrun number that the given Sequence ID would be assigned to.
fhicl::TableFragment< artdaq::RequestSender::Config > requestSenderConfig
Configuration of the RequestSender. See artdaq::RequestSender::Config.
fhicl::Atom< uint32_t > shared_memory_key
"shared_memory_key" (Default: 0xBEE70000 + PID): Key to use for shared memory access ...
std::map< sequence_id_t, RawEvent_ptr > EventMap
An EventMap is a map of RawEvent_ptr objects, keyed by sequence ID.
RequestMessageMode
Mode used to indicate current run conditions to the request receiver.
void rolloverSubrun()
Add a subrun transition immediately after the highest currently defined sequence ID.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
fhicl::Atom< size_t > stale_buffer_timeout_usec
"stale_buffer_timeout_usec" (Default: event_queue_wait_time * 1,000,000): Maximum amount of time e...
fhicl::Atom< size_t > expected_fragments_per_event
"expected_fragments_per_event" (REQUIRED): Number of Fragments to expect per event ...
size_t GetIncompleteEventCount()
Returns the number of buffers which contain data but are not yet complete.
fhicl::Atom< bool > send_init_fragments
"send_init_fragments" (Default: true): Whether Init Fragments are expected to be sent to art...
static const std::string FRAGMENTS_RECEIVED_STAT_KEY
Key for Fragments Received MonitoredQuantity.
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endRun()
Send an EndOfRunFragment to the art thread.
void setOverwrite(bool overwrite)
Set the overwrite flag (non-reliable data transfer) for the Shared Memory.
fhicl::Atom< int > incomplete_event_report_interval_ms
"incomplete_event_report_interval_ms" (Default: -1): Interval at which an incomplete event report sho...
std::string getFileName() const
Get the path of the temporary file.
fhicl::Atom< bool > manual_art
"manual_art" (Default: false): Prints the startup command line for the art process so that the user m...
fhicl::Atom< bool > broadcast_mode
"broadcast_mode" (Default: false): When true, buffers are not marked Empty when read, but return to Full state. Buffers are overwritten in order received.
fhicl::Atom< size_t > broadcast_buffer_count
"broadcast_buffer_count" (Default: 10): Buffers in the broadcast shared memory segment ...
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
static const std::string EVENTS_RELEASED_STAT_KEY
Key for the Events Released MonitoredQuantity.
uint32_t GetBroadcastKey()
Gets the shared memory key of the broadcast SharedMemoryManager.
bool endOfData()
Indicate that the end of input has been reached to the art processes.
fhicl::Atom< bool > use_art
"use_art" (Default: true): Whether to start and manage art threads (Sets art_analyzer count to 0 and ...
RawEvent::subrun_id_t subrun_id_t
Copy RawEvent::subrun_id_t into local scope.
art_config_file(fhicl::ParameterSet ps)
art_config_file Constructor
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.
void CheckPendingBuffers()
Check for buffers which are ready to be marked incomplete and released to art and issue tokens for an...