artdaq  v3_03_00
SharedMemoryEventManager.hh
1 #ifndef ARTDAQ_DAQRATE_SHAREDMEMORYEVENTMANAGER_HH
2 #define ARTDAQ_DAQRATE_SHAREDMEMORYEVENTMANAGER_HH
3 
4 #include "artdaq/DAQdata/Globals.hh" // Before trace.h gets included in ConcurrentQueue (from GlobalQueue)
5 #include "artdaq-core/Core/SharedMemoryManager.hh"
6 #include "artdaq-core/Data/RawEvent.hh"
7 #include "artdaq/DAQrate/RequestSender.hh"
8 #include <set>
9 #include <deque>
10 #include <fstream>
11 #include <iomanip>
12 #include <sys/stat.h>
13 #include "fhiclcpp/fwd.h"
14 #include "artdaq/Application/StatisticsHelper.hh"
15 #include "artdaq/DAQrate/detail/ArtConfig.hh"
16 #define ART_SUPPORTS_DUPLICATE_EVENTS 0
17 
18 namespace artdaq {
19 
/// art_config_file wraps a temporary file used to configure art.
/// The constructor writes the configuration to disk; the cleanup code near the end of the
/// class deletes the file again.
24  {
25  public:
/// art_config_file Constructor.
///
/// Writes the string form of \p ps to a .fcl file named
/// artConfig_<my_rank>_<gettimeofday_us()>.fcl inside /tmp/partition_<partition number>.
/// If that file cannot be opened, one retry is made in a directory whose name additionally
/// carries the uid; if the retry fails too, an error is logged and the process exits with code 46.
/// A "services.message" table is appended when \p ps does not already contain that key.
///
/// \param ps ParameterSet to serialize into the temporary file
30  art_config_file(fhicl::ParameterSet ps/*, uint32_t shm_key, uint32_t broadcast_key*/)
31  : dir_name_("/tmp/partition_" + std::to_string(GetPartitionNumber()))
32  , file_name_(dir_name_ + "/artConfig_" + std::to_string(my_rank) + "_" + std::to_string(artdaq::TimeUtils::gettimeofday_us()) + ".fcl")
33  {
34  mkdir(dir_name_.c_str(), 0777); // Allowed to fail if directory already exists
35 
36  std::ofstream of(file_name_, std::ofstream::trunc);
37  if (of.fail()) {
38  // Probably a permissions error...
// Fall back to a directory name that also includes the uid, and rebuild the file name from it.
39  dir_name_ = "/tmp/partition_" + std::to_string(GetPartitionNumber()) + "_" + std::to_string(getuid());
40  mkdir(dir_name_.c_str(), 0777); // Allowed to fail if directory already exists
41  file_name_ = dir_name_ + "/artConfig_" + std::to_string(my_rank) + "_" + std::to_string(artdaq::TimeUtils::gettimeofday_us()) + ".fcl";
42 
43  of.open(file_name_, std::ofstream::trunc);
44  if (of.fail())
45  {
// Second failure: nothing else is tried, the process is terminated here.
46  TLOG(TLVL_ERROR) << "Failed to open configuration file after two attemps! ABORTING!";
47  exit(46);
48  }
49  }
50  of << ps.to_string();
51 
// Disabled code: used to append the shared-memory keys and rank for NetMonTransportServiceInterface.
52  //if (ps.has_key("services.NetMonTransportServiceInterface"))
53  //{
54  // of << " services.NetMonTransportServiceInterface.shared_memory_key: 0x" << std::hex << shm_key;
55  // of << " services.NetMonTransportServiceInterface.broadcast_shared_memory_key: 0x" << std::hex << broadcast_key;
56  // of << " services.NetMonTransportServiceInterface.rank: " << std::dec << my_rank;
57  //}
// Append a generated message facility configuration when the caller did not supply one.
58  if (!ps.has_key("services.message"))
59  {
60  of << " services.message: { " << generateMessageFacilityConfiguration("art") << "} ";
61  }
// Disabled code: used to append the shared-memory keys and rank for the art source.
62  //of << " source.shared_memory_key: 0x" << std::hex << shm_key;
63  //of << " source.broadcast_shared_memory_key: 0x" << std::hex << broadcast_key;
64  //of << " source.rank: " << std::dec << my_rank;
65  of.close();
66  }
// NOTE(review): the declaration line introducing the following body is not present here;
// presumably it is the destructor -- confirm. The body deletes the configuration file and
// then attempts to remove its directory.
68  {
69  remove(file_name_.c_str());
70  rmdir(dir_name_.c_str()); // Will only delete directory if no config files are left over
71  }
/// Get the path of the temporary file.
/// \return Copy of the stored file name
76  std::string getFileName() const { return file_name_; }
77  private:
78  std::string dir_name_; // Directory holding the configuration file; reassigned if the first open fails
79  std::string file_name_; // Full path of the configuration file
80  };
81 
/// The SharedMemoryEventManager is a SharedMemoryManager which tracks events as they are built.
/// This header only declares the interface; apart from a few inline accessors the member
/// functions are defined elsewhere.
85  class SharedMemoryEventManager : public SharedMemoryManager
86  {
87  public:
/// Copy RawEvent::run_id_t into local scope.
88  typedef RawEvent::run_id_t run_id_t;
/// Copy RawEvent::subrun_id_t into local scope.
89  typedef RawEvent::subrun_id_t subrun_id_t;
/// Copy Fragment::sequence_id_t into local scope.
90  typedef Fragment::sequence_id_t sequence_id_t;
/// An EventMap is a map of RawEvent_ptr objects, keyed by sequence ID.
91  typedef std::map<sequence_id_t, RawEvent_ptr> EventMap;
92 
/// Configuration of the SharedMemoryEventManager. May be used for parameter validation.
/// Each parameter's meaning is given by its fhicl::Comment. Atoms declared without a trailing
/// default value (max_event_size_bytes, buffer_count, max_fragment_size_bytes,
/// expected_fragments_per_event) have no default here.
96  struct Config
97  {
100  fhicl::Atom<size_t> max_event_size_bytes{ fhicl::Name{ "max_event_size_bytes"}, fhicl::Comment{"Maximum event size (all Fragments), in bytes"} };
// NOTE(review): the default here is the literal 5000000, while the comment on
// event_queue_wait_time says that parameter feeds this default -- how the two are combined is not visible here.
102  fhicl::Atom<size_t> stale_buffer_timeout_usec{ fhicl::Name{ "stale_buffer_timeout_usec"}, fhicl::Comment{"Maximum amount of time elapsed before a buffer is marked as abandoned. Time is reset each time an operation is performed on the buffer."}, 5000000 };
104  fhicl::Atom<bool> overwrite_mode{ fhicl::Name{ "overwrite_mode"}, fhicl::Comment{"Whether buffers are allowed to be overwritten when safe (state == Full or Reading)"}, false };
106  fhicl::Atom<bool> restart_crashed_art_processes{ fhicl::Name{"restart_crashed_art_processes"}, fhicl::Comment{"Whether to automatically restart art processes that fail for any reason"}, true };
// Default key is derived from the PID of the current process.
108  fhicl::Atom<uint32_t> shared_memory_key{ fhicl::Name{ "shared_memory_key"}, fhicl::Comment{"Key to use for shared memory access"}, 0xBEE70000 + getpid() };
110  fhicl::Atom<size_t> buffer_count{ fhicl::Name{ "buffer_count"}, fhicl::Comment{"Number of events in the Shared Memory (incomplete + pending art)"} };
113  fhicl::Atom<size_t> max_fragment_size_bytes{ fhicl::Name{ "max_fragment_size_bytes"}, fhicl::Comment{" Maximum Fragment size, in bytes"} };
115  fhicl::Atom<size_t> event_queue_wait_time{ fhicl::Name{ "event_queue_wait_time"}, fhicl::Comment{"Amount of time (in seconds) an event can exist in shared memory before being released to art. Used as input to default parameter of \"stale_buffer_timeout_usec\"."}, 5 };
117  fhicl::Atom<bool> broadcast_mode{ fhicl::Name{ "broadcast_mode"}, fhicl::Comment{"When true, buffers are not marked Empty when read, but return to Full state. Buffers are overwritten in order received."}, false };
119  fhicl::Atom<size_t> art_analyzer_count{ fhicl::Name{ "art_analyzer_count"}, fhicl::Comment{"Number of art procceses to start"}, 1 };
121  fhicl::Atom<size_t> expected_fragments_per_event{ fhicl::Name{ "expected_fragments_per_event"}, fhicl::Comment{"Number of Fragments to expect per event"} };
123  fhicl::Atom<int> maximum_oversize_fragment_count{ fhicl::Name{"maximum_oversize_fragment_count"}, fhicl::Comment{"Maximum number of over-size Fragments to drop before throwing an exception. Default is 1, which means to throw an exception if any over-size Fragments are dropped. Set to 0 to disable."},1 };
125  fhicl::Atom<bool> update_run_ids_on_new_fragment{ fhicl::Name{ "update_run_ids_on_new_fragment"}, fhicl::Comment{"Whether the run and subrun ID of an event should be updated whenever a Fragment is added."}, true };
127  fhicl::Atom<bool> use_sequence_id_for_event_number{ fhicl::Name{"use_sequence_id_for_event_number"}, fhicl::Comment{"Whether to use the artdaq Sequence ID (true) or the Timestamp (false) for art Event numbers"}, true };
129  fhicl::Atom<bool> send_init_fragments{ fhicl::Name{ "send_init_fragments"}, fhicl::Comment{"Whether Init Fragments are expected to be sent to art. If true, a Warning message is printed when an Init Fragment is requested but none are available."}, true };
131  fhicl::Atom<int> incomplete_event_report_interval_ms{ fhicl::Name{ "incomplete_event_report_interval_ms"}, fhicl::Comment{"Interval at which an incomplete event report should be written"}, -1 };
134  fhicl::Atom<int> fragment_broadcast_timeout_ms{ fhicl::Name{ "fragment_broadcast_timeout_ms"}, fhicl::Comment{"Amount of time broadcast fragments should live in the broadcast shared memory segment"}, 3000 };
136  fhicl::Atom<double> minimum_art_lifetime_s{ fhicl::Name{ "minimum_art_lifetime_s"}, fhicl::Comment{"Amount of time that an art process should run to not be considered \"DOA\""}, 2.0 };
139  fhicl::Atom<size_t> expected_art_event_processing_time_us{ fhicl::Name{ "expected_art_event_processing_time_us"}, fhicl::Comment{"During shutdown, SMEM will wait for this amount of time while it is checking that the art threads are done reading buffers."}, 100000 };
// Default key is derived from the PID of the current process; the fhicl::Comment for this parameter is empty.
141  fhicl::Atom<uint32_t> broadcast_shared_memory_key{ fhicl::Name{ "broadcast_shared_memory_key"}, fhicl::Comment{""}, 0xCEE70000 + getpid() };
143  fhicl::Atom<size_t> broadcast_buffer_count{ fhicl::Name{ "broadcast_buffer_count"}, fhicl::Comment{"Buffers in the broadcast shared memory segment"}, 10 };
145  fhicl::Atom<size_t> broadcast_buffer_size{ fhicl::Name{ "broadcast_buffer_size"}, fhicl::Comment{"Size of the buffers in the broadcast shared memory segment"}, 0x100000 };
147  fhicl::Atom<bool> use_art{ fhicl::Name{ "use_art"}, fhicl::Comment{"Whether to start and manage art threads (Sets art_analyzer count to 0 and overwrite_mode to true when false)"}, true };
149  fhicl::Atom<bool> manual_art{ fhicl::Name{"manual_art"}, fhicl::Comment{"Prints the startup command line for the art process so that the user may (for example) run it in GDB or valgrind"}, false };
150 
/// Configuration of the RequestSender. See artdaq::RequestSender::Config.
151  fhicl::TableFragment<artdaq::RequestSender::Config> requestSenderConfig;
152  };
/// Config wrapped in a fhicl::WrappedTable.
153  using Parameters = fhicl::WrappedTable<Config>;
154 
/// SharedMemoryEventManager Constructor.
/// \param pset ParameterSet for this manager (presumably matching Config -- confirm against the implementation)
/// \param art_pset ParameterSet for art (presumably the art configuration -- confirm against the implementation)
160  SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset);
/// SharedMemoryEventManager Destructor.
164  virtual ~SharedMemoryEventManager();
165 
166  private:
/// Private AddFragment overload taking a raw Fragment header and a pointer to data.
173  bool AddFragment(detail::RawFragmentHeader frag, void* dataPtr);
174 
175  public:
/// Public AddFragment overload taking ownership of a FragmentPtr.
/// NOTE(review): the meaning of the return value and of outfrag is defined by the
/// implementation, which is not visible here; timeout_usec is in microseconds per its name.
183  bool AddFragment(FragmentPtr frag, size_t timeout_usec, FragmentPtr& outfrag);
184 
/// Get a pointer to a reserved memory area for the given Fragment header.
191  RawDataType* WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable = false);
192 
/// Used to indicate that the given Fragment is now completely in the buffer.
197  void DoneWritingFragment(detail::RawFragmentHeader frag);
198 
/// Returns the number of buffers which contain data but are not yet complete (size of active_buffers_).
203  size_t GetIncompleteEventCount() { return active_buffers_.size(); }
204 
/// Returns the number of events which are complete but waiting on lower sequenced events to finish (size of pending_buffers_).
209  size_t GetPendingEventCount() { return pending_buffers_.size(); }
210 
/// Returns the number of buffers currently owned by this manager.
215  size_t GetLockedBufferCount() { return GetBuffersOwnedByManager().size(); }
216 
/// Returns the number of events sent to art this subrun (value of subrun_event_count_).
221  size_t GetArtEventCount() { return subrun_event_count_; }
222 
/// Get the count of Fragments of a given type in an event.
229  size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type = Fragment::InvalidFragmentType);
230 
/// Get the count of Fragments of a given type in a buffer.
237  size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type = Fragment::InvalidFragmentType);
238 
/// Run an art instance, recording the return codes and restarting it until the end flag is raised.
242  void RunArt(std::shared_ptr<art_config_file> config_file, std::shared_ptr<std::atomic<pid_t>> pid_out);
/// Start all the art processes.
246  void StartArt();
247 
/// Start one art process.
253  pid_t StartArtProcess(fhicl::ParameterSet pset);
254 
/// Shutdown a set of art processes.
259  void ShutdownArtProcesses(std::set<pid_t>& pids);
260 
/// Restart all art processes, using the given fhicl code to configure the new art processes.
267  void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun = 0, int n_art_processes = -1);
268 
/// Indicate that the end of input has been reached to the art processes.
278  bool endOfData();
279 
/// Start a Run.
284  void startRun(run_id_t runID);
285 
/// Start a new Subrun, incrementing the subrun number.
289  void startSubrun();
290 
/// Get the current Run number.
295  run_id_t runID() const { return run_id_; }
296 
/// Get the current subrun number.
301  subrun_id_t subrunID() const { return subrun_id_; }
302 
/// Send an EndOfRunFragment to the art thread.
307  bool endRun();
308 
/// Send an EndOfSubRunFragment to the art thread.
313  bool endSubrun();
314 
/// Rollover the subrun after the specified event.
319  void rolloverSubrun(sequence_id_t boundary);
320 
/// Send metrics to the MetricManager, if one has been instantiated in the application.
324  void sendMetrics();
325 
/// Set the RequestMessageMode for all outgoing data requests. Does nothing when requests_ is null.
330  void setRequestMode(detail::RequestMessageMode mode) { if (requests_) requests_->SetRequestMode(mode); }
331 
/// Set the overwrite flag (non-reliable data transfer) for the Shared Memory.
336  void setOverwrite(bool overwrite) { overwrite_mode_ = overwrite; }
337 
/// Set the stored Init fragment, if one has not yet been set already.
341  void SetInitFragment(FragmentPtr frag);
342 
/// Gets the shared memory key of the broadcast SharedMemoryManager.
347  uint32_t GetBroadcastKey() { return broadcasts_.GetKey(); }
348 
/// Gets the address of the "dropped data" fragment. Used for testing.
/// NOTE(review): operator[] on dropped_data_ inserts a default-constructed entry when the
/// fragment ID is absent, and the result is dereferenced unconditionally -- presumably callers
/// only pass IDs for which data was dropped; confirm.
353  RawDataType* GetDroppedDataAddress(Fragment::fragment_id_t frag) { return dropped_data_[frag]->dataBegin(); }
354 
/// Updates the internally-stored copy of the art configuration.
364  void UpdateArtConfiguration(fhicl::ParameterSet art_pset);
365 
/// Check for buffers which are ready to be marked incomplete and released to art.
369  void CheckPendingBuffers();
370 
371  private:
/// Returns the number of entries in art_processes_, read while holding art_process_mutex_.
372  size_t get_art_process_count_()
373  {
374  std::unique_lock<std::mutex> lk(art_process_mutex_);
375  return art_processes_.size();
376  }
377 
378  private:
379 
// Run/event bookkeeping.
380  size_t num_art_processes_;
381  size_t const num_fragments_per_event_;
382  size_t const queue_size_;
383  run_id_t run_id_; // Returned by runID()
384  subrun_id_t subrun_id_; // Returned by subrunID()
385  sequence_id_t subrun_rollover_event_;
386  sequence_id_t last_released_event_;
387 
// Buffer tracking; the set sizes back GetIncompleteEventCount() and GetPendingEventCount().
388  std::set<int> active_buffers_;
389  std::set<int> pending_buffers_;
390  std::unordered_map<Fragment::sequence_id_t, size_t> released_incomplete_events_;
391 
// Behavior flags.
392  bool update_run_ids_;
393  bool use_sequence_id_for_event_number_;
394  bool overwrite_mode_; // Written by setOverwrite()
395  bool send_init_fragments_;
396  bool running_;
397 
// Per-buffer state keyed by int (presumably the buffer ID -- confirm) and a class-wide sequence ID mutex.
398  std::unordered_map<int, std::atomic<int>> buffer_writes_pending_;
399  std::unordered_map<int, std::mutex> buffer_mutexes_;
400  static std::mutex sequence_id_mutex_;
401 
// Reporting interval and timestamps.
402  int incomplete_event_report_interval_ms_;
403  std::chrono::steady_clock::time_point last_incomplete_event_report_time_;
404  std::chrono::steady_clock::time_point last_shmem_buffer_metric_update_;
405 
// Pair of counters, both zero-initialized by the constructor.
406  struct MetricData {
407  MetricData() : event_count(0), event_size(0) {}
408  size_t event_count;
409  size_t event_size;
410  };
411  MetricData metric_data_;
412 
413  int broadcast_timeout_ms_;
414 
// Atomic counters; subrun_event_count_ is the value returned by GetArtEventCount().
415  std::atomic<int> run_event_count_;
416  std::atomic<int> run_incomplete_event_count_;
417  std::atomic<int> subrun_event_count_;
418  std::atomic<int> subrun_incomplete_event_count_;
419  std::atomic<int> oversize_fragment_count_;
420  int maximum_oversize_fragment_count_;
421 
// art process management; get_art_process_count_() takes art_process_mutex_ before reading art_processes_.
422  mutable std::mutex art_process_mutex_;
423  std::set<pid_t> art_processes_;
424  std::atomic<bool> restart_art_;
425  bool always_restart_art_;
426  std::atomic<bool> manual_art_;
427  fhicl::ParameterSet current_art_pset_;
428  std::shared_ptr<art_config_file> current_art_config_file_;
429  double minimum_art_lifetime_s_;
430  size_t art_event_processing_time_us_;
431 
// May be null: setRequestMode() checks this pointer before using it.
432  std::unique_ptr<RequestSender> requests_;
433  fhicl::ParameterSet data_pset_;
434 
435  FragmentPtr init_fragment_;
// Keyed by fragment ID; read through GetDroppedDataAddress().
436  std::unordered_map<Fragment::fragment_id_t, FragmentPtr> dropped_data_;
437 
// Private helpers, defined in the implementation file.
438  bool broadcastFragment_(FragmentPtr frag, FragmentPtr& outFrag);
439 
440  detail::RawEventHeader* getEventHeader_(int buffer);
441 
442  int getBufferForSequenceID_(Fragment::sequence_id_t seqID, bool create_new, Fragment::timestamp_t timestamp = Fragment::InvalidTimestamp);
443  bool hasFragments_(int buffer);
444  void complete_buffer_(int buffer);
445  bool bufferComparator(int bufA, int bufB);
446  void check_pending_buffers_(std::unique_lock<std::mutex> const& lock);
447 
448  void send_init_frag_();
// Second SharedMemoryManager instance; GetBroadcastKey() returns its key.
449  SharedMemoryManager broadcasts_;
450  };
451  }
452 
453 #endif //ARTDAQ_DAQRATE_SHAREDMEMORYEVENTMANAGER_HH
void RunArt(std::shared_ptr< art_config_file > config_file, std::shared_ptr< std::atomic< pid_t >> pid_out)
Run an art instance, recording the return codes and restarting it until the end flag is raised...
fhicl::Atom< size_t > event_queue_wait_time
"event_queue_wait_time" (Default: 5): Amount of time (in seconds) an event can exist in shared memory...
fhicl::Atom< bool > overwrite_mode
"overwrite_mode" (Default: false): Whether new data is allowed to overwrite buffers in the "Full" stat...
art_config_file wraps a temporary file used to configure art
void ShutdownArtProcesses(std::set< pid_t > &pids)
Shutdown a set of art processes.
size_t GetLockedBufferCount()
Returns the number of buffers currently owned by this manager.
virtual ~SharedMemoryEventManager()
SharedMemoryEventManager Destructor.
The SharedMemoryEventManager is a SharedMemoryManager which tracks events as they are built...
Fragment::sequence_id_t sequence_id_t
Copy Fragment::sequence_id_t into local scope.
fhicl::Atom< size_t > broadcast_buffer_size
"broadcast_buffer_size" (Default: 0x100000): Size of the buffers in the broadcast shared memory segme...
fhicl::Atom< bool > restart_crashed_art_processes
"restart_crashed_art_processes" (Default: true): Whether to automatically restart art processes that...
void ReconfigureArt(fhicl::ParameterSet art_pset, run_id_t newRun=0, int n_art_processes=-1)
Restart all art processes, using the given fhicl code to configure the new art processes.
pid_t StartArtProcess(fhicl::ParameterSet pset)
Start one art process.
fhicl::Atom< double > minimum_art_lifetime_s
"minimum_art_lifetime_s" (Default: 2 seconds): Amount of time that an art process should run to not b...
fhicl::Atom< int > maximum_oversize_fragment_count
"maximum_oversize_fragment_count" (Default: 1): Maximum number of over-size Fragments to drop before ...
fhicl::Atom< bool > update_run_ids_on_new_fragment
"update_run_ids_on_new_fragment" (Default: true): Whether the run and subrun ID of an event should b...
RawDataType * WriteFragmentHeader(detail::RawFragmentHeader frag, bool dropIfNoBuffersAvailable=false)
Get a pointer to a reserved memory area for the given Fragment header.
void setRequestMode(detail::RequestMessageMode mode)
Set the RequestMessageMode for all outgoing data requests.
size_t GetArtEventCount()
Returns the number of events sent to art this subrun.
fhicl::Atom< size_t > art_analyzer_count
"art_analyzer_count" (Default: 1): Number of art processes to start
Configuration of the SharedMemoryEventManager. May be used for parameter validation ...
fhicl::Atom< bool > use_sequence_id_for_event_number
"use_sequence_id_for_event_number" (Default: true): Whether to use the artdaq Sequence ID (true) or t...
fhicl::Atom< uint32_t > broadcast_shared_memory_key
"broadcast_shared_memory_key" (Default: 0xCEE70000 + PID): Key to use for broadcast shared memory acce...
RawEvent::run_id_t run_id_t
Copy RawEvent::run_id_t into local scope.
size_t GetFragmentCount(Fragment::sequence_id_t seqID, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in an event.
void UpdateArtConfiguration(fhicl::ParameterSet art_pset)
Updates the internally-stored copy of the art configuration.
RawDataType * GetDroppedDataAddress(Fragment::fragment_id_t frag)
Gets the address of the "dropped data" fragment. Used for testing.
size_t GetPendingEventCount()
Returns the number of events which are complete but waiting on lower sequenced events to finish...
void StartArt()
Start all the art processes.
run_id_t runID() const
Get the current Run number.
fhicl::Atom< size_t > buffer_count
"buffer_count" (REQUIRED): Number of events in the Shared Memory (incomplete + pending art) ...
void SetInitFragment(FragmentPtr frag)
Set the stored Init fragment, if one has not yet been set already.
subrun_id_t subrunID() const
Get the current subrun number.
fhicl::TableFragment< artdaq::RequestSender::Config > requestSenderConfig
Configuration of the RequestSender. See artdaq::RequestSender::Config.
fhicl::Atom< uint32_t > shared_memory_key
"shared_memory_key" (Default: 0xBEE70000 + PID): Key to use for shared memory access ...
void rolloverSubrun(sequence_id_t boundary)
Rollover the subrun after the specified event.
std::map< sequence_id_t, RawEvent_ptr > EventMap
An EventMap is a map of RawEvent_ptr objects, keyed by sequence ID.
RequestMessageMode
Mode used to indicate current run conditions to the request receiver.
void sendMetrics()
Send metrics to the MetricManager, if one has been instantiated in the application.
fhicl::Atom< size_t > stale_buffer_timeout_usec
"stale_buffer_timeout_usec" (Default: event_queue_wait_time * 1,000,000): Maximum amount of time e...
void startSubrun()
Start a new Subrun, incrementing the subrun number.
fhicl::Atom< size_t > expected_fragments_per_event
"expected_fragments_per_event" (REQUIRED): Number of Fragments to expect per event ...
size_t GetIncompleteEventCount()
Returns the number of buffers which contain data but are not yet complete.
fhicl::Atom< bool > send_init_fragments
"send_init_fragments" (Default: true): Whether Init Fragments are expected to be sent to art...
SharedMemoryEventManager(fhicl::ParameterSet pset, fhicl::ParameterSet art_pset)
SharedMemoryEventManager Constructor.
bool endSubrun()
Send an EndOfSubRunFragment to the art thread.
bool endRun()
Send an EndOfRunFragment to the art thread.
void setOverwrite(bool overwrite)
Set the overwrite flag (non-reliable data transfer) for the Shared Memory.
fhicl::Atom< int > incomplete_event_report_interval_ms
"incomplete_event_report_interval_ms" (Default: -1): Interval at which an incomplete event report sho...
std::string getFileName() const
Get the path of the temporary file.
fhicl::Atom< bool > manual_art
"manual_art" (Default: false): Prints the startup command line for the art process so that the user m...
fhicl::Atom< bool > broadcast_mode
"broadcast_mode" (Default: false): When true, buffers are not marked Empty when read, but return to Full state. Buffers are overwritten in order received.
fhicl::Atom< size_t > broadcast_buffer_count
"broadcast_buffer_count" (Default: 10): Buffers in the broadcast shared memory segment ...
void DoneWritingFragment(detail::RawFragmentHeader frag)
Used to indicate that the given Fragment is now completely in the buffer. Will check for buffer compl...
uint32_t GetBroadcastKey()
Gets the shared memory key of the broadcast SharedMemoryManager.
bool endOfData()
Indicate that the end of input has been reached to the art processes.
fhicl::Atom< bool > use_art
"use_art" (Default: true): Whether to start and manage art threads (Sets art_analyzer count to 0 and ...
RawEvent::subrun_id_t subrun_id_t
Copy RawEvent::subrun_id_t into local scope.
art_config_file(fhicl::ParameterSet ps)
art_config_file Constructor
void startRun(run_id_t runID)
Start a Run.
size_t GetFragmentCountInBuffer(int buffer, Fragment::type_t type=Fragment::InvalidFragmentType)
Get the count of Fragments of a given type in a buffer.
void CheckPendingBuffers()
Check for buffers which are ready to be marked incomplete and released to art and issue tokens for an...