artdaq  v3_12_02
TransferWrapper.cc
1 #include "TRACE/tracemf.h" // Pre-empt TRACE/trace.h from Fragment.hh.
2 #include "artdaq-core/Data/Fragment.hh"
3 
4 #define TRACE_NAME "TransferWrapper"
5 
6 #include "artdaq/ArtModules/detail/TransferWrapper.hh"
7 
8 #include "artdaq-core/Utilities/ExceptionHandler.hh"
9 #include "artdaq-core/Utilities/TimeUtils.hh"
10 #include "artdaq/DAQdata/NetMonHeader.hh"
11 #include "artdaq/ExternalComms/MakeCommanderPlugin.hh"
12 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
13 
14 #include "cetlib_except/exception.h"
15 #include "fhiclcpp/ParameterSet.h"
16 
17 #include <csignal>
18 #include <iostream>
19 #include <limits>
20 #include <memory>
21 #include <sstream>
22 #include <string>
23 
namespace {
/// Number of the most recent signal seen by signal_handler(); remains 0 until one arrives.
/// Typed as volatile std::sig_atomic_t so that it can be assigned from within a signal handler.
volatile std::sig_atomic_t gSignalStatus{0};
}  // namespace
27 
32 void signal_handler(int signal)
33 {
34  gSignalStatus = signal;
35 }
36 
37 artdaq::TransferWrapper::TransferWrapper(const fhicl::ParameterSet& pset)
38  : timeoutInUsecs_(pset.get<std::size_t>("timeoutInUsecs", 100000))
39  , last_received_data_()
40  , last_report_(std::chrono::steady_clock::now())
41  , transfer_(nullptr)
42  , commander_(nullptr)
43  , pset_(pset)
44  , dispatcherHost_(pset.get<std::string>("dispatcherHost", "localhost"))
45  , dispatcherPort_(pset.get<std::string>("dispatcherPort", "5266"))
46  , serverUrl_(pset.get<std::string>("server_url", "http://" + dispatcherHost_ + ":" + dispatcherPort_ + "/RPC2"))
47  , maxEventsBeforeInit_(pset.get<std::size_t>("maxEventsBeforeInit", 5))
48  , allowedFragmentTypes_(pset.get<std::vector<int>>("allowedFragmentTypes", {226, 227, 229}))
49  , runningStateTimeout_(pset.get<double>("dispatcherConnectTimeout", 0))
50  , runningStateInterval_us_(pset.get<size_t>("dispatcherConnectRetryInterval_us", 1000000))
51  , quitOnFragmentIntegrityProblem_(pset.get<bool>("quitOnFragmentIntegrityProblem", true))
52  , multi_run_mode_(pset.get<bool>("allowMultipleRuns", false))
53  , monitorRegistered_(false)
54 {
55  std::signal(SIGINT, signal_handler);
56 
57  try
58  {
59  if (metricMan)
60  {
61  metricMan->initialize(pset.get<fhicl::ParameterSet>("metrics", fhicl::ParameterSet()), "Online Monitor");
62  metricMan->do_start();
63  }
64  }
65  catch (...)
66  {
67  ExceptionHandler(ExceptionHandlerRethrow::no, "TransferWrapper: could not configure metrics");
68  }
69 
70  // Clamp possible values
71  if (runningStateInterval_us_ < 1000)
72  {
73  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 1000 us";
74  runningStateInterval_us_ = 1000;
75  }
76  if (runningStateInterval_us_ > 30000000)
77  {
78  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 30,000,000 us";
79  runningStateInterval_us_ = 30000000;
80  }
81 
82  fhicl::ParameterSet new_pset(pset);
83  if (!new_pset.has_key("server_url"))
84  {
85  new_pset.put<std::string>("server_url", serverUrl_);
86  }
87 
89  commander_ = MakeCommanderPlugin(new_pset, c);
90 }
91 
93 {
94  artdaq::FragmentPtrs fragmentPtrs;
95  bool receivedFragment = false;
96  static bool initialized = false;
97  static size_t fragments_received = 0;
98 
99  while (gSignalStatus == 0)
100  {
101  receivedFragment = false;
102  auto fragmentPtr = std::make_unique<artdaq::Fragment>();
103 
104  while (!receivedFragment)
105  {
106  if (gSignalStatus != 0)
107  {
108  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
109  unregisterMonitor();
110  return fragmentPtrs;
111  }
112  if (!monitorRegistered_)
113  {
114  registerMonitor();
115  if (!monitorRegistered_)
116  {
117  return fragmentPtrs;
118  }
119  }
120 
121  try
122  {
123  auto result = transfer_->receiveFragment(*fragmentPtr, timeoutInUsecs_);
124 
126  {
127  receivedFragment = true;
128  fragments_received++;
129 
130  static size_t cntr = 0;
131  auto mod = ++cntr % 10;
132  auto suffix = "-th";
133  if (mod == 1)
134  {
135  suffix = "-st";
136  }
137  if (mod == 2)
138  {
139  suffix = "-nd";
140  }
141  if (mod == 3)
142  {
143  suffix = "-rd";
144  }
145  TLOG(TLVL_INFO) << "Received " << cntr << suffix << " event, "
146  << "seqID == " << fragmentPtr->sequenceID()
147  << ", type == " << fragmentPtr->typeString();
148  last_received_data_ = std::chrono::steady_clock::now();
149  continue;
150  }
152  {
153  TLOG(TLVL_ERROR) << "Transfer Plugin disconnected or other unrecoverable error. Shutting down.";
154  if (multi_run_mode_)
155  {
156  unregisterMonitor();
157  initialized = false;
158  continue;
159  }
160  return fragmentPtrs;
161  }
162  else
163  {
164  auto tlvl = TLVL_DEBUG + 33;
165  if (artdaq::TimeUtils::GetElapsedTime(last_report_) > 1.0 && artdaq::TimeUtils::GetElapsedTime(last_received_data_) > 1.0)
166  {
167  tlvl = TLVL_WARNING;
168  last_report_ = std::chrono::steady_clock::now();
169  }
170 
171  auto last_received_milliseconds = artdaq::TimeUtils::GetElapsedTimeMilliseconds(last_received_data_);
172 
173  // 02-Jun-2018, KAB: added status/result printout
174  // to-do: add another else clause that explicitly checks for RECV_TIMEOUT
175  TLOG(tlvl) << "Timeout occurred in call to transfer_->receiveFragmentFrom; will try again"
176  << ", status = " << result << ", last received data " << last_received_milliseconds << " ms ago.";
177  }
178  }
179  catch (...)
180  {
181  ExceptionHandler(ExceptionHandlerRethrow::yes,
182  "Problem receiving data in TransferWrapper::receiveMessage");
183  }
184  }
185 
186  if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
187  {
188  // if (monitorRegistered_)
189  //{
190  // unregisterMonitor();
191  // }
192  if (multi_run_mode_)
193  {
194  unregisterMonitor();
195  initialized = false;
196  continue;
197  }
198 
199  return fragmentPtrs;
200  }
201 
202  checkIntegrity(*fragmentPtr);
203 
204  if (initialized || fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
205  {
206  initialized = true;
207  fragmentPtrs.push_back(std::move(fragmentPtr));
208  break;
209  }
210 
211  if (fragments_received > maxEventsBeforeInit_)
212  {
213  throw cet::exception("TransferWrapper") << "First " << maxEventsBeforeInit_ << " events received did not include the \"Init\" event containing necessary info for art; exiting..."; // NOLINT(cert-err60-cpp)
214  }
215  }
216 
217  return fragmentPtrs;
218 }
219 
220 std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> artdaq::TransferWrapper::receiveMessages()
221 {
222  std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> output;
223 
224  auto ptrs = receiveMessage();
225  for (auto& ptr : ptrs)
226  {
227  auto fragType = ptr->type();
228  auto fragPtr = ptr.release();
229  ptr.reset(nullptr);
230 
231  if (output.count(fragType) == 0u)
232  {
233  output[fragType] = std::make_unique<artdaq::Fragments>();
234  }
235 
236  output[fragType]->emplace_back(std::move(*fragPtr));
237  }
238 
239  return output;
240 }
241 
242 void artdaq::TransferWrapper::checkIntegrity(const artdaq::Fragment& fragment) const
243 {
244  const size_t artdaqheader = artdaq::detail::RawFragmentHeader::num_words() *
245  sizeof(artdaq::detail::RawFragmentHeader::RawDataType);
246  const auto payload = static_cast<size_t>(fragment.dataEndBytes() - fragment.dataBeginBytes());
247  const size_t metadata = sizeof(artdaq::NetMonHeader);
248  const size_t totalsize = fragment.sizeBytes();
249 
250  const auto type = static_cast<size_t>(fragment.type());
251 
252  if (totalsize != artdaqheader + metadata + payload)
253  {
254  std::stringstream errmsg;
255  errmsg << "Error: artdaq fragment of type " << fragment.typeString() << ", sequence ID " << fragment.sequenceID() << " has internally inconsistent measures of its size, signalling data corruption: in bytes,"
256  << " total size = " << totalsize << ", artdaq fragment header = " << artdaqheader << ", metadata = " << metadata << ", payload = " << payload;
257 
258  TLOG(TLVL_ERROR) << errmsg.str();
259 
260  if (quitOnFragmentIntegrityProblem_)
261  {
262  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
263  }
264 
265  return;
266  }
267 
268  auto findloc = std::find(allowedFragmentTypes_.begin(), allowedFragmentTypes_.end(), static_cast<int>(type));
269 
270  if (findloc == allowedFragmentTypes_.end())
271  {
272  std::stringstream errmsg;
273  errmsg << "Error: artdaq fragment appears to have type "
274  << type << ", not found in the allowed fragment types list";
275 
276  TLOG(TLVL_ERROR) << errmsg.str();
277  if (quitOnFragmentIntegrityProblem_)
278  {
279  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
280  }
281 
282  return;
283  }
284 }
285 
286 void artdaq::TransferWrapper::registerMonitor()
287 {
288  try
289  {
290  transfer_.reset(nullptr);
291  transfer_ = MakeTransferPlugin(pset_, "transfer_plugin", TransferInterface::Role::kReceive);
292  }
293  catch (...)
294  {
295  ExceptionHandler(ExceptionHandlerRethrow::yes,
296  "TransferWrapper: failure in call to MakeTransferPlugin");
297  }
298 
299  auto start = std::chrono::steady_clock::now();
300  auto sts = getDispatcherStatus();
301  while (sts != "Running" && (runningStateTimeout_ == 0 || TimeUtils::GetElapsedTime(start) < runningStateTimeout_))
302  {
303  TLOG(TLVL_DEBUG + 32) << "Dispatcher state: " << sts;
304  if (gSignalStatus != 0)
305  {
306  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
307  return;
308  }
309  TLOG(TLVL_INFO) << "Waited " << std::fixed << std::setprecision(2) << TimeUtils::GetElapsedTime(start) << " s / " << runningStateTimeout_ << " s for Dispatcher to enter the Running state (state=" << sts << ")";
310  usleep(runningStateInterval_us_);
311  sts = getDispatcherStatus();
312  }
313  if (sts != "Running")
314  {
315  return;
316  }
317 
318  auto dispatcherConfig = pset_.get<fhicl::ParameterSet>("dispatcher_config");
319 
320  int retry = 3;
321 
322  while (retry > 0)
323  {
324  TLOG(TLVL_INFO) << "Attempting to register this monitor (\"" << transfer_->uniqueLabel()
325  << "\") with the dispatcher aggregator";
326 
327  auto status = commander_->send_register_monitor(dispatcherConfig.to_string());
328 
329  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
330 
331  if (status == "Success")
332  {
333  monitorRegistered_ = true;
334  break;
335  }
336 
337  TLOG(TLVL_WARNING) << "Error in TransferWrapper: attempt to register with dispatcher did not result in the \"Success\" response";
338  usleep(100000);
339 
340  retry--;
341  }
342 }
343 
344 void artdaq::TransferWrapper::unregisterMonitor()
345 {
346  if (!monitorRegistered_)
347  {
348  TLOG(TLVL_WARNING) << "The function to unregister the monitor was called, but the monitor doesn't appear to be registered";
349  return;
350  }
351 
352  auto start_time = std::chrono::steady_clock::now();
353  bool waiting = true;
354  while (artdaq::TimeUtils::GetElapsedTime(start_time) < 5.0 && waiting)
355  {
356  std::string sts = getDispatcherStatus();
357 
358  if (sts.empty())
359  return;
360 
361  if (sts == "busy")
362  {
363  TLOG(TLVL_INFO) << "The Dispatcher returned \"busy\", will wait 0.5s and retry";
364  usleep(500000);
365  continue;
366  }
367 
368  if (sts != "Running" && sts != "Ready")
369  {
370  TLOG(TLVL_WARNING) << "The Dispatcher is not in the Running or Ready state, will not attempt to unregister (state: " << sts << ")";
371  return;
372  }
373  waiting = false;
374  }
375  if (waiting)
376  {
377  TLOG(TLVL_WARNING) << "A timeout occurred waiting for the Dispatcher to leave the \"busy\" state, will not attempt to unregister";
378  return;
379  }
380 
381  int retry = 3;
382  while (retry > 0)
383  {
384  TLOG(TLVL_INFO) << "Requesting that this monitor (" << transfer_->uniqueLabel()
385  << ") be unregistered from the dispatcher aggregator";
386 
387  auto status = commander_->send_unregister_monitor(transfer_->uniqueLabel());
388 
389  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
390 
391  if (status == "Success")
392  {
393  break;
394  }
395  else if (status == "busy")
396  {
397  TLOG(TLVL_DEBUG + 32) << "The Dispatcher returned \"busy\", will retry in 0.5s";
398  }
399  else
400  {
401  TLOG(TLVL_WARNING) << "The Dispatcher returned status " << status << " when attempting to unregister this monitor!";
402  // throw cet::exception("TransferWrapper") << "Error in TransferWrapper: attempt to unregister with dispatcher did not result in the \"Success\" response";
403  }
404  retry--;
405  usleep(500000);
406  }
407 
408  TLOG(TLVL_INFO) << "Successfully unregistered the monitor from the Dispatcher";
409  monitorRegistered_ = false;
410 }
411 
412 std::string artdaq::TransferWrapper::getDispatcherStatus()
413 {
414  try
415  {
416  return commander_->send_status();
417  }
418  catch (std::exception const& ex)
419  {
420  TLOG(TLVL_WARNING) << "An exception was thrown trying to collect the Dispatcher's status. Most likely cause is the application is no longer running.";
421  return "";
422  }
423 }
424 
426 {
427  if (monitorRegistered_)
428  {
429  try
430  {
431  unregisterMonitor();
432  }
433  catch (...)
434  {
435  ExceptionHandler(ExceptionHandlerRethrow::no,
436  "An exception occurred when trying to unregister monitor during TransferWrapper's destruction");
437  }
438  }
440 }
Commandable is the base class for all artdaq components which implement the artdaq state machine...
Definition: Commandable.hh:22
TransferWrapper(const fhicl::ParameterSet &pset)
TransferWrapper Constructor.
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:156
virtual ~TransferWrapper()
TransferWrapper Destructor.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
artdaq::FragmentPtrs receiveMessage()
Receive a Fragment from the TransferInterface, and send it to art.
This TransferInterface is a Receiver.
Header with length information for NetMonTransport messages.
Definition: NetMonHeader.hh:13
volatile std::sig_atomic_t gSignalStatus
Stores the signal number recorded by the signal handler.
std::unique_ptr< artdaq::CommanderInterface > MakeCommanderPlugin(const fhicl::ParameterSet &commander_pset, artdaq::Commandable &commandable)
Load a CommanderInterface plugin.
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
For code clarity, things checking for successful receive should check retval >= NO_RANK_INFO.
std::unordered_map< artdaq::Fragment::type_t, std::unique_ptr< artdaq::Fragments > > receiveMessages()
Receive all messages for an event from ArtdaqSharedMemoryService.