artdaq  v3_11_01
TransferWrapper.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME "TransferWrapper"
3 
4 #include "artdaq-core/Data/Fragment.hh"
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq-core/Utilities/TimeUtils.hh"
7 #include "artdaq/ArtModules/detail/TransferWrapper.hh"
8 #include "artdaq/DAQdata/NetMonHeader.hh"
9 #include "artdaq/ExternalComms/MakeCommanderPlugin.hh"
10 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
11 
12 #include "cetlib/BasicPluginFactory.h"
13 #include "cetlib_except/exception.h"
14 #include "fhiclcpp/ParameterSet.h"
15 
16 #include <csignal>
17 #include <iostream>
18 #include <limits>
19 #include <memory>
20 #include <sstream>
21 #include <string>
22 
namespace {
/// Number of the most recent signal passed to signal_handler(); stays 0 until the handler runs.
volatile std::sig_atomic_t gSignalStatus{0};
}  // namespace

/// Record the delivered signal number in gSignalStatus.
///
/// The handler only stores the value; the rest of this file polls gSignalStatus
/// to decide when to stop.
///
/// @param signal Signal number handed to the handler.
void signal_handler(int signal)
{
	gSignalStatus = signal;
}
35 
// Construct the wrapper from its FHiCL configuration.
//
// Parameters read from pset (default in parentheses):
//   timeoutInUsecs (100000), dispatcherHost ("localhost"), dispatcherPort ("5266"),
//   server_url ("http://<dispatcherHost>:<dispatcherPort>/RPC2"), maxEventsBeforeInit (5),
//   allowedFragmentTypes ({226, 227, 229}), dispatcherConnectTimeout (0),
//   dispatcherConnectRetryInterval_us (1000000), quitOnFragmentIntegrityProblem (true),
//   allowMultipleRuns (false), metrics (empty ParameterSet).
//
// Besides storing these values the constructor installs signal_handler for SIGINT, initialises and
// starts metricMan when it is set, clamps the retry interval, and creates the commander plugin.
// The transfer plugin itself is not created here (transfer_ starts as nullptr).
36 artdaq::TransferWrapper::TransferWrapper(const fhicl::ParameterSet& pset)
37  : timeoutInUsecs_(pset.get<std::size_t>("timeoutInUsecs", 100000))
38  , last_received_data_()
39  , last_report_(std::chrono::steady_clock::now())
40  , transfer_(nullptr)
41  , commander_(nullptr)
42  , pset_(pset)
43  , dispatcherHost_(pset.get<std::string>("dispatcherHost", "localhost"))
44  , dispatcherPort_(pset.get<std::string>("dispatcherPort", "5266"))
// The default server_url is assembled from dispatcherHost_ and dispatcherPort_.
// NOTE(review): members are initialised in declaration order, so this relies on both being declared
// before serverUrl_ in the class definition — confirm in the header.
45  , serverUrl_(pset.get<std::string>("server_url", "http://" + dispatcherHost_ + ":" + dispatcherPort_ + "/RPC2"))
46  , maxEventsBeforeInit_(pset.get<std::size_t>("maxEventsBeforeInit", 5))
47  , allowedFragmentTypes_(pset.get<std::vector<int>>("allowedFragmentTypes", {226, 227, 229}))
48  , runningStateTimeout_(pset.get<double>("dispatcherConnectTimeout", 0))
49  , runningStateInterval_us_(pset.get<size_t>("dispatcherConnectRetryInterval_us", 1000000))
50  , quitOnFragmentIntegrityProblem_(pset.get<bool>("quitOnFragmentIntegrityProblem", true))
51  , multi_run_mode_(pset.get<bool>("allowMultipleRuns", false))
52  , monitorRegistered_(false)
53 {
// Route SIGINT to signal_handler so the loops in this file can notice it via gSignalStatus.
54  std::signal(SIGINT, signal_handler);
55 
// Metrics are optional: a failure here is passed to ExceptionHandler with rethrow disabled,
// so construction continues.
56  try
57  {
58  if (metricMan)
59  {
60  metricMan->initialize(pset.get<fhicl::ParameterSet>("metrics", fhicl::ParameterSet()), "Online Monitor");
61  metricMan->do_start();
62  }
63  }
64  catch (...)
65  {
66  ExceptionHandler(ExceptionHandlerRethrow::no, "TransferWrapper: could not configure metrics");
67  }
68 
// Keep the retry interval within [1000, 30000000] microseconds, warning when the configured value is adjusted.
69  // Clamp possible values
70  if (runningStateInterval_us_ < 1000)
71  {
72  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 1000 us";
73  runningStateInterval_us_ = 1000;
74  }
75  if (runningStateInterval_us_ > 30000000)
76  {
77  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 30,000,000 us";
78  runningStateInterval_us_ = 30000000;
79  }
80 
// The commander plugin is configured from a copy of pset that is guaranteed to carry "server_url"
// (the computed serverUrl_ is inserted only when the key is absent).
81  fhicl::ParameterSet new_pset(pset);
82  if (!new_pset.has_key("server_url"))
83  {
84  new_pset.put<std::string>("server_url", serverUrl_);
85  }
86 
// NOTE(review): `c` is not declared anywhere in this listing; listing line 87 appears to be missing.
// The accompanying reference text shows MakeCommanderPlugin taking an `artdaq::Commandable&`, so the
// missing line presumably declares a Commandable named `c` — confirm against the real file.
88  commander_ = MakeCommanderPlugin(new_pset, c);
89 }
90 
// NOTE(review): the line carrying this function's signature (listing line 91) is absent from this
// listing; the accompanying reference text names it `artdaq::FragmentPtrs receiveMessage()` — confirm.
//
// Receives fragments through transfer_ until one can be handed back, and returns it in a FragmentPtrs
// list holding that single fragment. The list is returned EMPTY when:
//   - gSignalStatus becomes non-zero,
//   - the monitor cannot be registered,
//   - the disconnect branch is taken and multi_run_mode_ is false,
//   - an EndOfData fragment arrives and multi_run_mode_ is false.
// Throws cet::exception when too many fragments arrive before the Init fragment, and lets
// checkIntegrity() / receive-path exceptions propagate.
92 {
93  artdaq::FragmentPtrs fragmentPtrs;
94  bool receivedFragment = false;
// Function-local statics: this state persists across calls (and is shared by every TransferWrapper
// instance in the process).
// NOTE(review): fragments_received is never reset, even where `initialized` is reset for a new run.
95  static bool initialized = false;
96  static size_t fragments_received = 0;
97 
// Outer loop: one pass per fragment taken off the transfer; it ends via break/return or when a signal is seen.
98  while (gSignalStatus == 0)
99  {
100  receivedFragment = false;
101  auto fragmentPtr = std::make_unique<artdaq::Fragment>();
102 
// Inner loop: keep polling until a fragment has been received into *fragmentPtr.
103  while (!receivedFragment)
104  {
// A signal was recorded: unregister from the dispatcher and hand back the (empty) list.
105  if (gSignalStatus != 0)
106  {
107  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
108  unregisterMonitor();
109  return fragmentPtrs;
110  }
// (Re)register on demand; registerMonitor() also (re)creates transfer_. Give up if it did not succeed.
111  if (!monitorRegistered_)
112  {
113  registerMonitor();
114  if (!monitorRegistered_)
115  {
116  return fragmentPtrs;
117  }
118  }
119 
120  try
121  {
122  auto result = transfer_->receiveFragment(*fragmentPtr, timeoutInUsecs_);
123 
// NOTE(review): the `if (...)` guarding this branch (listing line 124) is missing from this listing.
// Judging by the body, this is the "fragment received" case — confirm the exact condition on `result`.
125  {
126  receivedFragment = true;
127  fragments_received++;
128 
// Build an ordinal suffix from the last decimal digit only.
// NOTE(review): this yields "11-st", "12-nd", "13-rd" for those counts.
129  static size_t cntr = 0;
130  auto mod = ++cntr % 10;
131  auto suffix = "-th";
132  if (mod == 1)
133  {
134  suffix = "-st";
135  }
136  if (mod == 2)
137  {
138  suffix = "-nd";
139  }
140  if (mod == 3)
141  {
142  suffix = "-rd";
143  }
144  TLOG(TLVL_INFO) << "Received " << cntr << suffix << " event, "
145  << "seqID == " << fragmentPtr->sequenceID()
146  << ", type == " << fragmentPtr->typeString();
147  last_received_data_ = std::chrono::steady_clock::now();
// `continue` re-tests the inner loop condition, which is now false, so the inner loop ends here.
148  continue;
149  }
// NOTE(review): the `else if (...)` guarding this branch (listing line 150) is missing from this listing.
// Judging by the log message, this is the "plugin disconnected / no more data" case — confirm.
151  {
152  TLOG(TLVL_ERROR) << "Transfer Plugin disconnected or other unrecoverable error. Shutting down.";
// In multi-run mode: unregister, forget that Init was seen, and go round the inner loop again
// (which attempts registration again if monitorRegistered_ is now false). Otherwise return the empty list.
153  if (multi_run_mode_)
154  {
155  unregisterMonitor();
156  initialized = false;
157  continue;
158  }
159  return fragmentPtrs;
160  }
161  else
162  {
// Nothing received this time. Log at TRACE level normally; escalate to WARNING when both the last
// report and the last received data are more than 1.0 old (presumably seconds — TODO confirm the
// unit of GetElapsedTime), and restart the report timer.
163  auto tlvl = TLVL_TRACE;
164  if (artdaq::TimeUtils::GetElapsedTime(last_report_) > 1.0 && artdaq::TimeUtils::GetElapsedTime(last_received_data_) > 1.0)
165  {
166  tlvl = TLVL_WARNING;
167  last_report_ = std::chrono::steady_clock::now();
168  }
169 
170  auto last_received_milliseconds = artdaq::TimeUtils::GetElapsedTimeMilliseconds(last_received_data_);
171 
172  // 02-Jun-2018, KAB: added status/result printout
173  // to-do: add another else clause that explicitly checks for RECV_TIMEOUT
// NOTE(review): the message names receiveFragmentFrom, but the call above is receiveFragment.
174  TLOG(tlvl) << "Timeout occurred in call to transfer_->receiveFragmentFrom; will try again"
175  << ", status = " << result << ", last received data " << last_received_milliseconds << " ms ago.";
176  }
177  }
// Any exception from the receive path is reported through ExceptionHandler with rethrow enabled.
178  catch (...)
179  {
180  ExceptionHandler(ExceptionHandlerRethrow::yes,
181  "Problem receiving data in TransferWrapper::receiveMessage");
182  }
183  }
184 
// End-of-data marker: in multi-run mode unregister, reset `initialized` and wait for more data;
// otherwise return the empty list. The EndOfData fragment itself is never handed back.
185  if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
186  {
187  //if (monitorRegistered_)
188  //{
189  // unregisterMonitor();
190  //}
191  if (multi_run_mode_)
192  {
193  unregisterMonitor();
194  initialized = false;
195  continue;
196  }
197 
198  return fragmentPtrs;
199  }
200 
// Size/type sanity check; may throw depending on quitOnFragmentIntegrityProblem_ (see checkIntegrity).
201  checkIntegrity(*fragmentPtr);
202 
// Fragments are only handed back once an Init fragment has been seen (the Init fragment included).
// Exactly one fragment is pushed before leaving the loop.
203  if (initialized || fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
204  {
205  initialized = true;
206  fragmentPtrs.push_back(std::move(fragmentPtr));
207  break;
208  }
209 
// Not initialised yet: this fragment is dropped (fragmentPtr is destroyed at the end of the iteration).
// The throw fires once the received count EXCEEDS maxEventsBeforeInit_, i.e. on fragment number
// maxEventsBeforeInit_ + 1, although the message says "First <maxEventsBeforeInit_> events".
210  if (fragments_received > maxEventsBeforeInit_)
211  {
212  throw cet::exception("TransferWrapper") << "First " << maxEventsBeforeInit_ << " events received did not include the \"Init\" event containing necessary info for art; exiting..."; // NOLINT(cert-err60-cpp)
213  }
214  }
215 
216  return fragmentPtrs;
217 }
218 
219 std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> artdaq::TransferWrapper::receiveMessages()
220 {
221  std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> output;
222 
223  auto ptrs = receiveMessage();
224  for (auto& ptr : ptrs)
225  {
226  auto fragType = ptr->type();
227  auto fragPtr = ptr.release();
228  ptr.reset(nullptr);
229 
230  if (output.count(fragType) == 0u)
231  {
232  output[fragType] = std::make_unique<artdaq::Fragments>();
233  }
234 
235  output[fragType]->emplace_back(std::move(*fragPtr));
236  }
237 
238  return output;
239 }
240 
241 void artdaq::TransferWrapper::checkIntegrity(const artdaq::Fragment& fragment) const
242 {
243  const size_t artdaqheader = artdaq::detail::RawFragmentHeader::num_words() *
244  sizeof(artdaq::detail::RawFragmentHeader::RawDataType);
245  const auto payload = static_cast<size_t>(fragment.dataEndBytes() - fragment.dataBeginBytes());
246  const size_t metadata = sizeof(artdaq::NetMonHeader);
247  const size_t totalsize = fragment.sizeBytes();
248 
249  const auto type = static_cast<size_t>(fragment.type());
250 
251  if (totalsize != artdaqheader + metadata + payload)
252  {
253  std::stringstream errmsg;
254  errmsg << "Error: artdaq fragment of type " << fragment.typeString() << ", sequence ID " << fragment.sequenceID() << " has internally inconsistent measures of its size, signalling data corruption: in bytes,"
255  << " total size = " << totalsize << ", artdaq fragment header = " << artdaqheader << ", metadata = " << metadata << ", payload = " << payload;
256 
257  TLOG(TLVL_ERROR) << errmsg.str();
258 
259  if (quitOnFragmentIntegrityProblem_)
260  {
261  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
262  }
263 
264  return;
265  }
266 
267  auto findloc = std::find(allowedFragmentTypes_.begin(), allowedFragmentTypes_.end(), static_cast<int>(type));
268 
269  if (findloc == allowedFragmentTypes_.end())
270  {
271  std::stringstream errmsg;
272  errmsg << "Error: artdaq fragment appears to have type "
273  << type << ", not found in the allowed fragment types list";
274 
275  TLOG(TLVL_ERROR) << errmsg.str();
276  if (quitOnFragmentIntegrityProblem_)
277  {
278  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
279  }
280 
281  return;
282  }
283 }
284 
285 void artdaq::TransferWrapper::registerMonitor()
286 {
287  try
288  {
289  transfer_.reset(nullptr);
290  transfer_ = MakeTransferPlugin(pset_, "transfer_plugin", TransferInterface::Role::kReceive);
291  }
292  catch (...)
293  {
294  ExceptionHandler(ExceptionHandlerRethrow::yes,
295  "TransferWrapper: failure in call to MakeTransferPlugin");
296  }
297 
298  auto start = std::chrono::steady_clock::now();
299  auto sts = getDispatcherStatus();
300  while (sts != "Running" && (runningStateTimeout_ == 0 || TimeUtils::GetElapsedTime(start) < runningStateTimeout_))
301  {
302  TLOG(TLVL_DEBUG) << "Dispatcher state: " << sts;
303  if (gSignalStatus != 0)
304  {
305  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
306  return;
307  }
308  TLOG(TLVL_INFO) << "Waited " << std::fixed << std::setprecision(2) << TimeUtils::GetElapsedTime(start) << " s / " << runningStateTimeout_ << " s for Dispatcher to enter the Running state (state=" << sts << ")";
309  usleep(runningStateInterval_us_);
310  sts = getDispatcherStatus();
311  }
312  if (sts != "Running")
313  {
314  return;
315  }
316 
317  auto dispatcherConfig = pset_.get<fhicl::ParameterSet>("dispatcher_config");
318 
319  int retry = 3;
320 
321  while (retry > 0)
322  {
323  TLOG(TLVL_INFO) << "Attempting to register this monitor (\"" << transfer_->uniqueLabel()
324  << "\") with the dispatcher aggregator";
325 
326  auto status = commander_->send_register_monitor(dispatcherConfig.to_string());
327 
328  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
329 
330  if (status == "Success")
331  {
332  monitorRegistered_ = true;
333  break;
334  }
335 
336  TLOG(TLVL_WARNING) << "Error in TransferWrapper: attempt to register with dispatcher did not result in the \"Success\" response";
337  usleep(100000);
338 
339  retry--;
340  }
341 }
342 
343 void artdaq::TransferWrapper::unregisterMonitor()
344 {
345  if (!monitorRegistered_)
346  {
347  TLOG(TLVL_WARNING) << "The function to unregister the monitor was called, but the monitor doesn't appear to be registered";
348  return;
349  }
350 
351  auto start_time = std::chrono::steady_clock::now();
352  bool waiting = true;
353  while (artdaq::TimeUtils::GetElapsedTime(start_time) < 5.0 && waiting)
354  {
355  std::string sts = getDispatcherStatus();
356 
357  if (sts.empty())
358  return;
359 
360  if (sts == "busy")
361  {
362  TLOG(TLVL_INFO) << "The Dispatcher returned \"busy\", will wait 0.5s and retry";
363  usleep(500000);
364  continue;
365  }
366 
367  if (sts != "Running" && sts != "Ready")
368  {
369  TLOG(TLVL_WARNING) << "The Dispatcher is not in the Running or Ready state, will not attempt to unregister (state: " << sts << ")";
370  return;
371  }
372  waiting = false;
373  }
374  if (waiting)
375  {
376  TLOG(TLVL_WARNING) << "A timeout occurred waiting for the Dispatcher to leave the \"busy\" state, will not attempt to unregister";
377  return;
378  }
379 
380  int retry = 3;
381  while (retry > 0)
382  {
383  TLOG(TLVL_INFO) << "Requesting that this monitor (" << transfer_->uniqueLabel()
384  << ") be unregistered from the dispatcher aggregator";
385 
386  auto status = commander_->send_unregister_monitor(transfer_->uniqueLabel());
387 
388  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
389 
390  if (status == "Success")
391  {
392  break;
393  }
394  else if (status == "busy")
395  {
396  TLOG(TLVL_DEBUG) << "The Dispatcher returned \"busy\", will retry in 0.5s";
397  }
398  else
399  {
400  TLOG(TLVL_WARNING) << "The Dispatcher returned status " << status << " when attempting to unregister this monitor!";
401  //throw cet::exception("TransferWrapper") << "Error in TransferWrapper: attempt to unregister with dispatcher did not result in the \"Success\" response";
402  }
403  retry--;
404  usleep(500000);
405  }
406 
407  TLOG(TLVL_INFO) << "Successfully unregistered the monitor from the Dispatcher";
408  monitorRegistered_ = false;
409 }
410 
411 std::string artdaq::TransferWrapper::getDispatcherStatus()
412 {
413  try
414  {
415  return commander_->send_status();
416  }
417  catch (std::exception const& ex)
418  {
419  TLOG(TLVL_WARNING) << "An exception was thrown trying to collect the Dispatcher's status. Most likely cause is the application is no longer running.";
420  return "";
421  }
422 }
423 
// NOTE(review): the signature line for this block (listing line 424) is missing from this listing;
// the accompanying reference text lists `virtual ~TransferWrapper()` — confirm against the real file.
//
// If the monitor is still registered, try to unregister it. Any exception raised while doing so is
// handed to ExceptionHandler with rethrow disabled (ExceptionHandlerRethrow::no).
425 {
426  if (monitorRegistered_)
427  {
428  try
429  {
430  unregisterMonitor();
431  }
432  catch (...)
433  {
434  ExceptionHandler(ExceptionHandlerRethrow::no,
435  "An exception occurred when trying to unregister monitor during TransferWrapper's destruction");
436  }
437  }
// NOTE(review): listing line 438 is missing between the block above and the closing brace. The
// reference text mentions `CleanUpGlobals()`, which may be what is called here — confirm.
439 }
Commandable is the base class for all artdaq components which implement the artdaq state machine...
Definition: Commandable.hh:20
TransferWrapper(const fhicl::ParameterSet &pset)
TransferWrapper Constructor.
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
virtual ~TransferWrapper()
TransferWrapper Destructor.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
artdaq::FragmentPtrs receiveMessage()
Receive a Fragment from the TransferInterface, and send it to art.
This TransferInterface is a Receiver.
Header with length information for NetMonTransport messages.
Definition: NetMonHeader.hh:13
volatile std::sig_atomic_t gSignalStatus
Stores signal from signal handler.
std::unique_ptr< artdaq::CommanderInterface > MakeCommanderPlugin(const fhicl::ParameterSet &commander_pset, artdaq::Commandable &commandable)
Load a CommanderInterface plugin.
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
For code clarity, things checking for successful receive should check retval >= NO_RANK_INFO.
std::unordered_map< artdaq::Fragment::type_t, std::unique_ptr< artdaq::Fragments > > receiveMessages()
Receive all messages for an event from ArtdaqSharedMemoryService.