artdaq  v3_09_03
TransferWrapper.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME "TransferWrapper"
3 
4 #include "artdaq-core/Data/Fragment.hh"
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq-core/Utilities/TimeUtils.hh"
7 #include "artdaq/ArtModules/detail/TransferWrapper.hh"
8 #include "artdaq/DAQdata/NetMonHeader.hh"
9 #include "artdaq/ExternalComms/MakeCommanderPlugin.hh"
10 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
11 
12 #include "cetlib/BasicPluginFactory.h"
13 #include "cetlib_except/exception.h"
14 #include "fhiclcpp/ParameterSet.h"
15 
16 #include <csignal>
17 #include <iostream>
18 #include <limits>
19 #include <memory>
20 #include <sstream>
21 #include <string>
22 
namespace {
/// Number of the most recent signal delivered to signal_handler(); 0 until one arrives.
volatile std::sig_atomic_t gSignalStatus{0};
}  // namespace

/// Signal handler: remembers which signal was delivered by storing its number
/// in gSignalStatus. It does nothing else.
/// @param signal Signal number handed to the handler.
void signal_handler(int signal)
{
	gSignalStatus = static_cast<std::sig_atomic_t>(signal);
}
35 
// TransferWrapper constructor: reads the configuration, installs the SIGINT handler,
// starts metrics (when a metric manager exists) and creates the commander plugin.
//
// Keys read from `pset`, with the default used when the key is absent:
//   timeoutInUsecs (100000), dispatcherHost ("localhost"), dispatcherPort ("5266"),
//   server_url ("http://" + dispatcherHost + ":" + dispatcherPort + "/RPC2"),
//   maxEventsBeforeInit (5), allowedFragmentTypes ({226, 227, 229}),
//   dispatcherConnectTimeout (0), dispatcherConnectRetryInterval_us (1000000),
//   quitOnFragmentIntegrityProblem (true), allowMultipleRuns (false),
//   metrics (empty ParameterSet).
//
// transfer_ is left null here; registerMonitor() is what creates the transfer plugin.
36 artdaq::TransferWrapper::TransferWrapper(const fhicl::ParameterSet& pset)
37  : timeoutInUsecs_(pset.get<std::size_t>("timeoutInUsecs", 100000))
38  , transfer_(nullptr)
39  , commander_(nullptr)
40  , pset_(pset)
41  , dispatcherHost_(pset.get<std::string>("dispatcherHost", "localhost"))
42  , dispatcherPort_(pset.get<std::string>("dispatcherPort", "5266"))
 // NOTE(review): this default is built from dispatcherHost_ and dispatcherPort_, which is only
 // valid if those members are declared before serverUrl_ in the class - confirm in the header.
43  , serverUrl_(pset.get<std::string>("server_url", "http://" + dispatcherHost_ + ":" + dispatcherPort_ + "/RPC2"))
44  , maxEventsBeforeInit_(pset.get<std::size_t>("maxEventsBeforeInit", 5))
45  , allowedFragmentTypes_(pset.get<std::vector<int>>("allowedFragmentTypes", {226, 227, 229}))
 // A timeout of 0 makes registerMonitor() wait for the "Running" state without a time limit.
46  , runningStateTimeout_(pset.get<double>("dispatcherConnectTimeout", 0))
47  , runningStateInterval_us_(pset.get<size_t>("dispatcherConnectRetryInterval_us", 1000000))
48  , quitOnFragmentIntegrityProblem_(pset.get<bool>("quitOnFragmentIntegrityProblem", true))
49  , multi_run_mode_(pset.get<bool>("allowMultipleRuns", false))
50  , monitorRegistered_(false)
51 {
 // From here on SIGINT only sets gSignalStatus; the receive and registration loops poll that flag.
52  std::signal(SIGINT, signal_handler);
53 
 // Metrics are best-effort: any exception is handed to ExceptionHandler with
 // ExceptionHandlerRethrow::no instead of being allowed to leave the constructor.
54  try
55  {
56  if (metricMan)
57  {
58  metricMan->initialize(pset.get<fhicl::ParameterSet>("metrics", fhicl::ParameterSet()), "Online Monitor");
59  metricMan->do_start();
60  }
61  }
62  catch (...)
63  {
64  ExceptionHandler(ExceptionHandlerRethrow::no, "TransferWrapper: could not configure metrics");
65  }
66 
67  // Clamp possible values
 // The retry interval is forced into the range [1000 us, 30000000 us].
68  if (runningStateInterval_us_ < 1000)
69  {
70  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 1000 us";
71  runningStateInterval_us_ = 1000;
72  }
73  if (runningStateInterval_us_ > 30000000)
74  {
75  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 30,000,000 us";
76  runningStateInterval_us_ = 30000000;
77  }
78 
 // The commander plugin gets a copy of the configuration that always contains "server_url":
 // the caller's value when present, otherwise the serverUrl_ computed above.
79  fhicl::ParameterSet new_pset(pset);
80  if (!new_pset.has_key("server_url"))
81  {
82  new_pset.put<std::string>("server_url", serverUrl_);
83  }
84 
 // NOTE(review): this listing skips its own line 85, which must declare `c` (used on the next
 // line). Presumably it is a local artdaq::Commandable object - confirm against the real source.
86  commander_ = MakeCommanderPlugin(new_pset, c);
87 }
88 
// Body of TransferWrapper::receiveMessage().
// NOTE(review): this listing skips its own line 89, which holds the function signature;
// presumably `artdaq::FragmentPtrs artdaq::TransferWrapper::receiveMessage()` - confirm.
//
// Blocks until one Fragment can be handed to the caller, or until there is a reason to stop.
// Returns a list holding at most one Fragment. The list is empty when:
//   - a signal was recorded in gSignalStatus,
//   - registerMonitor() did not manage to register the monitor, or
//   - an EndOfData Fragment arrived and multi_run_mode_ is false.
// Throws cet::exception when more than maxEventsBeforeInit_ Fragments were received without an
// Init Fragment, and whatever checkIntegrity() or the ExceptionHandler rethrow below throw.
90 {
91  artdaq::FragmentPtrs fragmentPtrs;
92  bool receivedFragment = false;
 // Function-local statics: they persist across calls and are shared by every TransferWrapper
 // instance in the process.
 //   initialized        - an Init Fragment has been seen (cleared again on disconnect / end of run)
 //   fragments_received - total number of Fragments received so far
93  static bool initialized = false;
94  static size_t fragments_received = 0;
95 
 // Outer loop: one iteration per received Fragment, until one is accepted or a signal arrives.
96  while (gSignalStatus == 0)
97  {
98  receivedFragment = false;
99  auto fragmentPtr = std::make_unique<artdaq::Fragment>();
100 
 // Inner loop: keep polling the transfer plugin until a Fragment has been read into fragmentPtr.
101  while (!receivedFragment)
102  {
103  if (gSignalStatus != 0)
104  {
105  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
106  unregisterMonitor();
107  return fragmentPtrs;
108  }
 // (Re-)register on demand; this also covers the pass after the disconnect branch below has
 // unregistered the monitor. Give up with an empty result if registration does not succeed.
109  if (!monitorRegistered_)
110  {
111  registerMonitor();
112  if (!monitorRegistered_)
113  {
114  return fragmentPtrs;
115  }
116  }
117 
118  try
119  {
120  auto result = transfer_->receiveFragment(*fragmentPtr, timeoutInUsecs_);
121 
 // NOTE(review): this listing skips its own line 122, the `if (...)` that guards the block below.
 // From the block's contents it is the "a Fragment was received" test on `result` - confirm.
123  {
124  receivedFragment = true;
125  fragments_received++;
126 
 // Pick an ordinal suffix for the log line from the last digit of the running count.
 // Only the last digit is examined, so 11, 12 and 13 are logged as "11-st", "12-nd", "13-rd".
127  static size_t cntr = 0;
128  auto mod = ++cntr % 10;
129  auto suffix = "-th";
130  if (mod == 1)
131  {
132  suffix = "-st";
133  }
134  if (mod == 2)
135  {
136  suffix = "-nd";
137  }
138  if (mod == 3)
139  {
140  suffix = "-rd";
141  }
142  TLOG(TLVL_INFO) << "Received " << cntr << suffix << " event, "
143  << "seqID == " << fragmentPtr->sequenceID()
144  << ", type == " << fragmentPtr->typeString();
 // receivedFragment is true now, so this `continue` ends the inner loop at its next condition check.
145  continue;
146  }
 // NOTE(review): this listing skips its own line 147, the `else if (...)` that guards the block
 // below. From the log text it is the "plugin disconnected / no more data" test on `result` - confirm.
 // Despite the "Shutting down" wording, the code does not return here: it unregisters, forgets the
 // Init state and retries, which re-registers at the top of the inner loop.
148  {
149  TLOG(TLVL_ERROR) << "Transfer Plugin disconnected or other unrecoverable error. Shutting down.";
150  unregisterMonitor();
151  initialized = false;
152  continue;
153  }
154  else
155  {
156  // 02-Jun-2018, KAB: added status/result printout
157  // to-do: add another else clause that explicitly checks for RECV_TIMEOUT
158  TLOG(TLVL_WARNING) << "Timeout occurred in call to transfer_->receiveFragmentFrom; will try again"
159  << ", status = " << result;
160  }
161  }
162  catch (...)
163  {
 // Called with ExceptionHandlerRethrow::yes, so the exception is expected to propagate to the caller.
164  ExceptionHandler(ExceptionHandlerRethrow::yes,
165  "Problem receiving data in TransferWrapper::receiveMessage");
166  }
167  }
168 
 // End of data: in multi-run mode drop the registration and the Init state and wait for the next
 // run's Fragments; otherwise return to the caller with an empty list.
169  if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
170  {
171  //if (monitorRegistered_)
172  //{
173  // unregisterMonitor();
174  //}
175  if (multi_run_mode_)
176  {
177  unregisterMonitor();
178  initialized = false;
179  continue;
180  }
181 
182  return fragmentPtrs;
183  }
184 
 // May throw when quitOnFragmentIntegrityProblem_ is set; otherwise problems are only logged.
185  checkIntegrity(*fragmentPtr);
186 
 // Fragments are only passed on once an Init Fragment has been seen; the Init Fragment itself
 // is passed on too. Anything received before that is dropped when fragmentPtr goes out of scope.
187  if (initialized || fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
188  {
189  initialized = true;
190  fragmentPtrs.push_back(std::move(fragmentPtr));
191  break;
192  }
193 
 // Still no Init Fragment: give up once more than maxEventsBeforeInit_ Fragments have been received.
194  if (fragments_received > maxEventsBeforeInit_)
195  {
196  throw cet::exception("TransferWrapper") << "First " << maxEventsBeforeInit_ << " events received did not include the \"Init\" event containing necessary info for art; exiting..."; // NOLINT(cert-err60-cpp)
197  }
198  }
199 
200  return fragmentPtrs;
201 }
202 
203 std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> artdaq::TransferWrapper::receiveMessages()
204 {
205  std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> output;
206 
207  auto ptrs = receiveMessage();
208  for (auto& ptr : ptrs)
209  {
210  auto fragType = ptr->type();
211  auto fragPtr = ptr.release();
212  ptr.reset(nullptr);
213 
214  if (output.count(fragType) == 0u)
215  {
216  output[fragType] = std::make_unique<artdaq::Fragments>();
217  }
218 
219  output[fragType]->emplace_back(std::move(*fragPtr));
220  }
221 
222  return output;
223 }
224 
225 void artdaq::TransferWrapper::checkIntegrity(const artdaq::Fragment& fragment) const
226 {
227  const size_t artdaqheader = artdaq::detail::RawFragmentHeader::num_words() *
228  sizeof(artdaq::detail::RawFragmentHeader::RawDataType);
229  const auto payload = static_cast<size_t>(fragment.dataEndBytes() - fragment.dataBeginBytes());
230  const size_t metadata = sizeof(artdaq::NetMonHeader);
231  const size_t totalsize = fragment.sizeBytes();
232 
233  const auto type = static_cast<size_t>(fragment.type());
234 
235  if (totalsize != artdaqheader + metadata + payload)
236  {
237  std::stringstream errmsg;
238  errmsg << "Error: artdaq fragment of type " << fragment.typeString() << ", sequence ID " << fragment.sequenceID() << " has internally inconsistent measures of its size, signalling data corruption: in bytes,"
239  << " total size = " << totalsize << ", artdaq fragment header = " << artdaqheader << ", metadata = " << metadata << ", payload = " << payload;
240 
241  TLOG(TLVL_ERROR) << errmsg.str();
242 
243  if (quitOnFragmentIntegrityProblem_)
244  {
245  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
246  }
247 
248  return;
249  }
250 
251  auto findloc = std::find(allowedFragmentTypes_.begin(), allowedFragmentTypes_.end(), static_cast<int>(type));
252 
253  if (findloc == allowedFragmentTypes_.end())
254  {
255  std::stringstream errmsg;
256  errmsg << "Error: artdaq fragment appears to have type "
257  << type << ", not found in the allowed fragment types list";
258 
259  TLOG(TLVL_ERROR) << errmsg.str();
260  if (quitOnFragmentIntegrityProblem_)
261  {
262  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
263  }
264 
265  return;
266  }
267 }
268 
269 void artdaq::TransferWrapper::registerMonitor()
270 {
271  try
272  {
273  transfer_.reset(nullptr);
274  transfer_ = MakeTransferPlugin(pset_, "transfer_plugin", TransferInterface::Role::kReceive);
275  }
276  catch (...)
277  {
278  ExceptionHandler(ExceptionHandlerRethrow::yes,
279  "TransferWrapper: failure in call to MakeTransferPlugin");
280  }
281 
282  auto start = std::chrono::steady_clock::now();
283  auto sts = getDispatcherStatus();
284  while (sts != "Running" && (runningStateTimeout_ == 0 || TimeUtils::GetElapsedTime(start) < runningStateTimeout_))
285  {
286  TLOG(TLVL_DEBUG) << "Dispatcher state: " << sts;
287  if (gSignalStatus != 0)
288  {
289  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
290  return;
291  }
292  TLOG(TLVL_INFO) << "Waited " << std::fixed << std::setprecision(2) << TimeUtils::GetElapsedTime(start) << " s / " << runningStateTimeout_ << " s for Dispatcher to enter the Running state (state=" << sts << ")";
293  usleep(runningStateInterval_us_);
294  sts = getDispatcherStatus();
295  }
296  if (sts != "Running")
297  {
298  return;
299  }
300 
301  auto dispatcherConfig = pset_.get<fhicl::ParameterSet>("dispatcher_config");
302 
303  int retry = 3;
304 
305  while (retry > 0)
306  {
307  TLOG(TLVL_INFO) << "Attempting to register this monitor (\"" << transfer_->uniqueLabel()
308  << "\") with the dispatcher aggregator";
309 
310  auto status = commander_->send_register_monitor(dispatcherConfig.to_string());
311 
312  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
313 
314  if (status == "Success")
315  {
316  monitorRegistered_ = true;
317  break;
318  }
319 
320  TLOG(TLVL_WARNING) << "Error in TransferWrapper: attempt to register with dispatcher did not result in the \"Success\" response";
321  usleep(100000);
322 
323  retry--;
324  }
325 }
326 
327 void artdaq::TransferWrapper::unregisterMonitor()
328 {
329  if (!monitorRegistered_)
330  {
331  TLOG(TLVL_WARNING) << "The function to unregister the monitor was called, but the monitor doesn't appear to be registered";
332  return;
333  }
334 
335  auto start_time = std::chrono::steady_clock::now();
336  bool waiting = true;
337  while (artdaq::TimeUtils::GetElapsedTime(start_time) < 5.0 && waiting)
338  {
339  std::string sts = getDispatcherStatus();
340 
341  if (sts.empty())
342  return;
343 
344  if (sts == "busy")
345  {
346  TLOG(TLVL_INFO) << "The Dispatcher returned \"busy\", will wait 0.5s and retry";
347  usleep(500000);
348  continue;
349  }
350 
351  if (sts != "Running" && sts != "Ready")
352  {
353  TLOG(TLVL_WARNING) << "The Dispatcher is not in the Running or Ready state, will not attempt to unregister (state: " << sts << ")";
354  return;
355  }
356  waiting = false;
357  }
358  if (waiting)
359  {
360  TLOG(TLVL_WARNING) << "A timeout occurred waiting for the Dispatcher to leave the \"busy\" state, will not attempt to unregister";
361  return;
362  }
363 
364  int retry = 3;
365  while (retry > 0)
366  {
367  TLOG(TLVL_INFO) << "Requesting that this monitor (" << transfer_->uniqueLabel()
368  << ") be unregistered from the dispatcher aggregator";
369 
370  auto status = commander_->send_unregister_monitor(transfer_->uniqueLabel());
371 
372  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
373 
374  if (status == "Success")
375  {
376  break;
377  }
378  else if (status == "busy")
379  {
380  TLOG(TLVL_DEBUG) << "The Dispatcher returned \"busy\", will retry in 0.5s";
381  }
382  else
383  {
384  TLOG(TLVL_WARNING) << "The Dispatcher returned status " << status << " when attempting to unregister this monitor!";
385  //throw cet::exception("TransferWrapper") << "Error in TransferWrapper: attempt to unregister with dispatcher did not result in the \"Success\" response";
386  }
387  retry--;
388  usleep(500000);
389  }
390 
391  TLOG(TLVL_INFO) << "Successfully unregistered the monitor from the Dispatcher";
392  monitorRegistered_ = false;
393 }
394 
395 std::string artdaq::TransferWrapper::getDispatcherStatus()
396 {
397  try
398  {
399  return commander_->send_status();
400  }
401  catch (std::exception const& ex)
402  {
403  TLOG(TLVL_WARNING) << "An exception was thrown trying to collect the Dispatcher's status. Most likely cause is the application is no longer running.";
404  return "";
405  }
406 }
407 
// Body of the TransferWrapper destructor.
// NOTE(review): this listing skips its own line 408, which holds the signature; presumably
// `artdaq::TransferWrapper::~TransferWrapper()` - confirm against the real source.
//
// Unregisters the monitor from the Dispatcher if it is still registered. Any exception from
// that attempt is handed to ExceptionHandler with ExceptionHandlerRethrow::no.
409 {
410  if (monitorRegistered_)
411  {
412  try
413  {
414  unregisterMonitor();
415  }
416  catch (...)
417  {
418  ExceptionHandler(ExceptionHandlerRethrow::no,
419  "An exception occurred when trying to unregister monitor during TransferWrapper's destruction");
420  }
421  }
 // NOTE(review): this listing also skips its own line 422, a statement that belongs here.
 // Presumably it is a call to artdaq::Globals::CleanUpGlobals() - confirm against the real source.
423 }
Commandable is the base class for all artdaq components which implement the artdaq state machine...
Definition: Commandable.hh:20
TransferWrapper(const fhicl::ParameterSet &pset)
TransferWrapper Constructor.
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
virtual ~TransferWrapper()
TransferWrapper Destructor.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
artdaq::FragmentPtrs receiveMessage()
Receive a Fragment from the TransferInterface, and send it to art.
This TransferInterface is a Receiver.
Header with length information for NetMonTransport messages.
Definition: NetMonHeader.hh:13
volatile std::sig_atomic_t gSignalStatus
Stores the signal number recorded by the signal handler.
std::unique_ptr< artdaq::CommanderInterface > MakeCommanderPlugin(const fhicl::ParameterSet &commander_pset, artdaq::Commandable &commandable)
Load a CommanderInterface plugin.
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
For code clarity, things checking for successful receive should check retval >= NO_RANK_INFO.
std::unordered_map< artdaq::Fragment::type_t, std::unique_ptr< artdaq::Fragments > > receiveMessages()
Receive all messages for an event from ArtdaqSharedMemoryService.