artdaq  v3_09_01
TransferWrapper.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME "TransferWrapper"
3 
4 #include "artdaq-core/Data/Fragment.hh"
5 #include "artdaq-core/Utilities/ExceptionHandler.hh"
6 #include "artdaq/ArtModules/detail/TransferWrapper.hh"
7 #include "artdaq/DAQdata/NetMonHeader.hh"
8 #include "artdaq/ExternalComms/MakeCommanderPlugin.hh"
9 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
10 
11 #include "cetlib/BasicPluginFactory.h"
12 #include "cetlib_except/exception.h"
13 #include "fhiclcpp/ParameterSet.h"
14 
15 #include <csignal>
16 #include <iostream>
17 #include <limits>
18 #include <memory>
19 #include <sstream>
20 #include <string>
21 
namespace {
/// Number of the most recently handled signal; stays 0 until signal_handler runs.
volatile std::sig_atomic_t gSignalStatus{0};
}  // namespace

/// Signal handler installed for SIGINT: records the delivered signal number in
/// gSignalStatus so the receive loops can notice it and stop.
/// @param signal Number of the signal being handled.
void signal_handler(int signal)
{
    gSignalStatus = signal;
}
34 
// TransferWrapper constructor.
// Reads its settings from the FHiCL parameter set `pset` (keys and defaults are
// spelled out in the initializer list below), installs signal_handler for SIGINT,
// starts metric reporting when metricMan is set, clamps the dispatcher retry
// interval to the range [1000, 30000000] us, and creates the commander plugin.
// transfer_ stays null here; it is created later in registerMonitor().
35 artdaq::TransferWrapper::TransferWrapper(const fhicl::ParameterSet& pset)
36  : timeoutInUsecs_(pset.get<std::size_t>("timeoutInUsecs", 100000))
37  , transfer_(nullptr)
38  , commander_(nullptr)
39  , pset_(pset)
40  , dispatcherHost_(pset.get<std::string>("dispatcherHost", "localhost"))
41  , dispatcherPort_(pset.get<std::string>("dispatcherPort", "5266"))
// server_url defaults to an address assembled from dispatcherHost_ and dispatcherPort_
42  , serverUrl_(pset.get<std::string>("server_url", "http://" + dispatcherHost_ + ":" + dispatcherPort_ + "/RPC2"))
43  , maxEventsBeforeInit_(pset.get<std::size_t>("maxEventsBeforeInit", 5))
44  , allowedFragmentTypes_(pset.get<std::vector<int>>("allowedFragmentTypes", {226, 227, 229}))
// A dispatcherConnectTimeout of 0 makes registerMonitor() wait without a time limit
45  , runningStateTimeout_(pset.get<double>("dispatcherConnectTimeout", 0))
46  , runningStateInterval_us_(pset.get<size_t>("dispatcherConnectRetryInterval_us", 1000000))
47  , quitOnFragmentIntegrityProblem_(pset.get<bool>("quitOnFragmentIntegrityProblem", true))
48  , multi_run_mode_(pset.get<bool>("allowMultipleRuns", false))
49  , monitorRegistered_(false)
50 {
// Route SIGINT to signal_handler, which records the signal number in gSignalStatus
51  std::signal(SIGINT, signal_handler);
52 
// Metrics setup is best-effort: any exception is handed to ExceptionHandler with rethrow disabled
53  try
54  {
55  if (metricMan)
56  {
57  metricMan->initialize(pset.get<fhicl::ParameterSet>("metrics", fhicl::ParameterSet()), "Online Monitor");
58  metricMan->do_start();
59  }
60  }
61  catch (...)
62  {
63  ExceptionHandler(ExceptionHandlerRethrow::no, "TransferWrapper: could not configure metrics");
64  }
65 
66  // Clamp possible values
67  if (runningStateInterval_us_ < 1000)
68  {
69  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 1000 us";
70  runningStateInterval_us_ = 1000;
71  }
72  if (runningStateInterval_us_ > 30000000)
73  {
74  TLOG(TLVL_WARNING) << "Invalid value " << runningStateInterval_us_ << " us detected for dispatcherConnectRetryInterval_us. Setting to 30,000,000 us";
75  runningStateInterval_us_ = 30000000;
76  }
77 
// Give the commander plugin a copy of the configuration that always contains
// server_url (the value computed above is inserted when the key is absent)
78  fhicl::ParameterSet new_pset(pset);
79  if (!new_pset.has_key("server_url"))
80  {
81  new_pset.put<std::string>("server_url", serverUrl_);
82  }
83 
// NOTE(review): `c` is not declared anywhere in this listing (inner line 84 is
// missing) -- presumably an artdaq::Commandable, given MakeCommanderPlugin's
// second parameter; confirm against the original file.
85  commander_ = MakeCommanderPlugin(new_pset, c);
86 }
87 
// Body of the fragment receive loop (named TransferWrapper::receiveMessage in the
// error text below). NOTE(review): the signature line is missing from this listing.
// Loops until one fragment can be handed on, end of data is reached, registration
// with the dispatcher fails, or SIGINT has been recorded in gSignalStatus.
// Returns fragmentPtrs, which holds either no entries or exactly one fragment.
// Throws cet::exception when more than maxEventsBeforeInit_ fragments have been
// received and none was an Init fragment; checkIntegrity() may also throw.
89 {
90  artdaq::FragmentPtrs fragmentPtrs;
91  bool receivedFragment = false;
// Function-local statics: these keep their values between calls
92  static bool initialized = false;
93  static size_t fragments_received = 0;
94 
95  while (gSignalStatus == 0)
96  {
97  receivedFragment = false;
98  auto fragmentPtr = std::make_unique<artdaq::Fragment>();
99 
// Keep polling the transfer plugin until a fragment arrives; every pass first
// checks for SIGINT and makes sure this monitor is registered
100  while (!receivedFragment)
101  {
102  if (gSignalStatus != 0)
103  {
104  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
105  unregisterMonitor();
106  return fragmentPtrs;
107  }
108  if (!monitorRegistered_)
109  {
110  registerMonitor();
// registerMonitor() leaves the flag false when registration did not succeed
111  if (!monitorRegistered_)
112  {
113  return fragmentPtrs;
114  }
115  }
116 
117  try
118  {
119  auto result = transfer_->receiveFragment(*fragmentPtr, timeoutInUsecs_);
120 
// NOTE(review): the `if` condition opening this branch is missing from this
// listing (inner line 121); the branch body handles a received fragment.
122  {
123  receivedFragment = true;
124  fragments_received++;
125 
// Pick an ordinal suffix from the last digit of the running count only
// (so counts such as 11, 12 and 13 get "-st", "-nd" and "-rd")
126  static size_t cntr = 0;
127  auto mod = ++cntr % 10;
128  auto suffix = "-th";
129  if (mod == 1)
130  {
131  suffix = "-st";
132  }
133  if (mod == 2)
134  {
135  suffix = "-nd";
136  }
137  if (mod == 3)
138  {
139  suffix = "-rd";
140  }
141  TLOG(TLVL_INFO) << "Received " << cntr << suffix << " event, "
142  << "seqID == " << fragmentPtr->sequenceID()
143  << ", type == " << fragmentPtr->typeString();
// receivedFragment is true now, so this ends the inner loop
144  continue;
145  }
// NOTE(review): the `else if` condition for this branch is missing from this
// listing (inner line 146); per the log text it covers a disconnect or other
// unrecoverable error reported by the transfer plugin.
147  {
148  TLOG(TLVL_ERROR) << "Transfer Plugin disconnected or other unrecoverable error. Shutting down.";
149  unregisterMonitor();
150  initialized = false;
// receivedFragment is still false, so the inner loop runs again
151  continue;
152  }
153  else
154  {
155  // 02-Jun-2018, KAB: added status/result printout
156  // to-do: add another else clause that explicitly checks for RECV_TIMEOUT
157  TLOG(TLVL_WARNING) << "Timeout occurred in call to transfer_->receiveFragmentFrom; will try again"
158  << ", status = " << result;
159  }
160  }
161  catch (...)
162  {
163  ExceptionHandler(ExceptionHandlerRethrow::yes,
164  "Problem receiving data in TransferWrapper::receiveMessage");
165  }
166  }
167 
// End-of-data fragment: in multi-run mode unregister and go back to waiting for
// a new Init fragment; otherwise return (fragmentPtrs is still empty here)
168  if (fragmentPtr->type() == artdaq::Fragment::EndOfDataFragmentType)
169  {
170  //if (monitorRegistered_)
171  //{
172  // unregisterMonitor();
173  //}
174  if (multi_run_mode_)
175  {
176  unregisterMonitor();
177  initialized = false;
178  continue;
179  }
180 
181  return fragmentPtrs;
182  }
183 
184  checkIntegrity(*fragmentPtr);
185 
// Fragments are only handed on once an Init fragment has been seen; the Init
// fragment itself is handed on too. Anything received earlier is discarded.
186  if (initialized || fragmentPtr->type() == artdaq::Fragment::InitFragmentType)
187  {
188  initialized = true;
189  fragmentPtrs.push_back(std::move(fragmentPtr));
190  break;
191  }
192 
// Give up when too many fragments arrive without an Init fragment among them
193  if (fragments_received > maxEventsBeforeInit_)
194  {
195  throw cet::exception("TransferWrapper") << "First " << maxEventsBeforeInit_ << " events received did not include the \"Init\" event containing necessary info for art; exiting..."; // NOLINT(cert-err60-cpp)
196  }
197  }
198 
199  return fragmentPtrs;
200 }
201 
202 std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> artdaq::TransferWrapper::receiveMessages()
203 {
204  std::unordered_map<artdaq::Fragment::type_t, std::unique_ptr<artdaq::Fragments>> output;
205 
206  auto ptrs = receiveMessage();
207  for (auto& ptr : ptrs)
208  {
209  auto fragType = ptr->type();
210  auto fragPtr = ptr.release();
211  ptr.reset(nullptr);
212 
213  if (output.count(fragType) == 0u)
214  {
215  output[fragType] = std::make_unique<artdaq::Fragments>();
216  }
217 
218  output[fragType]->emplace_back(std::move(*fragPtr));
219  }
220 
221  return output;
222 }
223 
224 void artdaq::TransferWrapper::checkIntegrity(const artdaq::Fragment& fragment) const
225 {
226  const size_t artdaqheader = artdaq::detail::RawFragmentHeader::num_words() *
227  sizeof(artdaq::detail::RawFragmentHeader::RawDataType);
228  const auto payload = static_cast<size_t>(fragment.dataEndBytes() - fragment.dataBeginBytes());
229  const size_t metadata = sizeof(artdaq::NetMonHeader);
230  const size_t totalsize = fragment.sizeBytes();
231 
232  const auto type = static_cast<size_t>(fragment.type());
233 
234  if (totalsize != artdaqheader + metadata + payload)
235  {
236  std::stringstream errmsg;
237  errmsg << "Error: artdaq fragment of type " << fragment.typeString() << ", sequence ID " << fragment.sequenceID() << " has internally inconsistent measures of its size, signalling data corruption: in bytes,"
238  << " total size = " << totalsize << ", artdaq fragment header = " << artdaqheader << ", metadata = " << metadata << ", payload = " << payload;
239 
240  TLOG(TLVL_ERROR) << errmsg.str();
241 
242  if (quitOnFragmentIntegrityProblem_)
243  {
244  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
245  }
246 
247  return;
248  }
249 
250  auto findloc = std::find(allowedFragmentTypes_.begin(), allowedFragmentTypes_.end(), static_cast<int>(type));
251 
252  if (findloc == allowedFragmentTypes_.end())
253  {
254  std::stringstream errmsg;
255  errmsg << "Error: artdaq fragment appears to have type "
256  << type << ", not found in the allowed fragment types list";
257 
258  TLOG(TLVL_ERROR) << errmsg.str();
259  if (quitOnFragmentIntegrityProblem_)
260  {
261  throw cet::exception("TransferWrapper") << errmsg.str(); // NOLINT(cert-err60-cpp)
262  }
263 
264  return;
265  }
266 }
267 
268 void artdaq::TransferWrapper::registerMonitor()
269 {
270  try
271  {
272  transfer_.reset(nullptr);
273  transfer_ = MakeTransferPlugin(pset_, "transfer_plugin", TransferInterface::Role::kReceive);
274  }
275  catch (...)
276  {
277  ExceptionHandler(ExceptionHandlerRethrow::yes,
278  "TransferWrapper: failure in call to MakeTransferPlugin");
279  }
280 
281  auto start = std::chrono::steady_clock::now();
282  auto sts = getDispatcherStatus();
283  while (sts != "Running" && (runningStateTimeout_ == 0 || TimeUtils::GetElapsedTime(start) < runningStateTimeout_))
284  {
285  TLOG(TLVL_DEBUG) << "Dispatcher state: " << sts;
286  if (gSignalStatus != 0)
287  {
288  TLOG(TLVL_INFO) << "Ctrl-C appears to have been hit";
289  return;
290  }
291  TLOG(TLVL_INFO) << "Waited " << std::fixed << std::setprecision(2) << TimeUtils::GetElapsedTime(start) << " s / " << runningStateTimeout_ << " s for Dispatcher to enter the Running state (state=" << sts << ")";
292  usleep(runningStateInterval_us_);
293  sts = getDispatcherStatus();
294  }
295  if (sts != "Running")
296  {
297  return;
298  }
299 
300  auto dispatcherConfig = pset_.get<fhicl::ParameterSet>("dispatcher_config");
301 
302  int retry = 3;
303 
304  while (retry > 0)
305  {
306  TLOG(TLVL_INFO) << "Attempting to register this monitor (\"" << transfer_->uniqueLabel()
307  << "\") with the dispatcher aggregator";
308 
309  auto status = commander_->send_register_monitor(dispatcherConfig.to_string());
310 
311  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
312 
313  if (status == "Success")
314  {
315  monitorRegistered_ = true;
316  break;
317  }
318 
319  TLOG(TLVL_WARNING) << "Error in TransferWrapper: attempt to register with dispatcher did not result in the \"Success\" response";
320  usleep(100000);
321 
322  retry--;
323  }
324 }
325 
326 void artdaq::TransferWrapper::unregisterMonitor()
327 {
328  if (!monitorRegistered_)
329  {
330  TLOG(TLVL_WARNING) << "The function to unregister the monitor was called, but the monitor doesn't appear to be registered";
331  return;
332  }
333 
334  std::string sts = getDispatcherStatus();
335 
336  if (sts.empty())
337  {
338  return;
339  }
340 
341  if (sts != "Running" && sts != "Ready")
342  {
343  TLOG(TLVL_WARNING) << "The Dispatcher is not in the Running or Ready state, will not attempt to unregister";
344  return;
345  }
346 
347  int retry = 3;
348  while (retry > 0)
349  {
350  TLOG(TLVL_INFO) << "Requesting that this monitor (" << transfer_->uniqueLabel()
351  << ") be unregistered from the dispatcher aggregator";
352 
353  auto status = commander_->send_unregister_monitor(transfer_->uniqueLabel());
354 
355  TLOG(TLVL_INFO) << "Response from dispatcher is \"" << status << "\"";
356 
357  if (status == "Success")
358  {
359  break;
360  }
361  if (status == "busy")
362  {}
363  else
364  {
365  TLOG(TLVL_WARNING) << "The Dispatcher returned status " << status << " when attempting to unregister this monitor!";
366  //throw cet::exception("TransferWrapper") << "Error in TransferWrapper: attempt to unregister with dispatcher did not result in the \"Success\" response";
367  }
368  retry--;
369  usleep(500000);
370  }
371  monitorRegistered_ = false;
372 }
373 
374 std::string artdaq::TransferWrapper::getDispatcherStatus()
375 {
376  try
377  {
378  return commander_->send_status();
379  }
380  catch (std::exception const& ex)
381  {
382  TLOG(TLVL_WARNING) << "An exception was thrown trying to collect the Dispatcher's status. Most likely cause is the application is no longer running.";
383  return "";
384  }
385 }
386 
// Body of TransferWrapper's destructor (per the error text below).
// NOTE(review): the signature line (inner line 387) and inner line 401 are missing
// from this listing; confirm against the original file.
// Unregisters the monitor if it is still registered. Any exception raised while
// doing so is handed to ExceptionHandler with rethrow disabled.
388 {
389  if (monitorRegistered_)
390  {
391  try
392  {
393  unregisterMonitor();
394  }
395  catch (...)
396  {
397  ExceptionHandler(ExceptionHandlerRethrow::no,
398  "An exception occurred when trying to unregister monitor during TransferWrapper's destruction");
399  }
400  }
402 }
Commandable is the base class for all artdaq components which implement the artdaq state machine...
Definition: Commandable.hh:20
TransferWrapper(const fhicl::ParameterSet &pset)
TransferWrapper Constructor.
static void CleanUpGlobals()
Clean up statically-allocated Manager class instances.
Definition: Globals.hh:150
virtual ~TransferWrapper()
TransferWrapper Destructor.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, const std::string &plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
artdaq::FragmentPtrs receiveMessage()
Receive a Fragment from the TransferInterface, and send it to art.
This TransferInterface is a Receiver.
Header with length information for NetMonTransport messages.
Definition: NetMonHeader.hh:13
volatile std::sig_atomic_t gSignalStatus
Stores signal from signal handler.
std::unique_ptr< artdaq::CommanderInterface > MakeCommanderPlugin(const fhicl::ParameterSet &commander_pset, artdaq::Commandable &commandable)
Load a CommanderInterface plugin.
Value that is to be returned when a Transfer plugin determines that no more data will be arriving...
For code clarity, things checking for successful receive should check retval >= NO_RANK_INFO.
std::unordered_map< artdaq::Fragment::type_t, std::unique_ptr< artdaq::Fragments > > receiveMessages()
Receive all messages for an event from ArtdaqSharedMemoryService.