artdaq  v3_06_00
RequestReceiver.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_RequestReceiver").c_str()
3 
4 #include "artdaq/DAQrate/RequestReceiver.hh"
5 #include "artdaq/DAQdata/Globals.hh"
6 #include "artdaq/DAQrate/detail/RequestMessage.hh"
7 
8 #include <boost/exception/all.hpp>
9 #include <boost/throw_exception.hpp>
10 
11 #include <limits>
12 #include <iterator>
13 
14 #include "canvas/Utilities/Exception.h"
15 #include "cetlib_except/exception.h"
16 #include "fhiclcpp/ParameterSet.h"
17 
18 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh"
19 #include "artdaq-core/Data/Fragment.hh"
20 #include "artdaq-core/Data/ContainerFragmentLoader.hh"
21 #include "artdaq-core/Utilities/ExceptionHandler.hh"
22 #include "artdaq-core/Utilities/TimeUtils.hh"
23 
24 #include <fstream>
25 #include <iomanip>
26 #include <iterator>
27 #include <iostream>
28 #include <iomanip>
29 #include <algorithm>
30 #include <sys/poll.h>
31 #include <arpa/inet.h>
32 #include <netinet/in.h>
34 
36  : request_port_(3001)
37  , request_addr_("227.128.12.26")
38  , running_(false)
39  , run_number_(0)
40  , request_socket_(-1)
41  , requests_()
42  , request_timing_()
43  , request_stop_requested_(false)
44  , request_received_(false)
45  , end_of_run_timeout_ms_(1000)
46  , should_stop_(false)
47  , highest_seen_request_(0)
48  , out_of_order_requests_()
49  , request_increment_(1)
50 {}
51 
52 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps)
53  : request_port_(ps.get<int>("request_port", 3001))
54  , request_addr_(ps.get<std::string>("request_address", "227.128.12.26"))
55  , multicast_in_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0"))
56  , running_(false)
57  , run_number_(0)
58  , request_socket_(-1)
59  , requests_()
60  , request_timing_()
61  , request_stop_requested_(false)
62  , request_received_(false)
63  , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000))
64  , should_stop_(false)
65  , highest_seen_request_(0)
66  , out_of_order_requests_()
67  , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1))
68 {
70 }
71 
73 {
74  TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_
75  << ", multicast interface=" << multicast_in_addr_;
76  request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
77  if (request_socket_ < 0)
78  {
79  TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno);
80  exit(1);
81  }
82 
83  struct sockaddr_in si_me_request;
84 
85  int yes = 1;
86  if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
87  {
88  TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno);
89  exit(1);
90  }
91  memset(&si_me_request, 0, sizeof(si_me_request));
92  si_me_request.sin_family = AF_INET;
93  si_me_request.sin_port = htons(request_port_);
94  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
95  if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
96  {
97  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno);
98  exit(1);
99  }
100 
101  if (request_addr_ != "localhost")
102  {
103  struct ip_mreq mreq;
104  int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr);
105  if (sts == -1)
106  {
107  TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno);
108  exit(1);
109  }
110  sts = GetInterfaceForNetwork(multicast_in_addr_.c_str(), mreq.imr_interface);
111  if (sts == -1)
112  {
113  TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_in_addr_;
114  exit(1);
115  }
116  if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
117  {
118  TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno);
119  exit(1);
120  }
121  }
122  TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank;
123 }
124 
125 artdaq::RequestReceiver::~RequestReceiver()
126 {
127  stopRequestReception(true);
128 }
129 
131 {
132  std::unique_lock<std::mutex> lk(state_mutex_);
133  if (!request_received_ && !force)
134  {
135  TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl
136  << "Check that UDP port " << request_port_ << " is open in the firewall config.";
137  }
138  should_stop_ = true;
139  if (running_) {
140  TLOG(TLVL_DEBUG) << "Joining requestThread";
141  if (requestThread_.joinable()) requestThread_.join();
142  bool once = true;
143  while (running_) {
144  if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen";
145  once = false;
146  usleep(10000);
147  }
148  }
149 
150  if (request_socket_ != -1)
151  {
152  close(request_socket_);
153  request_socket_ = -1;
154  }
155  request_received_ = false;
156  highest_seen_request_ = 0;
157 }
158 
160 {
161  std::unique_lock<std::mutex> lk(state_mutex_);
162  if (requestThread_.joinable()) requestThread_.join();
163  should_stop_ = false;
164  request_stop_requested_ = false;
165 
166  if (request_socket_ == -1)
167  {
168  TLOG(TLVL_INFO) << "Connecting Request Reception socket";
169  setupRequestListener();
170  }
171 
172  TLOG(TLVL_INFO) << "Starting Request Reception Thread";
173  try {
174  requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this);
175  }
176  catch (const boost::exception& e)
177  {
178  TLOG(TLVL_ERROR) << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
179  std::cerr << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
180  exit(5);
181  }
182  running_ = true;
183 }
184 
186 {
187  while (!should_stop_)
188  {
189  TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests";
190 
191  if (request_socket_ == -1)
192  {
193  setupRequestListener();
194  }
195 
196  int ms_to_wait = 10;
197  struct pollfd ufds[1];
198  ufds[0].fd = request_socket_;
199  ufds[0].events = POLLIN | POLLPRI | POLLERR;
200  int rv = poll(ufds, 1, ms_to_wait);
201 
202  // Continue loop if no message received or message does not have correct event ID
203  if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI))
204  {
205  if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP)))
206  {
207  close(request_socket_);
208  request_socket_ = -1;
209  }
210  if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_)
211  {
212  break;
213  }
214  continue;
215  }
216 
217  TLOG(11) << "Received packet on Request channel";
218  std::vector<uint8_t> buffer(MAX_REQUEST_MESSAGE_SIZE);
219  struct sockaddr_in from;
220  socklen_t len = sizeof(from);
221  auto sts = recvfrom(request_socket_, &buffer[0], MAX_REQUEST_MESSAGE_SIZE, 0, (struct sockaddr*)&from, &len);
222  if (sts < 0)
223  {
224  TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno);
225  close(request_socket_);
226  request_socket_ = -1;
227  continue;
228  }
229 
230  auto hdr_buffer = reinterpret_cast<artdaq::detail::RequestHeader*>(&buffer[0]);
231  TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer->header << std::dec << ", packet_count: " << hdr_buffer->packet_count << " from rank " << hdr_buffer->rank << ", " << inet_ntoa(from.sin_addr) << ":" << from.sin_port << ", run number: " << hdr_buffer->run_number;
232  if (!hdr_buffer->isValid()) continue;
233 
234  request_received_ = true;
235 
236  // 19-Dec-2018, KAB: added check on current run number
237  if (run_number_ != 0 && hdr_buffer->run_number != run_number_)
238  {
239  TLOG(TLVL_WARNING) << "Received a Request Message with the wrong run number ("
240  << hdr_buffer->run_number << "), expected " << run_number_
241  << ", ignoring this request.";
242  continue;
243  }
244 
245  if (hdr_buffer->mode == artdaq::detail::RequestMessageMode::EndOfRun)
246  {
247  TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests...";
248  request_stop_timeout_ = std::chrono::steady_clock::now();
249  request_stop_requested_ = true;
250  }
251 
252  std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer->packet_count);
253  memcpy(&pkt_buffer[0], &buffer[sizeof(artdaq::detail::RequestHeader)], sizeof(artdaq::detail::RequestPacket) * hdr_buffer->packet_count);
254  bool anyNew = false;
255 
256  if (should_stop_) break;
257 
258  for (auto& buffer : pkt_buffer)
259  {
260  TLOG(20) << "Request Packet: hdr=" << /*std::dec <<*/ buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp;
261  if (!buffer.isValid()) continue;
262  std::unique_lock<std::mutex> tlk(request_mutex_);
263  if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp)
264  {
265  TLOG(TLVL_ERROR) << "Received conflicting request for SeqID "
266  << buffer.sequence_id << "!"
267  << " Old ts=" << requests_[buffer.sequence_id]
268  << ", new ts=" << buffer.timestamp << ". Keeping OLD!";
269  }
270  else if (!requests_.count(buffer.sequence_id))
271  {
272  int delta = buffer.sequence_id - highest_seen_request_;
273  TLOG(11) << "Received request for sequence ID " << buffer.sequence_id
274  << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")";
275  if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id))
276  {
277  TLOG(11) << "Already serviced this request ( sequence ID " << buffer.sequence_id << ")! Ignoring...";
278  }
279  else
280  {
281  requests_[buffer.sequence_id] = buffer.timestamp;
282  request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now();
283  anyNew = true;
284  }
285  }
286  }
287  if (anyNew)
288  {
289  request_cv_.notify_all();
290  }
291  }
292  TLOG(TLVL_DEBUG) << "Ending Request Thread";
293  running_ = false;
294 }
295 
296 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
297 {
298  TLOG(10) << "RemoveRequest: Removing request for id " << reqID;
299  std::unique_lock<std::mutex> lk(request_mutex_);
300  requests_.erase(reqID);
301 
302  if (reqID > highest_seen_request_)
303  {
304  TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_;
305  if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_)
306  {
307  out_of_order_requests_.insert(reqID);
308 
309  auto it = out_of_order_requests_.begin();
310  while (it != out_of_order_requests_.end() && !should_stop_) // Stop accounting for requests after stop
311  {
312  if (*it == highest_seen_request_ + request_increment_)
313  {
314  highest_seen_request_ = *it;
315  it = out_of_order_requests_.erase(it);
316  }
317  else
318  {
319  break;
320  }
321  }
322  }
323  else // no out-of-order requests and this request is highest seen + request_increment_
324  {
325  highest_seen_request_ = reqID;
326  }
327  TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_;
328  }
329  if (metricMan && request_timing_.count(reqID))
330  {
331  metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average);
332  }
333  request_timing_.erase(reqID);
334 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
void startRequestReception()
Enables (starts) the reception of data requests.
End of Run mode (Used to end request processing on receiver)
RequestReceiver()
RequestReceiver Default Constructor.
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:217
Header of a RequestMessage. Contains magic bytes for validation and a count of expected RequestPacket...
void setupRequestListener()
Opens the socket used to listen for data requests.
void receiveRequestsLoop()
This function receives data request packets, adding new requests to the request list.
The RequestPacket contains information about a single data request.
void stopRequestReception(bool force=false)
Disables (stops) the reception of data requests.
void RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
Remove the request with the given sequence ID from the request map