artdaq  v3_04_01
RequestReceiver.cc
1 #define TRACE_NAME (app_name + "_RequestReceiver").c_str()
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include "artdaq/DAQrate/RequestReceiver.hh"
5 #include "artdaq/DAQdata/Globals.hh"
6 #include "artdaq/DAQrate/detail/RequestMessage.hh"
7 
8 #include <boost/exception/all.hpp>
9 #include <boost/throw_exception.hpp>
10 
11 #include <limits>
12 #include <iterator>
13 
14 #include "canvas/Utilities/Exception.h"
15 #include "cetlib_except/exception.h"
16 #include "fhiclcpp/ParameterSet.h"
17 
18 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh"
19 #include "artdaq-core/Data/Fragment.hh"
20 #include "artdaq-core/Data/ContainerFragmentLoader.hh"
21 #include "artdaq-core/Utilities/ExceptionHandler.hh"
22 #include "artdaq-core/Utilities/TimeUtils.hh"
23 
24 #include <fstream>
25 #include <iomanip>
26 #include <iterator>
27 #include <iostream>
28 #include <iomanip>
29 #include <algorithm>
30 #include <sys/poll.h>
31 #include <arpa/inet.h>
32 #include <netinet/in.h>
34 
36  : request_port_(3001)
37  , request_addr_("227.128.12.26")
38  , running_(false)
39  , run_number_(0)
40  , requests_()
41  , request_timing_()
42  , request_stop_requested_(false)
43  , request_received_(false)
44  , end_of_run_timeout_ms_(1000)
45  , should_stop_(false)
46  , highest_seen_request_(0)
47  , out_of_order_requests_()
48  , request_increment_(1)
49 {}
50 
/**
 * \brief RequestReceiver Constructor, configured from a fhicl::ParameterSet.
 * \param ps ParameterSet supplying the configuration
 *
 * Keys read from ps (type, default used when the key is absent):
 *   "request_port"                (int, 3001)
 *   "request_address"             (std::string, "227.128.12.26")
 *   "multicast_interface_ip"      (std::string, "0.0.0.0")
 *   "end_of_run_quiet_timeout_ms" (size_t, 1000)
 *   "request_increment"           (artdaq::Fragment::sequence_id_t, 1)
 *
 * All other members start out cleared (flags false, counters 0, containers empty).
 */
51 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps)
52  : request_port_(ps.get<int>("request_port", 3001))
53  , request_addr_(ps.get<std::string>("request_address", "227.128.12.26"))
54  , multicast_out_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0"))
55  , running_(false)
56  , run_number_(0)
57  , requests_()
58  , request_timing_()
59  , request_stop_requested_(false)
60  , request_received_(false)
61  , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000))
62  , should_stop_(false)
63  , highest_seen_request_(0)
64  , out_of_order_requests_()
65  , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1))
66 {
// NOTE(review): this listing jumps from line 66 to line 68, so one body line is
// not shown here. Presumably it opens the request socket -- confirm against the
// upstream source before relying on this body being empty.
68 }
69 
71 {
72  TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_;
73  request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
74  if (request_socket_ < 0)
75  {
76  TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno);
77  exit(1);
78  }
79 
80  struct sockaddr_in si_me_request;
81 
82  int yes = 1;
83  if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
84  {
85  TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno);
86  exit(1);
87  }
88  memset(&si_me_request, 0, sizeof(si_me_request));
89  si_me_request.sin_family = AF_INET;
90  si_me_request.sin_port = htons(request_port_);
91  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
92  if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
93  {
94  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno);
95  exit(1);
96  }
97 
98  if (request_addr_ != "localhost")
99  {
100  struct ip_mreq mreq;
101  int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr);
102  if (sts == -1)
103  {
104  TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno);
105  exit(1);
106  }
107  sts = GetInterfaceForNetwork(multicast_out_addr_.c_str(), mreq.imr_interface);
108  if (sts == -1)
109  {
110  TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_out_addr_;
111  exit(1);
112  }
113  if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
114  {
115  TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno);
116  exit(1);
117  }
118  }
119  TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank;
120 }
121 
122 artdaq::RequestReceiver::~RequestReceiver()
123 {
124  stopRequestReception(true);
125 }
126 
128 {
129  std::unique_lock<std::mutex> lk(state_mutex_);
130  if (!request_received_ && !force)
131  {
132  TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl
133  << "Check that UDP port " << request_port_ << " is open in the firewall config.";
134  }
135  should_stop_ = true;
136  if (running_) {
137  TLOG(TLVL_DEBUG) << "Joining requestThread";
138  if (requestThread_.joinable()) requestThread_.join();
139  bool once = true;
140  while (running_) {
141  if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen";
142  once = false;
143  usleep(10000);
144  }
145  }
146 
147  if (request_socket_ != -1)
148  {
149  close(request_socket_);
150  request_socket_ = -1;
151  }
152  request_received_ = false;
153  highest_seen_request_ = 0;
154 }
155 
157 {
158  std::unique_lock<std::mutex> lk(state_mutex_);
159  if (requestThread_.joinable()) requestThread_.join();
160  should_stop_ = false;
161  request_stop_requested_ = false;
162 
163  if (request_socket_ == -1)
164  {
165  TLOG(TLVL_INFO) << "Connecting Request Reception socket";
166  setupRequestListener();
167  }
168 
169  TLOG(TLVL_INFO) << "Starting Request Reception Thread";
170  try {
171  requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this);
172  }
173  catch (const boost::exception& e)
174  {
175  TLOG(TLVL_ERROR) << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
176  std::cerr << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
177  exit(5);
178  }
179  running_ = true;
180 }
181 
183 {
184  while (!should_stop_)
185  {
186  TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests";
187 
188  if (request_socket_ == -1)
189  {
190  setupRequestListener();
191  }
192 
193  int ms_to_wait = 10;
194  struct pollfd ufds[1];
195  ufds[0].fd = request_socket_;
196  ufds[0].events = POLLIN | POLLPRI | POLLERR;
197  int rv = poll(ufds, 1, ms_to_wait);
198 
199  // Continue loop if no message received or message does not have correct event ID
200  if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI))
201  {
202  if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP)))
203  {
204  close(request_socket_);
205  request_socket_ = -1;
206  }
207  if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_)
208  {
209  break;
210  }
211  continue;
212  }
213 
214  TLOG(11) << "Received packet on Request channel";
215  std::vector<uint8_t> buffer(MAX_REQUEST_MESSAGE_SIZE);
216  struct sockaddr_in from;
217  socklen_t len = sizeof(from);
218  auto sts = recvfrom(request_socket_, &buffer[0], MAX_REQUEST_MESSAGE_SIZE, 0, (struct sockaddr*)&from, &len);
219  if (sts < 0)
220  {
221  TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno);
222  close(request_socket_);
223  request_socket_ = -1;
224  continue;
225  }
226 
227  auto hdr_buffer = reinterpret_cast<artdaq::detail::RequestHeader*>(&buffer[0]);
228  TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer->header << std::dec << ", packet_count: " << hdr_buffer->packet_count << " from rank " << hdr_buffer->rank << ", " << inet_ntoa(from.sin_addr) << ":" << from.sin_port << ", run number: " << hdr_buffer->run_number;
229  if (!hdr_buffer->isValid()) continue;
230 
231  request_received_ = true;
232 
233  // 19-Dec-2018, KAB: added check on current run number
234  if (run_number_ != 0 && hdr_buffer->run_number != run_number_)
235  {
236  TLOG(TLVL_WARNING) << "Received a Request Message with the wrong run number ("
237  << hdr_buffer->run_number << "), expected " << run_number_
238  << ", ignoring this request.";
239  continue;
240  }
241 
242  if (hdr_buffer->mode == artdaq::detail::RequestMessageMode::EndOfRun)
243  {
244  TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests...";
245  request_stop_timeout_ = std::chrono::steady_clock::now();
246  request_stop_requested_ = true;
247  }
248 
249  std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer->packet_count);
250  memcpy(&pkt_buffer[0], &buffer[sizeof(artdaq::detail::RequestHeader)], sizeof(artdaq::detail::RequestPacket) * hdr_buffer->packet_count);
251  bool anyNew = false;
252 
253  if (should_stop_) break;
254 
255  for (auto& buffer : pkt_buffer)
256  {
257  TLOG(20) << "Request Packet: hdr=" << /*std::dec <<*/ buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp;
258  if (!buffer.isValid()) continue;
259  std::unique_lock<std::mutex> tlk(request_mutex_);
260  if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp)
261  {
262  TLOG(TLVL_ERROR) << "Received conflicting request for SeqID "
263  << buffer.sequence_id << "!"
264  << " Old ts=" << requests_[buffer.sequence_id]
265  << ", new ts=" << buffer.timestamp << ". Keeping OLD!";
266  }
267  else if (!requests_.count(buffer.sequence_id))
268  {
269  int delta = buffer.sequence_id - highest_seen_request_;
270  TLOG(11) << "Received request for sequence ID " << buffer.sequence_id
271  << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")";
272  if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id))
273  {
274  TLOG(11) << "Already serviced this request ( sequence ID " << buffer.sequence_id << ")! Ignoring...";
275  }
276  else
277  {
278  requests_[buffer.sequence_id] = buffer.timestamp;
279  request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now();
280  anyNew = true;
281  }
282  }
283  }
284  if (anyNew)
285  {
286  request_cv_.notify_all();
287  }
288  }
289  TLOG(TLVL_DEBUG) << "Ending Request Thread";
290  running_ = false;
291 }
292 
293 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
294 {
295  TLOG(10) << "RemoveRequest: Removing request for id " << reqID;
296  std::unique_lock<std::mutex> lk(request_mutex_);
297  requests_.erase(reqID);
298 
299  if (reqID > highest_seen_request_)
300  {
301  TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_;
302  if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_)
303  {
304  out_of_order_requests_.insert(reqID);
305 
306  auto it = out_of_order_requests_.begin();
307  while (it != out_of_order_requests_.end() && !should_stop_) // Stop accounting for requests after stop
308  {
309  if (*it == highest_seen_request_ + request_increment_)
310  {
311  highest_seen_request_ = *it;
312  it = out_of_order_requests_.erase(it);
313  }
314  else
315  {
316  break;
317  }
318  }
319  }
320  else // no out-of-order requests and this request is highest seen + request_increment_
321  {
322  highest_seen_request_ = reqID;
323  }
324  TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_;
325  }
326  if (metricMan && request_timing_.count(reqID))
327  {
328  metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average);
329  }
330  request_timing_.erase(reqID);
331 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
void startRequestReception()
Enables (starts) the reception of data requests.
End of Run mode (Used to end request processing on receiver)
RequestReceiver()
RequestReceiver Default Constructor.
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:217
Header of a RequestMessage. Contains magic bytes for validation and a count of expected RequestPacket...
void setupRequestListener()
Opens the socket used to listen for data requests.
void receiveRequestsLoop()
This function receives data request packets, adding new requests to the request list.
The RequestPacket contains information about a single data request.
void stopRequestReception(bool force=false)
Disables (stops) the reception of data requests.
void RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
Remove the request with the given sequence ID from the request map