artdaq  v3_05_00
RequestReceiver.cc
1 #define TRACE_NAME (app_name + "_RequestReceiver").c_str()
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include "artdaq/DAQrate/RequestReceiver.hh"
5 #include "artdaq/DAQdata/Globals.hh"
6 #include "artdaq/DAQrate/detail/RequestMessage.hh"
7 
8 #include <boost/exception/all.hpp>
9 #include <boost/throw_exception.hpp>
10 
11 #include <limits>
12 #include <iterator>
13 
14 #include "canvas/Utilities/Exception.h"
15 #include "cetlib_except/exception.h"
16 #include "fhiclcpp/ParameterSet.h"
17 
18 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh"
19 #include "artdaq-core/Data/Fragment.hh"
20 #include "artdaq-core/Data/ContainerFragmentLoader.hh"
21 #include "artdaq-core/Utilities/ExceptionHandler.hh"
22 #include "artdaq-core/Utilities/TimeUtils.hh"
23 
24 #include <fstream>
25 #include <iomanip>
26 #include <iterator>
27 #include <iostream>
28 #include <iomanip>
29 #include <algorithm>
30 #include <sys/poll.h>
31 #include <arpa/inet.h>
32 #include <netinet/in.h>
34 
36  : request_port_(3001)
37  , request_addr_("227.128.12.26")
38  , running_(false)
39  , run_number_(0)
40  , request_socket_(-1)
41  , requests_()
42  , request_timing_()
43  , request_stop_requested_(false)
44  , request_received_(false)
45  , end_of_run_timeout_ms_(1000)
46  , should_stop_(false)
47  , highest_seen_request_(0)
48  , out_of_order_requests_()
49  , request_increment_(1)
50 {}
51 
52 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps)
53  : request_port_(ps.get<int>("request_port", 3001))
54  , request_addr_(ps.get<std::string>("request_address", "227.128.12.26"))
55  , multicast_out_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0"))
56  , running_(false)
57  , run_number_(0)
58  , request_socket_(-1)
59  , requests_()
60  , request_timing_()
61  , request_stop_requested_(false)
62  , request_received_(false)
63  , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000))
64  , should_stop_(false)
65  , highest_seen_request_(0)
66  , out_of_order_requests_()
67  , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1))
68 {
70 }
71 
73 {
74  TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_;
75  request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
76  if (request_socket_ < 0)
77  {
78  TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno);
79  exit(1);
80  }
81 
82  struct sockaddr_in si_me_request;
83 
84  int yes = 1;
85  if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
86  {
87  TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno);
88  exit(1);
89  }
90  memset(&si_me_request, 0, sizeof(si_me_request));
91  si_me_request.sin_family = AF_INET;
92  si_me_request.sin_port = htons(request_port_);
93  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
94  if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
95  {
96  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno);
97  exit(1);
98  }
99 
100  if (request_addr_ != "localhost")
101  {
102  struct ip_mreq mreq;
103  int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr);
104  if (sts == -1)
105  {
106  TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno);
107  exit(1);
108  }
109  sts = GetInterfaceForNetwork(multicast_out_addr_.c_str(), mreq.imr_interface);
110  if (sts == -1)
111  {
112  TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_out_addr_;
113  exit(1);
114  }
115  if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
116  {
117  TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno);
118  exit(1);
119  }
120  }
121  TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank;
122 }
123 
124 artdaq::RequestReceiver::~RequestReceiver()
125 {
126  stopRequestReception(true);
127 }
128 
130 {
131  std::unique_lock<std::mutex> lk(state_mutex_);
132  if (!request_received_ && !force)
133  {
134  TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl
135  << "Check that UDP port " << request_port_ << " is open in the firewall config.";
136  }
137  should_stop_ = true;
138  if (running_) {
139  TLOG(TLVL_DEBUG) << "Joining requestThread";
140  if (requestThread_.joinable()) requestThread_.join();
141  bool once = true;
142  while (running_) {
143  if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen";
144  once = false;
145  usleep(10000);
146  }
147  }
148 
149  if (request_socket_ != -1)
150  {
151  close(request_socket_);
152  request_socket_ = -1;
153  }
154  request_received_ = false;
155  highest_seen_request_ = 0;
156 }
157 
159 {
160  std::unique_lock<std::mutex> lk(state_mutex_);
161  if (requestThread_.joinable()) requestThread_.join();
162  should_stop_ = false;
163  request_stop_requested_ = false;
164 
165  if (request_socket_ == -1)
166  {
167  TLOG(TLVL_INFO) << "Connecting Request Reception socket";
168  setupRequestListener();
169  }
170 
171  TLOG(TLVL_INFO) << "Starting Request Reception Thread";
172  try {
173  requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this);
174  }
175  catch (const boost::exception& e)
176  {
177  TLOG(TLVL_ERROR) << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
178  std::cerr << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
179  exit(5);
180  }
181  running_ = true;
182 }
183 
185 {
186  while (!should_stop_)
187  {
188  TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests";
189 
190  if (request_socket_ == -1)
191  {
192  setupRequestListener();
193  }
194 
195  int ms_to_wait = 10;
196  struct pollfd ufds[1];
197  ufds[0].fd = request_socket_;
198  ufds[0].events = POLLIN | POLLPRI | POLLERR;
199  int rv = poll(ufds, 1, ms_to_wait);
200 
201  // Continue loop if no message received or message does not have correct event ID
202  if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI))
203  {
204  if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP)))
205  {
206  close(request_socket_);
207  request_socket_ = -1;
208  }
209  if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_)
210  {
211  break;
212  }
213  continue;
214  }
215 
216  TLOG(11) << "Received packet on Request channel";
217  std::vector<uint8_t> buffer(MAX_REQUEST_MESSAGE_SIZE);
218  struct sockaddr_in from;
219  socklen_t len = sizeof(from);
220  auto sts = recvfrom(request_socket_, &buffer[0], MAX_REQUEST_MESSAGE_SIZE, 0, (struct sockaddr*)&from, &len);
221  if (sts < 0)
222  {
223  TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno);
224  close(request_socket_);
225  request_socket_ = -1;
226  continue;
227  }
228 
229  auto hdr_buffer = reinterpret_cast<artdaq::detail::RequestHeader*>(&buffer[0]);
230  TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer->header << std::dec << ", packet_count: " << hdr_buffer->packet_count << " from rank " << hdr_buffer->rank << ", " << inet_ntoa(from.sin_addr) << ":" << from.sin_port << ", run number: " << hdr_buffer->run_number;
231  if (!hdr_buffer->isValid()) continue;
232 
233  request_received_ = true;
234 
235  // 19-Dec-2018, KAB: added check on current run number
236  if (run_number_ != 0 && hdr_buffer->run_number != run_number_)
237  {
238  TLOG(TLVL_WARNING) << "Received a Request Message with the wrong run number ("
239  << hdr_buffer->run_number << "), expected " << run_number_
240  << ", ignoring this request.";
241  continue;
242  }
243 
244  if (hdr_buffer->mode == artdaq::detail::RequestMessageMode::EndOfRun)
245  {
246  TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests...";
247  request_stop_timeout_ = std::chrono::steady_clock::now();
248  request_stop_requested_ = true;
249  }
250 
251  std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer->packet_count);
252  memcpy(&pkt_buffer[0], &buffer[sizeof(artdaq::detail::RequestHeader)], sizeof(artdaq::detail::RequestPacket) * hdr_buffer->packet_count);
253  bool anyNew = false;
254 
255  if (should_stop_) break;
256 
257  for (auto& buffer : pkt_buffer)
258  {
259  TLOG(20) << "Request Packet: hdr=" << /*std::dec <<*/ buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp;
260  if (!buffer.isValid()) continue;
261  std::unique_lock<std::mutex> tlk(request_mutex_);
262  if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp)
263  {
264  TLOG(TLVL_ERROR) << "Received conflicting request for SeqID "
265  << buffer.sequence_id << "!"
266  << " Old ts=" << requests_[buffer.sequence_id]
267  << ", new ts=" << buffer.timestamp << ". Keeping OLD!";
268  }
269  else if (!requests_.count(buffer.sequence_id))
270  {
271  int delta = buffer.sequence_id - highest_seen_request_;
272  TLOG(11) << "Received request for sequence ID " << buffer.sequence_id
273  << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")";
274  if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id))
275  {
276  TLOG(11) << "Already serviced this request ( sequence ID " << buffer.sequence_id << ")! Ignoring...";
277  }
278  else
279  {
280  requests_[buffer.sequence_id] = buffer.timestamp;
281  request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now();
282  anyNew = true;
283  }
284  }
285  }
286  if (anyNew)
287  {
288  request_cv_.notify_all();
289  }
290  }
291  TLOG(TLVL_DEBUG) << "Ending Request Thread";
292  running_ = false;
293 }
294 
295 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
296 {
297  TLOG(10) << "RemoveRequest: Removing request for id " << reqID;
298  std::unique_lock<std::mutex> lk(request_mutex_);
299  requests_.erase(reqID);
300 
301  if (reqID > highest_seen_request_)
302  {
303  TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_;
304  if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_)
305  {
306  out_of_order_requests_.insert(reqID);
307 
308  auto it = out_of_order_requests_.begin();
309  while (it != out_of_order_requests_.end() && !should_stop_) // Stop accounting for requests after stop
310  {
311  if (*it == highest_seen_request_ + request_increment_)
312  {
313  highest_seen_request_ = *it;
314  it = out_of_order_requests_.erase(it);
315  }
316  else
317  {
318  break;
319  }
320  }
321  }
322  else // no out-of-order requests and this request is highest seen + request_increment_
323  {
324  highest_seen_request_ = reqID;
325  }
326  TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_;
327  }
328  if (metricMan && request_timing_.count(reqID))
329  {
330  metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average);
331  }
332  request_timing_.erase(reqID);
333 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
void startRequestReception()
Enables (starts) the reception of data requests.
End of Run mode (Used to end request processing on receiver)
RequestReceiver()
RequestReceiver Default Constructor.
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:217
Header of a RequestMessage. Contains magic bytes for validation and a count of expected RequestPacket...
void setupRequestListener()
Opens the socket used to listen for data requests.
void receiveRequestsLoop()
This function receives data request packets, adding new requests to the request list.
The RequestPacket contains information about a single data request.
void stopRequestReception(bool force=false)
Disables (stops) the reception of data requests.
void RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
Remove the request with the given sequence ID from the request map