artdaq  v3_03_00
RequestReceiver.cc
1 #define TRACE_NAME (app_name + "_RequestReceiver").c_str()
2 #include "artdaq/DAQdata/Globals.hh"
3 
4 #include "artdaq/DAQrate/RequestReceiver.hh"
5 #include "artdaq/DAQdata/Globals.hh"
6 #include "artdaq/DAQrate/detail/RequestMessage.hh"
7 
8 #include <boost/exception/all.hpp>
9 #include <boost/throw_exception.hpp>
10 
11 #include <limits>
12 #include <iterator>
13 
14 #include "canvas/Utilities/Exception.h"
15 #include "cetlib_except/exception.h"
16 #include "fhiclcpp/ParameterSet.h"
17 
18 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh"
19 #include "artdaq-core/Data/Fragment.hh"
20 #include "artdaq-core/Data/ContainerFragmentLoader.hh"
21 #include "artdaq-core/Utilities/ExceptionHandler.hh"
22 #include "artdaq-core/Utilities/TimeUtils.hh"
23 
24 #include <fstream>
25 #include <iomanip>
26 #include <iterator>
27 #include <iostream>
28 #include <iomanip>
29 #include <algorithm>
30 #include <sys/poll.h>
31 #include <arpa/inet.h>
32 #include <netinet/in.h>
34 
36  : request_port_(3001)
37  , request_addr_("227.128.12.26")
38  , running_(false)
39  , requests_()
40  , request_timing_()
41  , request_stop_requested_(false)
42  , request_received_(false)
43  , end_of_run_timeout_ms_(1000)
44  , should_stop_(false)
45  , highest_seen_request_(0)
46  , out_of_order_requests_()
47  , request_increment_(1)
48 {}
49 
50 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps)
51  : request_port_(ps.get<int>("request_port", 3001))
52  , request_addr_(ps.get<std::string>("request_address", "227.128.12.26"))
53  , multicast_out_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0"))
54  , running_(false)
55  , requests_()
56  , request_timing_()
57  , request_stop_requested_(false)
58  , request_received_(false)
59  , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000))
60  , should_stop_(false)
61  , highest_seen_request_(0)
62  , out_of_order_requests_()
63  , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1))
64 {
66 }
67 
69 {
70  TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_;
71  request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
72  if (request_socket_ < 0)
73  {
74  TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno);
75  exit(1);
76  }
77 
78  struct sockaddr_in si_me_request;
79 
80  int yes = 1;
81  if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
82  {
83  TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno);
84  exit(1);
85  }
86  memset(&si_me_request, 0, sizeof(si_me_request));
87  si_me_request.sin_family = AF_INET;
88  si_me_request.sin_port = htons(request_port_);
89  si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
90  if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
91  {
92  TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno);
93  exit(1);
94  }
95 
96  if (request_addr_ != "localhost")
97  {
98  struct ip_mreq mreq;
99  int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr);
100  if (sts == -1)
101  {
102  TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno);
103  exit(1);
104  }
105  sts = GetInterfaceForNetwork(multicast_out_addr_.c_str(), mreq.imr_interface);
106  if (sts == -1)
107  {
108  TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_out_addr_;
109  exit(1);
110  }
111  if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
112  {
113  TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno);
114  exit(1);
115  }
116  }
117  TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank;
118 }
119 
120 artdaq::RequestReceiver::~RequestReceiver()
121 {
122  stopRequestReceiverThread(true);
123 }
124 
126 {
127  std::unique_lock<std::mutex> lk(state_mutex_);
128  if (!request_received_ && !force)
129  {
130  TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl
131  << "Check that UDP port " << request_port_ << " is open in the firewall config.";
132  }
133  should_stop_ = true;
134  if (running_) {
135  TLOG(TLVL_DEBUG) << "Joining requestThread";
136  if (requestThread_.joinable()) requestThread_.join();
137  bool once = true;
138  while (running_) {
139  if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen";
140  once = false;
141  usleep(10000);
142  }
143  }
144 
145  if (request_socket_ != -1)
146  {
147  close(request_socket_);
148  request_socket_ = -1;
149  }
150  request_received_ = false;
151  highest_seen_request_ = 0;
152 }
153 
155 {
156  std::unique_lock<std::mutex> lk(state_mutex_);
157  if (requestThread_.joinable()) requestThread_.join();
158  should_stop_ = false;
159  request_stop_requested_ = false;
160 
161  if (request_socket_ == -1)
162  {
163  TLOG(TLVL_INFO) << "Connecting Request Reception socket";
164  setupRequestListener();
165  }
166 
167  TLOG(TLVL_INFO) << "Starting Request Reception Thread";
168  try {
169  requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this);
170  }
171  catch (const boost::exception& e)
172  {
173  TLOG(TLVL_ERROR) << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
174  std::cerr << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
175  exit(5);
176  }
177  running_ = true;
178 }
179 
181 {
182  while (!should_stop_)
183  {
184  TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests";
185 
186  if (request_socket_ == -1)
187  {
188  setupRequestListener();
189  }
190 
191  int ms_to_wait = 10;
192  struct pollfd ufds[1];
193  ufds[0].fd = request_socket_;
194  ufds[0].events = POLLIN | POLLPRI | POLLERR;
195  int rv = poll(ufds, 1, ms_to_wait);
196 
197  // Continue loop if no message received or message does not have correct event ID
198  if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI))
199  {
200  if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP)))
201  {
202  close(request_socket_);
203  request_socket_ = -1;
204  }
205  if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_)
206  {
207  break;
208  }
209  continue;
210  }
211 
212  TLOG(11) << "Received packet on Request channel";
213  std::vector<uint8_t> buffer(MAX_REQUEST_MESSAGE_SIZE);
214  struct sockaddr_in from;
215  socklen_t len = sizeof(from);
216  auto sts = recvfrom(request_socket_, &buffer[0], MAX_REQUEST_MESSAGE_SIZE, 0, (struct sockaddr*)&from, &len);
217  if (sts < 0)
218  {
219  TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno);
220  close(request_socket_);
221  request_socket_ = -1;
222  continue;
223  }
224 
225  auto hdr_buffer = reinterpret_cast<artdaq::detail::RequestHeader*>(&buffer[0]);
226  TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer->header /*<< std::dec*/ << ", packet_count: " << hdr_buffer->packet_count << " from rank " << hdr_buffer->rank << ", " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
227  if (!hdr_buffer->isValid()) continue;
228 
229  request_received_ = true;
230  if (hdr_buffer->mode == artdaq::detail::RequestMessageMode::EndOfRun)
231  {
232  TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests...";
233  request_stop_timeout_ = std::chrono::steady_clock::now();
234  request_stop_requested_ = true;
235  }
236 
237  std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer->packet_count);
238  memcpy(&pkt_buffer[0], &buffer[sizeof(artdaq::detail::RequestHeader)], sizeof(artdaq::detail::RequestPacket) * hdr_buffer->packet_count);
239  bool anyNew = false;
240 
241  if (should_stop_) break;
242 
243  for (auto& buffer : pkt_buffer)
244  {
245  TLOG(20) << "Request Packet: hdr=" << /*std::dec <<*/ buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp;
246  if (!buffer.isValid()) continue;
247  std::unique_lock<std::mutex> tlk(request_mutex_);
248  if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp)
249  {
250  TLOG(TLVL_ERROR) << "Received conflicting request for SeqID "
251  << buffer.sequence_id << "!"
252  << " Old ts=" << requests_[buffer.sequence_id]
253  << ", new ts=" << buffer.timestamp << ". Keeping OLD!";
254  }
255  else if (!requests_.count(buffer.sequence_id))
256  {
257  int delta = buffer.sequence_id - highest_seen_request_;
258  TLOG(11) << "Received request for sequence ID " << buffer.sequence_id
259  << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")";
260  if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id))
261  {
262  TLOG(11) << "Already serviced this request ( sequence ID " << buffer.sequence_id << ")! Ignoring...";
263  }
264  else
265  {
266  requests_[buffer.sequence_id] = buffer.timestamp;
267  request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now();
268  anyNew = true;
269  }
270  }
271  }
272  if (anyNew)
273  {
274  request_cv_.notify_all();
275  }
276  }
277  TLOG(TLVL_DEBUG) << "Ending Request Thread";
278  running_ = false;
279 }
280 
281 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
282 {
283  TLOG(10) << "RemoveRequest: Removing request for id " << reqID;
284  std::unique_lock<std::mutex> lk(request_mutex_);
285  requests_.erase(reqID);
286 
287  if (reqID > highest_seen_request_)
288  {
289  TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_;
290  if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_)
291  {
292  out_of_order_requests_.insert(reqID);
293 
294  auto it = out_of_order_requests_.begin();
295  while (it != out_of_order_requests_.end() && !should_stop_) // Stop accounting for requests after stop
296  {
297  if (*it == highest_seen_request_ + request_increment_)
298  {
299  highest_seen_request_ = *it;
300  it = out_of_order_requests_.erase(it);
301  }
302  else
303  {
304  break;
305  }
306  }
307  }
308  else // no out-of-order requests and this request is highest seen + request_increment_
309  {
310  highest_seen_request_ = reqID;
311  }
312  TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_;
313  }
314  if (metricMan && request_timing_.count(reqID))
315  {
316  metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average);
317  }
318  request_timing_.erase(reqID);
319 }
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Definition: TCPConnect.cc:33
End of Run mode (Used to end request processing on receiver)
void startRequestReceiverThread()
Function that launches the data request receiver thread (receiveRequestsLoop())
RequestReceiver()
RequestReceiver Default Constructor.
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Definition: TCPConnect.cc:217
Header of a RequestMessage. Contains magic bytes for validation and a count of expected RequestPacket...
void stopRequestReceiverThread(bool force=false)
Stop the data request receiver thread (receiveRequestsLoop)
void setupRequestListener()
Opens the socket used to listen for data requests.
void receiveRequestsLoop()
This function receives data request packets, adding new requests to the request list.
The RequestPacket contains information about a single data request.
void RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
Remove the request with the given sequence ID from the request map