artdaq  v3_09_01
TCPSocket_transfer.cc
1 // Sep 14, 2016. "TERMS AND CONDITIONS" governing this file are in the README
2 // or COPYING file. If you do not have such a file, one can be obtained by
3 // contacting Ron or Fermi Lab in Batavia IL, 60510, phone: 630-840-3000.
4 // $RCSfile: .emacs.gnu,v $
5 // rev="$Revision: 1.30 $$Date: 2016/03/01 14:27:27 $";
6 
7 // C Includes
8 #include <arpa/inet.h> // ntohl, ntohs
9 #include <poll.h> // struct pollfd
10 #include <sys/socket.h> // socket, socklen_t
11 #include <sys/types.h> // size_t
12 #include <sys/un.h> // sockaddr_un
13 #include <cstdlib> // atoi, strtoul
14 
15 // C++ Includes
16 #include <fstream>
17 #include <stdexcept>
18 #include <string>
19 
20 // product Includes
21 #include "artdaq/DAQdata/Globals.hh"
22 #define TRACE_NAME (app_name + "_TCPSocketTransfer").c_str()
23 
24 // artdaq Includes
25 #include <iomanip>
26 #include "artdaq-core/Data/Fragment.hh"
27 #include "artdaq-core/Utilities/TimeUtils.hh"
30 #include "artdaq/TransferPlugins/TCPSocketTransfer.hh"
31 #include "artdaq/TransferPlugins/detail/SRSockets.hh"
32 #include "artdaq/TransferPlugins/detail/Timeout.hh"
33 
34 #define USE_SENDMSG 1
35 
36 std::atomic<int> artdaq::TCPSocketTransfer::listen_thread_refcount_(0);
37 std::unique_ptr<boost::thread> artdaq::TCPSocketTransfer::listen_thread_ = nullptr;
38 std::map<int, std::set<int>> artdaq::TCPSocketTransfer::connected_fds_ = std::map<int, std::set<int>>();
39 std::mutex artdaq::TCPSocketTransfer::listen_thread_mutex_;
40 std::mutex artdaq::TCPSocketTransfer::fd_mutex_;
41 
43  TCPSocketTransfer(fhicl::ParameterSet const& pset, TransferInterface::Role role)
44  : TransferInterface(pset, role)
45  , send_fd_(-1)
46  , rcvbuf_(pset.get<size_t>("tcp_receive_buffer_size", 0))
47  , sndbuf_(pset.get<size_t>("tcp_send_buffer_size", max_fragment_size_words_ * sizeof(artdaq::RawDataType) * buffer_count_))
48  , send_retry_timeout_us_(pset.get<size_t>("send_retry_timeout_us", 1000000))
49  , timeoutMessageArmed_(true)
50  , receive_disconnected_wait_s_(pset.get<double>("receive_socket_disconnected_wait_s", 10.0))
51  , receive_err_wait_us_(pset.get<size_t>("receive_socket_disconnected_wait_us", 10000))
52  , receive_socket_has_been_connected_(false)
53  , send_ack_diff_(0)
54 {
55  TLOG(TLVL_DEBUG) << GetTraceName() << " Constructor: pset=" << pset.to_string() << ", role=" << (role == TransferInterface::Role::kReceive ? "kReceive" : "kSend");
56  connection_was_lost_ = false;
57 
59  {
60  // Wait for sender to connect...
61  TLOG(TLVL_DEBUG) << GetTraceName() << "Listening for connections";
62  start_listen_thread_();
63  TLOG(TLVL_DEBUG) << GetTraceName() << "Done Listening";
64  }
65  else
66  {
67  hostMap_ = MakeHostMap(pset);
68  TLOG(TLVL_DEBUG) << GetTraceName() << "Connecting to destination";
69  connect_();
70  TLOG(TLVL_DEBUG) << GetTraceName() << "Done Connecting";
71  }
72  TLOG(TLVL_DEBUG) << GetTraceName() << "End of Constructor";
73 }
74 
75 artdaq::TCPSocketTransfer::~TCPSocketTransfer() noexcept
76 {
77  TLOG(TLVL_DEBUG) << GetTraceName() << "Shutting down TCPSocketTransfer";
78 
79  if (role() == TransferInterface::Role::kSend)
80  {
81  // close all open connections (send stop_v0) first
82  MessHead mh = {0, MessHead::stop_v0, htons(TransferInterface::source_rank()), {0}};
83  if (send_fd_ != -1)
84  {
85  // should be blocking with modest timeo
86  timeval tv = {0, 100000};
87  socklen_t len = sizeof(tv);
88  setsockopt(send_fd_, SOL_SOCKET, SO_SNDTIMEO, &tv, len);
89  write(send_fd_, &mh, sizeof(mh));
90  }
91  close(send_fd_);
92  send_fd_ = -1;
93  }
94  else
95  {
97  try
98  {
99  if (ack_listen_thread_ && ack_listen_thread_->joinable())
100  {
101  ack_listen_thread_->join();
102  }
103  }
104  catch (...)
105  {
106  // IGNORED
107  }
108 
109  std::lock_guard<std::mutex> lk(listen_thread_mutex_);
110  listen_thread_refcount_--;
111  try
112  {
113  if (listen_thread_refcount_ <= 0 && listen_thread_ && listen_thread_->joinable())
114  {
115  listen_thread_->join();
116  }
117  }
118  catch (...)
119  {
120  // IGNORED
121  }
122  }
123 
124  TLOG(TLVL_DEBUG) << GetTraceName() << "End of Destructor";
125 }
126 
127 int artdaq::TCPSocketTransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
128 {
129  TLOG(5) << GetTraceName() << "receiveFragmentHeader: BEGIN";
130  int ret_rank = RECV_TIMEOUT;
131 
132  // Don't bomb out until received at least one connection...
133  if (getConnectedFDCount_(source_rank()) == 0)
134  { // what if just listen_fd???
135  // if (receive_socket_has_been_connected_ && TimeUtils::GetElapsedTime(last_recv_time_) > receive_disconnected_wait_s_)
136  // {
137  // TLOG(TLVL_ERROR) << GetTraceName() << "receiveFragmentHeader: senders have been disconnected for "
138  // << TimeUtils::GetElapsedTime(last_recv_time_) << " s (receive_socket_disconnected_wait_s = " << receive_disconnected_wait_s_ << " s). RETURNING DATA_END!";
139  // return DATA_END;
140  // }
141  //if (++not_connected_count_ > receive_err_threshold_) { return DATA_END; }
142  TLOG(7) << GetTraceName() << "receiveFragmentHeader: Receive socket not connected, returning RECV_TIMEOUT";
143  usleep(receive_err_wait_us_);
144  return RECV_TIMEOUT;
145  }
146  receive_socket_has_been_connected_ = true;
147  last_recv_time_ = std::chrono::steady_clock::now();
148 
149  TLOG(5) << GetTraceName() << "receiveFragmentHeader timeout_usec=" << timeout_usec;
150  //void* buff=alloca(max_fragment_size_words_*8);
151  size_t byte_cnt = 0;
152  int sts;
153  int offset = 0;
154  SocketState state = SocketState::Metadata;
155  int target_bytes = sizeof(MessHead);
156  uint64_t start_time_us = TimeUtils::gettimeofday_us();
157 
158  //while (active_receive_fd_ != -1)
159  //{
160  // TLOG(TLVL_TRACE) << GetTraceName() << "Currently receiving from fd " << active_receive_fd_ << ", waiting!";
161  // usleep(1000);
162  //}
163 
164  uint8_t* buff;
165 
166  int timeout_ms;
167  if (timeout_usec == 0)
168  {
169  timeout_ms = 0;
170  }
171  else
172  {
173  timeout_ms = (timeout_usec + 999) / 1000; // want at least 1 ms
174  }
175 
176  bool done = false;
177  bool noDataWarningSent = false;
178  int loop_guard = 0;
179 
180  while (!done && getConnectedFDCount_(source_rank()) > 0)
181  {
182  if (getActiveFD_(source_rank()) == -1)
183  {
184  loop_guard = 0;
185  size_t fd_count = 0;
186  std::vector<pollfd> pollfds;
187  {
188  std::lock_guard<std::mutex> lk(fd_mutex_);
189  fd_count = connected_fds_[source_rank()].size();
190  pollfds.resize(fd_count);
191  auto iter = connected_fds_[source_rank()].begin();
192  for (size_t ii = 0; ii < fd_count; ++ii)
193  {
194  pollfds[ii].events = POLLIN | POLLPRI | POLLERR;
195  pollfds[ii].fd = *iter;
196  ++iter;
197  }
198  }
199  //TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragment: Polling fd to see if there's data" ;
200  int num_fds_ready = poll(&pollfds[0], fd_count, timeout_ms);
201  if (num_fds_ready <= 0)
202  {
203  TLOG(5) << GetTraceName() << "receiveFragmentHeader: No data on receive socket, returning RECV_TIMEOUT";
204  return RECV_TIMEOUT;
205  }
206 
207  size_t index = 0;
208  if (getLastActiveFD_(source_rank()) != -1)
209  {
210  for (auto& pollfd : pollfds)
211  {
212  index++;
213  if (pollfd.fd == getLastActiveFD_(source_rank()))
214  {
215  break;
216  }
217  }
218  }
219 
220  int active_index = -1;
221  int16_t anomolous_events = 0;
222  for (size_t ii = index; ii < index + pollfds.size(); ++ii)
223  {
224  auto pollfd_index = (ii + index) % pollfds.size();
225  setActiveFD_(source_rank(), pollfds[pollfd_index].fd);
226  if ((pollfds[pollfd_index].revents & (POLLIN | POLLPRI)) != 0)
227  {
228  active_index = pollfd_index;
229  break;
230  }
231  if ((pollfds[pollfd_index].revents & (POLLHUP | POLLERR)) != 0)
232  {
233  disconnect_receive_socket_("Poll returned POLLHUP or POLLERR, indicating problems with the sender.");
234  continue;
235  }
236  else if ((pollfds[pollfd_index].revents & (POLLNVAL)) != 0)
237  {
238  TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentHeader: FD is closed, most likely because the peer went away. Removing from fd list.";
239  disconnect_receive_socket_("FD is closed, most likely because the peer went away.");
240  continue;
241  }
242  else if (pollfds[pollfd_index].revents != 0)
243  {
244  anomolous_events |= pollfds[pollfd_index].revents;
245  }
246  }
247 
248  if (active_index == -1)
249  {
250  if (anomolous_events != 0)
251  {
252  TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentHeader: Wrong event received from a pollfd. Mask: " << static_cast<int>(anomolous_events);
253  }
254  setActiveFD_(source_rank(), -1);
255  continue;
256  }
257 
258  if (!done && timeout_usec > 0)
259  {
260  // calc next timeout_ms (unless timed out)
261  size_t delta_us = TimeUtils::gettimeofday_us() - start_time_us;
262  if (delta_us > timeout_usec)
263  {
264  return RECV_TIMEOUT;
265  }
266  timeout_ms = ((timeout_usec - delta_us) + 999) / 1000; // want at least 1 ms
267  }
268  }
269  if (loop_guard > 10) { usleep(1000); }
270  if (++loop_guard > 10010)
271  {
272  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentHeader: loop guard triggered, returning RECV_TIMEOUT";
273  usleep(receive_err_wait_us_);
274  setActiveFD_(source_rank(), -1);
275  return RECV_TIMEOUT;
276  }
277 
278  if (state == SocketState::Metadata)
279  {
280  //TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentHeader: Reading Message Header" ;
281  buff = &(mha[offset]); // NOLINT(cppcoreguidelines-pro-bounds-constant-array-index)
282  byte_cnt = sizeof(MessHead) - offset;
283  }
284  else
285  {
286  //TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentHeader: Reading data" ;
287  buff = reinterpret_cast<uint8_t*>(&header) + offset; // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast,cppcoreguidelines-pro-bounds-pointer-arithmetic)
288  byte_cnt = target_bytes - offset;
289  }
290  //if (byte_cnt > sizeof(MessHead))
291  // {
292  // TLOG(TLVL_ERROR) << "Invalid byte count for read (count=" << byte_cnt
293  // << ",offset=" << offset << ",mh.byte_count=" << mh.byte_count
294  // << "), skipping read and returning RECV_TIMEOUT";
295  // return RECV_TIMEOUT;
296  //}
297 
298  auto fd = getActiveFD_(source_rank());
299  if (byte_cnt > 0)
300  {
301  TLOG(6) << GetTraceName() << "receiveFragmentHeader: Reading " << byte_cnt << " bytes from socket " << fd;
302  sts = read(fd, buff, byte_cnt);
303  TLOG(6) << GetTraceName() << "receiveFragmentHeader: Done with read";
304  }
305  if (sts > 0)
306  {
307  loop_guard = 0;
308  last_recv_time_ = std::chrono::steady_clock::now();
309  }
310 
311  TLOG(7) << GetTraceName() << "receiveFragmentHeader state=" << static_cast<int>(state) << " read=" << sts;
312  if (sts < 0 && errno != EAGAIN)
313  {
314  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentHeader: Error on receive, closing socket " << fd
315  << " (errno=" << errno << ": " << strerror(errno) << ")";
316  disconnect_receive_socket_("Error on receive");
317  }
318  else if (sts == 0 || errno == EAGAIN)
319  {
320  if (!noDataWarningSent)
321  {
322  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentHeader: No data received, is the sender still sending?!?";
323  noDataWarningSent = true;
324  }
325  if (TimeUtils::GetElapsedTime(last_recv_time_) > receive_disconnected_wait_s_)
326  {
327  TLOG(TLVL_ERROR) << GetTraceName() << "receiveFragmentHeader: No data received within timeout, aborting!";
328  return RECV_TIMEOUT;
329  }
330  }
331  else
332  {
333  // see if we're done (with this state)
334  sts = offset += sts;
335  if (sts >= target_bytes)
336  {
337  TLOG(7) << GetTraceName() << "receiveFragmentHeader: Target read bytes reached. Changing state";
338  offset = 0;
339  if (state == SocketState::Metadata)
340  {
341  state = SocketState::Data;
342  mh.byte_count = ntohl(mh.byte_count);
343  mh.source_id = ntohs(mh.source_id);
344  target_bytes = mh.byte_count;
345  TLOG(7) << GetTraceName() << "receiveFragmentHeader: Expected header size = " << target_bytes << ", sizeof(RawFragmentHeader) = " << sizeof(artdaq::detail::RawFragmentHeader);
346  //assert(target_bytes == sizeof(artdaq::detail::RawFragmentHeader) || target_bytes == 0);
347 
348  if (mh.message_type == MessHead::stop_v0)
349  {
350  disconnect_receive_socket_("Stop Message received.");
351  }
352  else if (mh.message_type == MessHead::data_v0 || mh.message_type == MessHead::data_more_v0)
353  {
354  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentHeader: Message header indicates that Fragment data follows when I was expecting a Fragment header!";
355  disconnect_receive_socket_("Desync detected");
356  }
357 
358  if (target_bytes == 0)
359  {
360  //Probably a stop_v0, return timeout so we can try again.
361  return RECV_TIMEOUT;
362  }
363  }
364  else
365  {
366  ret_rank = source_rank();
367  TLOG(8) << GetTraceName() << "receiveFragmentHeader done sts=" << sts << " src=" << ret_rank;
368  TLOG(7) << GetTraceName() << "receiveFragmentHeader: Done receiving fragment header. Moving into output.";
369 
370  done = true; // no more polls
371  //break; // no more read of ready fds
372  }
373  }
374  }
375 
376  } // while(!done)...poll
377 
378  TLOG(5) << GetTraceName() << "receiveFragmentHeader: Returning " << ret_rank;
379  return ret_rank;
380 }
381 
382 void artdaq::TCPSocketTransfer::disconnect_receive_socket_(const std::string& msg)
383 {
384  std::lock_guard<std::mutex> lk(fd_mutex_);
385  auto fd = active_receive_fds_[source_rank()];
386  TLOG(TLVL_WARNING) << GetTraceName() << "disconnect_receive_socket_: " << msg << " Closing socket " << fd << " for rank " << source_rank();
387  close(fd);
388  if (connected_fds_.count(source_rank()) != 0u)
389  {
390  connected_fds_[source_rank()].erase(fd);
391  }
392  active_receive_fds_[source_rank()] = -1;
393  TLOG(TLVL_DEBUG) << GetTraceName() << "disconnect_receive_socket_: There are now " << connected_fds_[source_rank()].size() << " active senders.";
394 }
395 
396 int artdaq::TCPSocketTransfer::receiveFragmentData(RawDataType* destination, size_t /*wordCount*/)
397 {
398  TLOG(19) << GetTraceName() << "receiveFragmentData: BEGIN";
399  int ret_rank = RECV_TIMEOUT;
400  if (getActiveFD_(source_rank()) == -1)
401  { // what if just listen_fd???
402  TLOG(TLVL_ERROR) << GetTraceName() << "receiveFragmentData: Receive socket not connected, returning RECV_TIMEOUT (Will result in \"Unexpected return code error\")";
403  return RECV_TIMEOUT;
404  }
405 
406  //void* buff=alloca(max_fragment_size_words_*8);
407  uint8_t* buff;
408  size_t byte_cnt = 0;
409  int sts;
410  int offset = 0;
411  SocketState state = SocketState::Metadata;
412  int target_bytes = sizeof(MessHead);
413 
414  pollfd pollfd_s;
415  pollfd_s.events = POLLIN | POLLPRI | POLLERR;
416  pollfd_s.fd = getActiveFD_(source_rank());
417 
418  int loop_guard = 0;
419  bool done = false;
420  bool noDataWarningSent = false;
421  last_recv_time_ = std::chrono::steady_clock::now();
422  while (!done)
423  {
424  TLOG(9) << GetTraceName() << "receiveFragmentData: Polling fd to see if there's data";
425  int num_fds_ready = poll(&pollfd_s, 1, 1000);
426  TLOG(TLVL_TRACE) << GetTraceName() << "receiveFragmentData: Polled fd to see if there's data"
427  << ", num_fds_ready = " << num_fds_ready;
428  if (num_fds_ready <= 0)
429  {
430  if (num_fds_ready == 0)
431  {
432  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: No data from " << source_rank() << " in " << TimeUtils::GetElapsedTimeMilliseconds(last_recv_time_) << " ms!"
433  << " State = " << (state == SocketState::Metadata ? "Metadata" : "Data") << ", recvd/total=" << offset << "/" << target_bytes << " (delta=" << target_bytes - offset << ")";
434 
435  if (TimeUtils::GetElapsedTime(last_recv_time_) > receive_disconnected_wait_s_)
436  {
437  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: No data received within timeout (" << TimeUtils::GetElapsedTime(last_recv_time_) << " / " << receive_disconnected_wait_s_ << " ), returning RECV_TIMEOUT";
438  disconnect_receive_socket_("No data on this socket within timeout");
439  return RECV_TIMEOUT;
440  }
441  continue;
442  }
443 
444  TLOG(TLVL_ERROR) << "Error in poll: errno=" << errno;
445  break;
446  }
447 
448  last_recv_time_ = std::chrono::steady_clock::now();
449 
450  if ((pollfd_s.revents & (POLLIN | POLLPRI)) != 0)
451  {
452  // Expected, don't have to check revents any further
453  }
454  else if ((pollfd_s.revents & (POLLNVAL)) != 0)
455  {
456  disconnect_receive_socket_("FD is closed, most likely because the peer went away.");
457  break;
458  }
459  else if ((pollfd_s.revents & (POLLHUP | POLLERR)) != 0)
460  {
461  disconnect_receive_socket_("Poll returned POLLHUP or POLLERR, indicating problems with the sender.");
462  break;
463  }
464  else
465  {
466  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: Wrong event received from pollfd: " << pollfd_s.revents;
467  disconnect_receive_socket_("Wrong event received from pollfd.");
468  break;
469  }
470 
471  if (state == SocketState::Metadata)
472  {
473  //TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentData: Reading Message Header" ;
474  buff = &(mha[offset]); // NOLINT(cppcoreguidelines-pro-bounds-constant-array-index)
475  byte_cnt = sizeof(MessHead) - offset;
476  }
477  else
478  {
479  //TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentData: Reading data" ;
480  buff = reinterpret_cast<uint8_t*>(destination) + offset; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic,cppcoreguidelines-pro-type-reinterpret-cast)
481  byte_cnt = mh.byte_count - offset;
482  }
483 
484  TLOG(10) << GetTraceName() << "receiveFragmentData: Reading " << byte_cnt << " bytes from socket into " << static_cast<void*>(buff);
485  sts = read(getActiveFD_(source_rank()), buff, byte_cnt);
486  //TLOG(TLVL_DEBUG) << GetTraceName() << "receiveFragmentData: Done with read" ;
487 
488  TLOG(10) << GetTraceName() << "recvFragment state=" << static_cast<int>(state) << " read=" << sts;
489 
490  if (sts == 0 || (sts < 0 && errno == EAGAIN))
491  {
492  sts = 0; // Treat EAGAIN as receiving no data
493  if (loop_guard > 10) { usleep(1000); }
494  if (++loop_guard > 10010)
495  {
496  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: loop guard triggered, returning RECV_TIMEOUT";
497  setActiveFD_(source_rank(), -1);
498  return RECV_TIMEOUT;
499  }
500  }
501  else if (sts > 0)
502  {
503  loop_guard = 0;
504  last_recv_time_ = std::chrono::steady_clock::now();
505  }
506 
507  if (sts < 0)
508  {
509  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: Error on receive, closing socket"
510  << " (errno=" << errno << ": " << strerror(errno) << ")";
511  disconnect_receive_socket_("Error on receive");
512  }
513  else if (sts == 0)
514  {
515  if (!noDataWarningSent)
516  {
517  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: No data received, is the sender still sending?!?";
518  noDataWarningSent = true;
519  }
520  if (TimeUtils::GetElapsedTime(last_recv_time_) > receive_disconnected_wait_s_)
521  {
522  TLOG(TLVL_ERROR) << GetTraceName() << "receiveFragmentData: No data received within timeout, aborting!";
523  return RECV_TIMEOUT;
524  }
525  }
526  else
527  {
528  // see if we're done (with this state)
529  sts = offset += sts;
530  if (sts >= target_bytes)
531  {
532  TLOG(9) << GetTraceName() << "receiveFragmentData: Target read bytes reached. Changing state";
533  offset = 0;
534  if (state == SocketState::Metadata)
535  {
536  state = SocketState::Data;
537  mh.byte_count = ntohl(mh.byte_count);
538  mh.source_id = ntohs(mh.source_id);
539  target_bytes = mh.byte_count;
540 
541  if (mh.message_type == MessHead::header_v0)
542  {
543  TLOG(TLVL_WARNING) << GetTraceName() << "receiveFragmentData: Message header indicates that a Fragment header follows when I was expecting Fragment data!";
544  disconnect_receive_socket_("Desync detected");
545  }
546  }
547  else
548  {
549  ret_rank = source_rank();
550  TLOG(11) << GetTraceName() << "receiveFragmentData done sts=" << sts << " src=" << ret_rank;
551  TLOG(9) << GetTraceName() << "receiveFragmentData: Done receiving fragment. Moving into output.";
552 
553 #if USE_ACKS
554  send_ack_(active_receive_fd_);
555 #endif
556 
557  done = true; // no more polls
558  //break; // no more read of ready fds
559  }
560  }
561  }
562 
563  // Check if we were asked to do a 0-size receive
564  if (target_bytes == 0 && state == SocketState::Data)
565  {
566  ret_rank = source_rank();
567  TLOG(11) << GetTraceName() << "receiveFragmentData done sts=" << sts << " src=" << ret_rank;
568  TLOG(9) << GetTraceName() << "receiveFragmentData: Done receiving fragment. Moving into output.";
569 
570 #if USE_ACKS
571  send_ack_(active_receive_fd_);
572 #endif
573 
574  done = true; // no more polls
575  }
576 
577  } // while(!done)...poll
578 
579  setLastActiveFD_(source_rank(), getActiveFD_(source_rank()));
580  setActiveFD_(source_rank(), -1);
581 
582  TLOG(9) << GetTraceName() << "receiveFragmentData: Returning rank " << ret_rank;
583  return ret_rank;
584 }
585 
587 {
588  switch (role())
589  {
591  return send_fd_ != -1;
593  auto count = getConnectedFDCount_(source_rank());
594  TLOG(TLVL_DEBUG) << GetTraceName() << "isRunning: There are " << count << " fds connected.";
595  return count > 0;
596  }
597  return false;
598 }
599 
601 {
602  std::lock_guard<std::mutex> lk(fd_mutex_);
603  if (connected_fds_.count(source_rank()) != 0u)
604  {
605  auto it = connected_fds_[source_rank()].begin();
606  char discard_buf[0x1000];
607  while (it != connected_fds_[source_rank()].end())
608  {
609  TLOG(TLVL_INFO) << GetTraceName() << "flush_buffers: Checking for data in socket " << *it << " for rank " << source_rank();
610  size_t bytes_read = 0;
611  while (int sts = static_cast<int>(read(*it, discard_buf, sizeof(discard_buf)) > 0))
612  {
613  bytes_read += sts;
614  }
615  if (bytes_read > 0)
616  {
617  TLOG(TLVL_WARNING) << GetTraceName() << "flush_buffers: Flushed " << bytes_read << " bytes from socket " << *it << " for rank " << source_rank();
618  }
619  TLOG(TLVL_INFO) << GetTraceName() << "flush_buffers: Closing socket " << *it << " for rank " << source_rank();
620  close(*it);
621  it = connected_fds_[source_rank()].erase(it);
622  }
623  connected_fds_.erase(source_rank());
624  }
625  active_receive_fds_[source_rank()] = -1;
626  last_active_receive_fds_[source_rank()] = -1;
627 }
628 
629 // Send the given Fragment. Return the rank of the destination to which
630 // the Fragment was sent OR -1 if to none.
631 artdaq::TransferInterface::CopyStatus artdaq::TCPSocketTransfer::sendFragment_(Fragment&& frag, size_t send_timeout_usec)
632 {
633  TLOG(12) << GetTraceName() << "sendFragment begin send of fragment with sequenceID=" << frag.sequenceID();
634  artdaq::Fragment grab_ownership_frag = std::move(frag);
635 
636  reconnect_();
637  if (send_fd_ == -1 && connection_was_lost_)
638  {
639  TLOG(TLVL_INFO) << GetTraceName() << "reconnection attempt failed, returning quickly.";
641  }
642 
643  // Send Fragment Header
644 
645 #if USE_ACKS
646  // Wait for fragments to be received
647  while (static_cast<size_t>(send_ack_diff_) > buffer_count_) usleep(10000);
648 #endif
649 
650  iovec iov = {static_cast<void*>(grab_ownership_frag.headerAddress()),
651  detail::RawFragmentHeader::num_words() * sizeof(RawDataType)};
652 
653  auto sts = sendData_(&iov, 1, send_retry_timeout_us_, true);
654  auto start_time = std::chrono::steady_clock::now();
655  //If it takes more than 10 seconds to write a Fragment header, give up
656  while (sts == CopyStatus::kTimeout && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec) && TimeUtils::GetElapsedTimeMicroseconds(start_time) < 10000000)
657  {
658  TLOG(13) << GetTraceName() << "sendFragment: Timeout sending fragment";
659  sts = sendData_(&iov, 1, send_retry_timeout_us_, true);
660  usleep(1000);
661  }
662  if (sts != CopyStatus::kSuccess)
663  {
664  return sts;
665  }
666 
667  // Send Fragment Data
668 
669  iov = {static_cast<void*>(grab_ownership_frag.headerAddress() + detail::RawFragmentHeader::num_words()), // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
670  grab_ownership_frag.sizeBytes() - detail::RawFragmentHeader::num_words() * sizeof(RawDataType)};
671  sts = sendData_(&iov, 1, send_retry_timeout_us_);
672  start_time = std::chrono::steady_clock::now();
673  while (sts == CopyStatus::kTimeout && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec) && TimeUtils::GetElapsedTimeMicroseconds(start_time) < 10000000)
674  {
675  TLOG(13) << GetTraceName() << "sendFragment: Timeout sending fragment";
676  sts = sendData_(&iov, 1, send_retry_timeout_us_);
677  usleep(1000);
678  }
679 
680 #if USE_ACKS
681  send_ack_diff_++;
682 #endif
683 
684  TLOG(12) << GetTraceName() << "sendFragment returning " << CopyStatusToString(sts);
685  return sts;
686 }
687 
688 artdaq::TransferInterface::CopyStatus artdaq::TCPSocketTransfer::sendData_(const void* buf, size_t bytes, size_t send_timeout_usec, bool isHeader)
689 {
690  TLOG(TLVL_DEBUG) << GetTraceName() << "sendData_ Converting buf to iovec";
691  iovec iov = {const_cast<void*>(buf), bytes}; // NOLINT(cppcoreguidelines-pro-type-const-cast)
692  return sendData_(&iov, 1, send_timeout_usec, isHeader);
693 }
694 
695 artdaq::TransferInterface::CopyStatus artdaq::TCPSocketTransfer::sendData_(const struct iovec* iov, int iovcnt, size_t send_timeout_usec, bool isHeader)
696 {
697  // check all connected??? -- currently just check fd!=-1
698  if (send_fd_ == -1)
699  {
700  if (timeoutMessageArmed_)
701  {
702  TLOG(TLVL_DEBUG) << GetTraceName() << "sendData_: Send fd is not open. Returning kTimeout";
703  timeoutMessageArmed_ = false;
704  }
705  return CopyStatus::kTimeout;
706  }
707  timeoutMessageArmed_ = true;
708  TLOG(14) << GetTraceName() << "send_timeout_usec is " << send_timeout_usec << ", currently unused.";
709 
710  //TLOG(TLVL_DEBUG) << GetTraceName() << "sendData_: Determining write size" ;
711  uint32_t total_to_write_bytes = 0;
712  std::vector<iovec> iov_in(iovcnt + 1); // need contiguous (for the unlike case that only partial MH
713  std::vector<iovec> iovv(iovcnt + 2); // 1 more for mh and another one for any partial
714  int ii;
715  for (ii = 0; ii < iovcnt; ++ii)
716  {
717  iov_in[ii + 1] = iov[ii]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
718  total_to_write_bytes += iov[ii].iov_len; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
719  }
720  //TLOG(TLVL_DEBUG) << GetTraceName() << "sendData_: Constructing Message Header" ;
721  MessHead mh = {0, isHeader ? MessHead::header_v0 : MessHead::data_v0, htons(source_rank()), {htonl(total_to_write_bytes)}};
722  iov_in[0].iov_base = &mh;
723  iov_in[0].iov_len = sizeof(mh);
724  total_to_write_bytes += sizeof(mh);
725 
726  ssize_t sts = 0;
727  ssize_t total_written_bytes = 0;
728  ssize_t per_write_max_bytes = (32 * 1024);
729 
730  size_t in_iov_idx = 0; // only increment this when we know the associated data has been xferred
731  size_t out_iov_idx = 0;
732  ssize_t this_write_bytes = 0;
733 
734  do
735  {
736  // The first out_iov may be set at the end of the previous loop.
737  // iov looping from below (b/c of the latter, we need to check this_write_bytes)
738  for (;
739  (in_iov_idx + out_iov_idx) < iov_in.size() && this_write_bytes < per_write_max_bytes;
740  ++out_iov_idx)
741  {
742  this_write_bytes += iov_in[in_iov_idx + out_iov_idx].iov_len;
743  iovv[out_iov_idx] = iov_in[in_iov_idx + out_iov_idx];
744  }
745  if (this_write_bytes > per_write_max_bytes)
746  {
747  iovv[out_iov_idx - 1].iov_len -= this_write_bytes - per_write_max_bytes;
748  this_write_bytes = per_write_max_bytes;
749  }
750 
751  // need to do blocking algorithm -- including throttled block notifications
752  do_again:
753 #ifndef __OPTIMIZE__ // This can be an expensive TRACE call (even if disabled) due to multiplicity of calls
754  TLOG(14) << GetTraceName() << "sendFragment b4 writev " << std::setw(7) << total_written_bytes << " total_written_bytes send_fd_=" << send_fd_ << " in_idx=" << in_iov_idx
755  << " iovcnt=" << out_iov_idx << " 1st.len=" << iovv[0].iov_len;
756 #endif
757 //TLOG(TLVL_DEBUG) << GetTraceName() << " calling writev" ;
758 #if USE_SENDMSG
759  msghdr msg;
760  memset(&msg, 0, sizeof(msghdr));
761  msg.msg_iov = &(iovv[0]);
762  msg.msg_iovlen = out_iov_idx; // at this point out_iov_idx is really the count (not an idx per se)
763  sts = sendmsg(send_fd_, &msg, MSG_NOSIGNAL | (blocking != 0u ? 0 : MSG_DONTWAIT));
764 #else
765  sts = writev(send_fd_, &(iovv[0]), out_iov_idx); // SIGPIPE may occur -- need signal handler or mask/ignore
766 #endif
767  //TLOG(TLVL_DEBUG) << GetTraceName() << " done with writev" ;
768 
769  if (sts == -1)
770  {
771  if (errno == EAGAIN /* same as EWOULDBLOCK */)
772  {
773  TLOG(TLVL_DEBUG) << GetTraceName() << "sendFragment EWOULDBLOCK";
774  blocking = 1u;
775 
776  fcntl(send_fd_, F_SETFL, 0); // clear O_NONBLOCK
777 
778  // NOTE: YES -- could drop here
779  goto do_again;
780  }
781  TLOG(TLVL_WARNING) << GetTraceName() << "sendFragment_: WRITE ERROR " << errno << ": " << strerror(errno);
782  connect_state = 0; // any write error closes
783  close(send_fd_);
784  send_fd_ = -1;
785  connection_was_lost_ = true;
787  }
788  if (sts != this_write_bytes)
789  {
790  // we'll loop around -- with
791  TLOG(TLVL_DEBUG) << GetTraceName() << "sendFragment writev sts(" << sts << ")!=requested_send_bytes(" << this_write_bytes << ")";
792  total_written_bytes += sts; // add sts to total_written_bytes now as sts is adjusted next
793  // find which iovs are done
794  for (ii = 0; static_cast<size_t>(sts) >= iovv[ii].iov_len; ++ii)
795  {
796  sts -= iovv[ii].iov_len;
797  }
798  in_iov_idx += ii; // done with these in_iovs
799  iovv[ii].iov_len -= sts; // adjust partial iov
800  iovv[ii].iov_base = static_cast<uint8_t*>(iovv[ii].iov_base) + sts; // adjust partial iov // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
801 
802  // add more to get up to per_write_max_bytes
803  out_iov_idx = 0;
804  if (ii != 0)
805  {
806  iovv[out_iov_idx] = iovv[ii];
807  }
808  // starting over
809  this_write_bytes = iovv[out_iov_idx].iov_len;
810  // add any left over from appropriate in_iov_idx --
811  // i.e. match this out_iov with the in_iov that was used to
812  // initialize it; see how close the out base+len is to in base+len
813  // check !>per_write_max_bytes
814  auto additional = (reinterpret_cast<uintptr_t>(iov_in[in_iov_idx].iov_base) + iov_in[in_iov_idx].iov_len) - (reinterpret_cast<uintptr_t>(iovv[out_iov_idx].iov_base) + iovv[out_iov_idx].iov_len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
815  if (additional != 0u)
816  {
817  iovv[out_iov_idx].iov_len += additional;
818  this_write_bytes += additional;
819  if (this_write_bytes > per_write_max_bytes)
820  {
821  iovv[out_iov_idx].iov_len -= this_write_bytes - per_write_max_bytes;
822  this_write_bytes = per_write_max_bytes;
823  }
824  }
825  ++out_iov_idx; // done with
826  TLOG(TLVL_TRACE) << GetTraceName() << "sendFragment writev sts!=: this_write_bytes=" << this_write_bytes
827  << " out_iov_idx=" << out_iov_idx
828  << " additional=" << additional
829  << " ii=" << ii;
830  }
831  else
832  {
833 #ifndef __OPTIMIZE__ // This can be an expensive TRACE call (even if disabled) due to multiplicity of calls
834  TLOG(TLVL_TRACE) << GetTraceName() << "sendFragment writev sts(" << sts << ")==requested_send_bytes(" << this_write_bytes << ")";
835 #endif
836  total_written_bytes += sts;
837  --out_iov_idx; // make it the index of the last iovv
838  iovv[out_iov_idx].iov_base = static_cast<uint8_t*>(iovv[out_iov_idx].iov_base) + iovv[out_iov_idx].iov_len; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
839  iovv[out_iov_idx].iov_len = 0;
840  in_iov_idx += out_iov_idx; // at least this many complete (one more if "last iovv" is complete
841  this_write_bytes = 0;
842  // need to check last iovv against appropriate iov_in
843  auto additional = (reinterpret_cast<uintptr_t>(iov_in[in_iov_idx].iov_base) + iov_in[in_iov_idx].iov_len) - (reinterpret_cast<uintptr_t>(iovv[out_iov_idx].iov_base) + iovv[out_iov_idx].iov_len); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
844  if (additional != 0u)
845  {
846  iovv[out_iov_idx].iov_len += additional;
847  this_write_bytes += additional;
848  if (this_write_bytes > per_write_max_bytes)
849  {
850  iovv[out_iov_idx].iov_len -= this_write_bytes - per_write_max_bytes;
851  this_write_bytes = per_write_max_bytes;
852  }
853  if (out_iov_idx != 0)
854  {
855  iovv[0] = iovv[out_iov_idx];
856  }
857  out_iov_idx = 1;
858  }
859  else
860  {
861  ++in_iov_idx;
862  out_iov_idx = 0;
863  }
864  }
865  } while (total_written_bytes < total_to_write_bytes);
866  if (total_written_bytes > total_to_write_bytes)
867  {
868  TLOG(TLVL_ERROR) << GetTraceName() << "sendFragment program error: too many bytes transferred";
869  }
870 
871  if (blocking != 0u)
872  {
873  blocking = 0u;
874  fcntl(send_fd_, F_SETFL, O_NONBLOCK); // set O_NONBLOCK
875  }
876  sts = total_written_bytes - sizeof(MessHead);
877 
878  TLOG(14) << GetTraceName() << "sendFragment sts=" << sts;
880 }
881 
882 void artdaq::TCPSocketTransfer::connect_()
883 {
884  auto start_time = std::chrono::steady_clock::now();
885 
886  // Retry a few times if we can't connect
887  while (send_fd_ == -1 && TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_retry_timeout_us_ * 10)
888  {
889  TLOG(TLVL_DEBUG) << GetTraceName() << "Connecting sender socket";
890  int sndbuf_bytes = static_cast<int>(sndbuf_);
891  if (sndbuf_ > INT_MAX)
892  {
893  sndbuf_bytes = INT_MAX;
894  TLOG(TLVL_WARNING) << "Requested SNDBUF " << sndbuf_ << " too large, setting to INT_MAX: " << INT_MAX;
895  }
896  TLOG(TLVL_DEBUG) << "Requested SNDBUF is " << sndbuf_bytes;
897 
898  send_fd_ = TCPConnect(hostMap_[destination_rank()].c_str(), portMan->GetTCPSocketTransferPort(destination_rank()), O_NONBLOCK, sndbuf_bytes);
899  if (send_fd_ == -1)
900  {
901  if (connection_was_lost_) { break; }
902 
903  usleep(send_retry_timeout_us_);
904  }
905  }
906  connect_state = 0;
907  blocking = 0;
908  TLOG(TLVL_DEBUG) << GetTraceName() << "connect_ " + hostMap_[destination_rank()] + ":" << portMan->GetTCPSocketTransferPort(destination_rank()) << " send_fd_=" << send_fd_;
909  if (send_fd_ != -1)
910  {
911  // write connect msg
912  TLOG(TLVL_DEBUG) << GetTraceName() << "connect_: Writing connect message";
913  MessHead mh = {0, MessHead::connect_v0, htons(source_rank()), {htonl(CONN_MAGIC)}};
914  ssize_t sts = write(send_fd_, &mh, sizeof(mh));
915  if (sts == -1)
916  {
917  TLOG(TLVL_ERROR) << GetTraceName() << "connect_: Error writing connect message!";
918  // a write error here is completely unexpected!
919  connect_state = 0;
920  close(send_fd_);
921  send_fd_ = -1;
922  }
923  else
924  {
925  TLOG(TLVL_INFO) << GetTraceName() << "connect_: Successfully connected";
926  // consider it all connected/established
927  connect_state = 1;
928  connection_was_lost_ = false;
929  }
930 
931 #if USE_ACKS
932  if (ack_listen_thread_ && ack_listen_thread_->joinable()) ack_listen_thread_->join();
933  TLOG(TLVL_INFO) << GetTraceName() << "Starting Ack Listener Thread";
934 
935  try
936  {
937  ack_listen_thread_ = std::make_unique<boost::thread>(&TCPSocketTransfer::receive_acks_, this);
938  }
939  catch (const boost::exception& e)
940  {
941  TLOG(TLVL_ERROR) << "Caught boost::exception starting TCP Socket Ack Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
942  std::cerr << "Caught boost::exception starting TCP Socket Ack Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
943  exit(5);
944  }
945 #endif
946  }
947 }
948 
949 void artdaq::TCPSocketTransfer::reconnect_()
950 {
951  if (send_fd_ == -1 && role() == TransferInterface::Role::kSend)
952  {
953  TLOG(TLVL_TRACE) << GetTraceName() << "check/reconnect";
954  return connect_();
955  }
956 }
957 
958 void artdaq::TCPSocketTransfer::start_listen_thread_()
959 {
960  std::lock_guard<std::mutex> start_lock(listen_thread_mutex_);
961  if (listen_thread_refcount_ == 0)
962  {
963  if (listen_thread_ && listen_thread_->joinable())
964  {
965  listen_thread_->join();
966  }
967  listen_thread_refcount_ = 1;
968  TLOG(TLVL_INFO) << GetTraceName() << "Starting Listener Thread";
969 
970  try
971  {
972  listen_thread_ = std::make_unique<boost::thread>(&TCPSocketTransfer::listen_, portMan->GetTCPSocketTransferPort(destination_rank()), rcvbuf_);
973  }
974  catch (const boost::exception& e)
975  {
976  TLOG(TLVL_ERROR) << "Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
977  std::cerr << "Caught boost::exception starting TCP Socket Listen thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
978  exit(5);
979  }
980  }
981  else
982  {
983  listen_thread_refcount_++;
984  }
985 }
986 
987 #if USE_ACKS
988 void artdaq::TCPSocketTransfer::receive_acks_()
989 {
990  while (send_fd_ >= 0)
991  {
992  pollfd pollfd_s;
993  pollfd_s.events = POLLIN | POLLPRI;
994  pollfd_s.fd = send_fd_;
995 
996  TLOG(18) << GetTraceName() << "receive_acks_: Polling fd to see if there's data";
997  int num_fds_ready = poll(&pollfd_s, 1, 1000);
998  if (num_fds_ready <= 0)
999  {
1000  if (num_fds_ready == 0)
1001  {
1002  TLOG(18) << GetTraceName() << "receive_acks_: No data on receive socket";
1003  continue;
1004  }
1005 
1006  TLOG(TLVL_ERROR) << "Error in poll: errno=" << errno;
1007  break;
1008  }
1009 
1010  if (pollfd_s.revents & (POLLIN | POLLPRI))
1011  {
1012  // Expected, don't have to check revents any further
1013  }
1014  else
1015  {
1016  TLOG(TLVL_DEBUG) << GetTraceName() << "receive_acks_: Wrong event received from pollfd: " << pollfd_s.revents;
1017  break;
1018  }
1019 
1020  MessHead mh;
1021  auto sts = read(send_fd_, &mh, sizeof(mh));
1022 
1023  if (sts != sizeof(mh))
1024  {
1025  TLOG(TLVL_ERROR) << GetTraceName() << "receive_ack_: Wrong message header length received! (actual " << sts << " != " << sizeof(mh) << " expected)";
1026  continue;
1027  }
1028 
1029  // check for "magic" and valid source_id(aka rank)
1030  mh.source_id = ntohs(mh.source_id); // convert here as it is reference several times
1031  if (mh.source_id != my_rank)
1032  {
1033  TLOG(TLVL_ERROR) << GetTraceName() << "receive_ack_: Received ack for different sender! Rank=" << my_rank << ", hdr=" << mh.source_id;
1034  continue;
1035  }
1036  if (ntohl(mh.conn_magic) != ACK_MAGIC || !(mh.message_type == MessHead::ack_v0)) // Allow for future connect message versions
1037  {
1038  TLOG(TLVL_ERROR) << GetTraceName() << "receive_ack_: Wrong magic bytes in header!";
1039  continue;
1040  }
1041 
1042  TLOG(17) << GetTraceName() << "receive_acks_: Received ack message, diff is now " << (send_ack_diff_.load() - 1);
1043  send_ack_diff_--;
1044  }
1045 }
1046 
1047 void artdaq::TCPSocketTransfer::send_ack_(int fd)
1048 {
1049  MessHead mh = {0, MessHead::ack_v0, htons(source_rank()), {htonl(ACK_MAGIC)}};
1050  write(fd, &mh, sizeof(mh));
1051 }
1052 #endif
1053 
1054 void artdaq::TCPSocketTransfer::listen_(int port, size_t rcvbuf)
1055 {
1056  int listen_fd = -1;
1057  while (listen_thread_refcount_ > 0)
1058  {
1059  TLOG(TLVL_TRACE) << "listen_: Listening/accepting new connections on port " << port;
1060  if (listen_fd == -1)
1061  {
1062  TLOG(TLVL_DEBUG) << "listen_: Opening listener";
1063  listen_fd = TCP_listen_fd(port, rcvbuf);
1064  }
1065  if (listen_fd == -1)
1066  {
1067  TLOG(TLVL_DEBUG) << "listen_: Error creating listen_fd!";
1068  break;
1069  }
1070 
1071  int res;
1072  timeval tv = {2, 0}; // maybe increase of some global "debugging" flag set???
1073  fd_set rfds;
1074  FD_ZERO(&rfds);
1075  FD_SET(listen_fd, &rfds); // NOLINT
1076 
1077  res = select(listen_fd + 1, &rfds, static_cast<fd_set*>(nullptr), static_cast<fd_set*>(nullptr), &tv);
1078  if (res > 0)
1079  {
1080  int sts;
1081  sockaddr_un un;
1082  socklen_t arglen = sizeof(un);
1083  int fd;
1084  TLOG(TLVL_DEBUG) << "listen_: Calling accept";
1085  fd = accept(listen_fd, reinterpret_cast<sockaddr*>(&un), &arglen); // NOLINT(cppcoreguidelines-pro-type-reinterpret-cast)
1086  TLOG(TLVL_DEBUG) << "listen_: Done with accept";
1087 
1088  TLOG(TLVL_DEBUG) << "listen_: Reading connect message";
1089  socklen_t lenlen = sizeof(tv);
1090  /*sts=*/
1091  setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, lenlen); // see man 7 socket.
1092  MessHead mh;
1093  uint64_t mark_us = TimeUtils::gettimeofday_us();
1094  sts = read(fd, &mh, sizeof(mh));
1095  uint64_t delta_us = TimeUtils::gettimeofday_us() - mark_us;
1096  TLOG(TLVL_DEBUG) << "listen_: Read of connect message took " << delta_us << " microseconds.";
1097  if (sts != sizeof(mh))
1098  {
1099  TLOG(TLVL_DEBUG) << "listen_: Wrong message header length received!";
1100  close(fd);
1101  continue;
1102  }
1103 
1104  // check for "magic" and valid source_id(aka rank)
1105  mh.source_id = ntohs(mh.source_id); // convert here as it is reference several times
1106  if (ntohl(mh.conn_magic) != CONN_MAGIC || !(mh.message_type == MessHead::connect_v0)) // Allow for future connect message versions
1107  {
1108  TLOG(TLVL_DEBUG) << "listen_: Wrong magic bytes in header!";
1109  close(fd);
1110  continue;
1111  }
1112 
1113  // now add (new) connection
1114  std::lock_guard<std::mutex> lk(fd_mutex_);
1115  connected_fds_[mh.source_id].insert(fd);
1116 
1117  TLOG(TLVL_INFO) << "listen_: New fd is " << fd << " for source rank " << mh.source_id;
1118  }
1119  else
1120  {
1121  TLOG(16) << "listen_: No connections in timeout interval!";
1122  }
1123  }
1124 
1125  TLOG(TLVL_INFO) << "listen_: Shutting down connection listener";
1126  if (listen_fd != -1)
1127  {
1128  close(listen_fd);
1129  }
1130  std::lock_guard<std::mutex> lk(fd_mutex_);
1131  auto it = connected_fds_.begin();
1132  while (it != connected_fds_.end())
1133  {
1134  auto& fd_set = it->second;
1135  auto rank_it = fd_set.begin();
1136  while (rank_it != fd_set.end())
1137  {
1138  close(*rank_it);
1139  rank_it = fd_set.erase(rank_it);
1140  }
1141  it = connected_fds_.erase(it);
1142  }
1143 
1144 } // do_connect_
1145 
1146 size_t artdaq::TCPSocketTransfer::getConnectedFDCount_(int source_rank)
1147 {
1148  std::lock_guard<std::mutex> lk(fd_mutex_);
1149 #ifndef __OPTIMIZE__
1150  TLOG(15) << GetTraceName() << "getConnectedFDCount_: count is " << (connected_fds_.count(source_rank) != 0u ? connected_fds_[source_rank].size() : 0);
1151 #endif
1152  return connected_fds_.count(source_rank) != 0u ? connected_fds_[source_rank].size() : 0;
1153 }
1154 
1155 int artdaq::TCPSocketTransfer::getActiveFD_(int source_rank)
1156 {
1157  std::lock_guard<std::mutex> lk(fd_mutex_);
1158 #ifndef __OPTIMIZE__
1159  TLOG(15) << GetTraceName() << "getActiveFD_: fd is " << (active_receive_fds_.count(source_rank) != 0u ? active_receive_fds_[source_rank] : -1);
1160 #endif
1161  return active_receive_fds_.count(source_rank) != 0u ? active_receive_fds_[source_rank] : -1;
1162 }
1163 void artdaq::TCPSocketTransfer::setActiveFD_(int source_rank, int fd)
1164 {
1165  std::lock_guard<std::mutex> lk(fd_mutex_);
1166 #ifndef __OPTIMIZE__
1167  TLOG(15) << GetTraceName() << "setActiveFD_: setting active fd for rank " << source_rank << " to " << fd;
1168 #endif
1169  active_receive_fds_[source_rank] = fd;
1170 }
1171 int artdaq::TCPSocketTransfer::getLastActiveFD_(int source_rank)
1172 {
1173  std::lock_guard<std::mutex> lk(fd_mutex_);
1174 #ifndef __OPTIMIZE__
1175  TLOG(15) << GetTraceName() << "getLastActiveFD_: fd is " << (last_active_receive_fds_.count(source_rank) != 0u ? last_active_receive_fds_[source_rank] : -1);
1176 #endif
1177  return last_active_receive_fds_.count(source_rank) != 0u ? last_active_receive_fds_[source_rank] : -1;
1178 }
1179 void artdaq::TCPSocketTransfer::setLastActiveFD_(int source_rank, int fd)
1180 {
1181  std::lock_guard<std::mutex> lk(fd_mutex_);
1182 #ifndef __OPTIMIZE__
1183  TLOG(15) << GetTraceName() << "setLastActiveFD_: setting last active fd for rank " << source_rank << " to " << fd;
1184 #endif
1185  last_active_receive_fds_[source_rank] = fd;
1186 }
1187 
// Expand the DEFINE_ARTDAQ_TRANSFER macro for artdaq::TCPSocketTransfer.
// NOTE(review): presumably this emits the factory hook through which the plugin is
// instantiated by name; the macro definition is not visible in this file -- confirm.
1188 DEFINE_ARTDAQ_TRANSFER(artdaq::TCPSocketTransfer)
bool isRunning() override
Determine whether the TransferInterface plugin is able to send/receive data.
virtual int source_rank() const
Get the source rank for this TransferInterface instance.
uint32_t conn_magic
unsigned first is better for MessHead initializer: {0,0,my_node_idx_,CONN_MAGIC}
Definition: SRSockets.hh:40
int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t timeout_usec) override
Receive a Fragment Header from the transport mechanism.
int TCPConnect(char const *host_in, int dflt_port, int64_t flags=0, int sndbufsiz=0)
Connect to a host on a given port.
Definition: TCPConnect.cc:376
This TransferInterface is a Receiver.
int receiveFragmentData(RawDataType *destination, size_t wordCount) override
Receive the body of a Fragment to the given destination pointer.
int TCP_listen_fd(int port, int rcvbuf)
Create a TCP listening socket on the given port and INADDR_ANY, with the given receive buffer...
TCPSocketTransfer(fhicl::ParameterSet const &ps, Role role)
TCPSocketTransfer Constructor.
This TransferInterface is a Sender.
void flush_buffers() override
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
int32_t byte_count
use CONN_MAGIC for connect_v0, data that follow for data_v0 (and 0 length data)
Definition: SRSockets.hh:41
Some error occurred, but no exception was thrown.
Role
Used to determine if a TransferInterface is a Sender or Receiver.
int64_t source_id
Rank of the source.
Definition: SRSockets.hh:36
MessType message_type
Message Type.
Definition: SRSockets.hh:35
The send operation completed successfully.
This interface defines the functions used to transfer data between artdaq applications.
TransferInterface implementation plugin that sends data using TCP sockets.
This header is sent by the TCPSocket_transfer to allow for more efficient writev calls.
Definition: SRSockets.hh:15
hostMap_t MakeHostMap(fhicl::ParameterSet const &pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Definition: HostMap.hh:65
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.