artdaq  v3_11_00
TableReceiver.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_TableReceiver").c_str()
3 #include "artdaq/DAQrate/detail/TableReceiver.hh"
4 
5 #include <arpa/inet.h>
6 #include <netinet/in.h>
7 #include <poll.h>
8 #include <sys/socket.h>
9 #include <sys/types.h>
10 #include <chrono>
12 #include "canvas/Utilities/Exception.h"
13 
14 artdaq::TableReceiver::TableReceiver(const fhicl::ParameterSet& pset)
15  : use_routing_manager_(pset.get<bool>("use_routing_manager", false))
16  , should_stop_(false)
17  , table_port_(pset.get<int>("table_update_port", 35556))
18  , table_address_(pset.get<std::string>("routing_manager_hostname", "localhost"))
19  , table_socket_(-1)
20  , routing_table_last_(0)
21  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
22  , routing_wait_time_(0)
23  , routing_wait_time_count_(0)
24  , routing_timeout_ms_((pset.get<size_t>("routing_timeout_ms", 1000)))
25  , highest_sequence_id_routed_(0)
26 {
27  TLOG(TLVL_DEBUG) << "Received pset: " << pset.to_string();
28 
29  if (use_routing_manager_)
30  {
31  startTableReceiverThread_();
32  }
33 }
34 
36 {
37  TLOG(TLVL_DEBUG) << "Shutting down TableReceiver BEGIN";
38  should_stop_ = true;
39  disconnectFromRoutingManager_();
40 
41  if (routing_thread_ != nullptr)
42  {
43  try
44  {
45  if (routing_thread_->joinable())
46  {
47  routing_thread_->join();
48  }
49  }
50  catch (...)
51  { // IGNORED
52  }
53  }
54  TLOG(TLVL_DEBUG) << "Shutting down TableReceiver END.";
55 }
56 
58 {
59  std::lock_guard<std::mutex> lk(routing_mutex_);
60  RoutingTable routing_table_copy(routing_table_);
61  return routing_table_copy;
62 }
63 
65 {
66  std::lock_guard<std::mutex> lk(routing_mutex_);
67  RoutingTable routing_table_copy(routing_table_);
68  routing_table_.clear();
69  return routing_table_copy;
70 }
71 
72 int artdaq::TableReceiver::GetRoutingTableEntry(artdaq::Fragment::sequence_id_t seqID)
73 {
74  if (use_routing_manager_)
75  {
76  sendTableUpdateRequest_(seqID);
77  auto routing_timeout_ms = routing_timeout_ms_;
78  if (routing_timeout_ms == 0)
79  {
80  routing_timeout_ms = 3600 * 1000;
81  }
82  auto condition_wait = routing_timeout_ms > 10 ? std::chrono::milliseconds(10) : std::chrono::milliseconds(routing_timeout_ms);
83  auto start_time = std::chrono::steady_clock::now();
84  while (!should_stop_ && TimeUtils::GetElapsedTimeMilliseconds(start_time) < routing_timeout_ms)
85  {
86  std::unique_lock<std::mutex> lk(routing_mutex_);
87  routing_cv_.wait_for(lk, condition_wait, [&]() { return routing_table_.count(seqID); });
88  if (routing_table_.count(seqID))
89  {
90  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
91  return routing_table_.at(seqID);
92  }
93  }
94  TLOG(TLVL_WARNING) << "Bad Omen: Timeout receiving routing information for " << seqID
95  << " in routing_timeout_ms (" << routing_timeout_ms_ << " ms)!";
96 
97  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
98  }
99  return ROUTING_FAILED;
100 }
101 
102 void artdaq::TableReceiver::connectToRoutingManager_()
103 {
104  auto start_time = std::chrono::steady_clock::now();
105  while (table_socket_ < 0 && TimeUtils::GetElapsedTime(start_time) < 30)
106  {
107  table_socket_ = TCPConnect(table_address_.c_str(), table_port_);
108  if (table_socket_ < 0)
109  {
110  TLOG(TLVL_TRACE) << "Waited " << TimeUtils::GetElapsedTime(start_time) << " s for Routing Manager to open table listen socket";
111  usleep(100000);
112  }
113  }
114  if (table_socket_ < 0)
115  {
116  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
117  exit(1);
118  }
119 
120  detail::RoutingRequest startHdr(my_rank);
121  write(table_socket_, &startHdr, sizeof(startHdr));
122 }
123 
124 void artdaq::TableReceiver::disconnectFromRoutingManager_()
125 {
126  detail::RoutingRequest endHdr(my_rank, detail::RoutingRequest::RequestMode::Disconnect);
127  write(table_socket_, &endHdr, sizeof(endHdr));
128  close(table_socket_);
129  table_socket_ = -1;
130 }
131 
132 void artdaq::TableReceiver::startTableReceiverThread_()
133 {
134  if (routing_thread_ != nullptr && routing_thread_->joinable())
135  {
136  routing_thread_->join();
137  }
138  TLOG(TLVL_INFO) << "Starting Routing Thread";
139  try
140  {
141  routing_thread_.reset(new boost::thread(&TableReceiver::receiveTableUpdatesLoop_, this));
142  char tname[16];
143  snprintf(tname, 16, "%s", "RoutingReceive"); // NOLINT
144  auto handle = routing_thread_->native_handle();
145  pthread_setname_np(handle, tname);
146  }
147  catch (const boost::exception& e)
148  {
149  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
150  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
151  exit(5);
152  }
153 }
154 
155 bool artdaq::TableReceiver::receiveTableUpdate_()
156 {
157  TLOG(TLVL_TRACE) << __func__ << ": Polling table socket for new routes (address:port = " << table_address_ << ":" << table_port_ << ")";
158  if (table_socket_ == -1)
159  {
160  TLOG(TLVL_DEBUG) << __func__ << ": Opening table socket";
161  connectToRoutingManager_();
162  }
163  if (table_socket_ == -1)
164  {
165  TLOG(TLVL_DEBUG) << __func__ << ": The table socket was not opened successfully.";
166  return false;
167  }
168 
169  struct pollfd fd;
170  fd.fd = table_socket_;
171  fd.events = POLLIN | POLLPRI;
172 
173  auto res = poll(&fd, 1, 1000);
174  if (res > 0)
175  {
176  if (fd.revents & (POLLIN | POLLPRI))
177  {
178  TLOG(TLVL_DEBUG) << __func__ << ": Going to receive RoutingPacketHeader";
180  ssize_t stss = recv(table_socket_, &hdr, sizeof(hdr), MSG_WAITALL);
181  if (stss != sizeof(hdr))
182  {
183  TLOG(TLVL_ERROR) << "Error reading Table Header from Table socket, errno=" << errno << " (" << strerror(errno) << ")";
184  disconnectFromRoutingManager_();
185  return false;
186  }
187 
188  TLOG(TLVL_DEBUG) << "receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.nEntries << " header=" << std::hex << hdr.header;
189  if (hdr.header != ROUTING_MAGIC)
190  {
191  TLOG(TLVL_TRACE) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC.";
192  return false;
193  }
194  if (hdr.nEntries == 0)
195  {
196  TLOG(TLVL_TRACE) << __func__ << ": Empty Routing Table update received.";
197  return false;
198  }
199 
201  size_t sts = 0;
202  size_t total = sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries;
203  while (sts < total)
204  {
205  stss = read(table_socket_, reinterpret_cast<char*>(&buffer[0]) + sts, total - sts);
206  sts += stss;
207  TLOG(TLVL_DEBUG) << "Read " << stss << " bytes, total " << sts << " / " << total;
208  if (stss < 0)
209  {
210  TLOG(TLVL_ERROR) << "Error reading Table Data from Table socket, errno=" << errno << " (" << strerror(errno) << ")";
211  disconnectFromRoutingManager_();
212  return false;
213  }
214  }
215 
216  auto first = buffer.front().sequence_id;
217  auto last = buffer.back().sequence_id;
218 
219  if (first + hdr.nEntries - 1 != last)
220  {
221  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
222  return false;
223  }
224 
225  auto thisSeqID = first;
226 
227  {
228  std::lock_guard<std::mutex> lck(routing_mutex_);
229  if (routing_table_.count(last) == 0)
230  {
231  for (auto entry : buffer)
232  {
233  if (thisSeqID != entry.sequence_id)
234  {
235  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
236  last = thisSeqID - 1;
237  break;
238  }
239  thisSeqID++;
240  if (routing_table_.count(entry.sequence_id) != 0u)
241  {
242  if (routing_table_[entry.sequence_id] != entry.destination_rank)
243  {
244  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
245  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
246  << " I will use the original value!";
247  }
248  continue;
249  }
250  if (entry.sequence_id < routing_table_last_)
251  {
252  continue;
253  }
254  routing_table_[entry.sequence_id] = entry.destination_rank;
255  TLOG(TLVL_DEBUG) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
256  << " -> Rank " << entry.destination_rank;
257  }
258  }
259 
260  TLOG(TLVL_DEBUG) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
261  if (!routing_table_.empty())
262  {
263  TLOG(TLVL_DEBUG) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
264  }
265 
266  auto counter = 0;
267  for (auto& entry : routing_table_)
268  {
269  TLOG(45) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
270  counter++;
271  }
272  }
273  routing_cv_.notify_all();
274 
275  SendMetrics();
276  return true;
277  }
278  else
279  {
280  TLOG(TLVL_DEBUG) << "Poll indicates socket closure. Disconnecting from Routing Manager";
281  disconnectFromRoutingManager_();
282  return false;
283  }
284  }
285  return false;
286 }
287 
288 void artdaq::TableReceiver::receiveTableUpdatesLoop_()
289 {
290  while (true)
291  {
292  if (should_stop_)
293  {
294  TLOG(TLVL_DEBUG) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
295  disconnectFromRoutingManager_();
296  return;
297  }
298 
299  receiveTableUpdate_();
300  }
301 }
302 
303 void artdaq::TableReceiver::sendTableUpdateRequest_(Fragment::sequence_id_t seq)
304 {
305  TLOG(TLVL_TRACE) << "sendTableUpdateRequest_ BEGIN";
306  {
307  std::lock_guard<std::mutex> lck(routing_mutex_);
308  if (routing_table_.count(seq))
309  {
310  TLOG(TLVL_TRACE) << "sendTableUpdateRequest_ END (no request sent): " << routing_table_.at(seq);
311  return;
312  }
313  }
314  if (table_socket_ == -1)
315  {
316  connectToRoutingManager_();
317  }
318 
319  TLOG(TLVL_DEBUG) << "sendTableUpdateRequest_: Sending table update request for " << my_rank << ", sequence ID " << seq;
320  detail::RoutingRequest pkt(my_rank, seq);
321  write(table_socket_, &pkt, sizeof(pkt));
322 
323  TLOG(TLVL_TRACE) << "sendTableUpdateRequest_ END";
324 }
325 
327 {
328  std::lock_guard<std::mutex> lck(routing_mutex_);
329  return routing_table_.size();
330 }
331 
333 {
334  std::lock_guard<std::mutex> lck(routing_mutex_);
335  // Find the distance from the next highest sequence ID to the end of the list
336  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
337  return dist; // If dist == 1, there is one entry left.
338 }
339 
340 void artdaq::TableReceiver::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
341 {
342  TLOG(15) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table.";
343  std::lock_guard<std::mutex> lck(routing_mutex_);
344  // while (routing_table_.size() > routing_table_max_size_)
345  // {
346  // routing_table_.erase(routing_table_.begin());
347  // }
348  if (routing_table_.find(seq) != routing_table_.end())
349  {
350  routing_table_.erase(routing_table_.find(seq));
351  }
352 }
353 
355 {
356  if (metricMan)
357  {
358  TLOG(5) << "sending metrics";
359  if (use_routing_manager_)
360  {
361  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
362  if (routing_wait_time_ > 0)
363  {
364  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
365  routing_wait_time_ = 0;
366  }
367  }
368  }
369 }
TableReceiver(const fhicl::ParameterSet &ps)
TableReceiver Constructor.
A row of the Routing Table.
int TCPConnect(char const *host_in, int dflt_port, int64_t flags=0, int sndbufsiz=0)
Connect to a host on a given port.
Definition: TCPConnect.cc:376
RoutingTable GetRoutingTable() const
Get a copy of the current RoutingTable.
virtual ~TableReceiver()
TableReceiver Destructor.
void SendMetrics() const
Report metrics to MetricManager.
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
The header of the Routing Table, containing the magic bytes and the number of entries.
int GetRoutingTableEntry(artdaq::Fragment::sequence_id_t seqID)
Get the destination rank for the given sequence ID.
uint64_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
RoutingTable GetAndClearRoutingTable()
Get the current RoutingTable and remove all entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
std::map< artdaq::Fragment::sequence_id_t, int > RoutingTable
Internal representation of a routing table, relating a sequence ID to a destination rank...