artdaq  v3_12_02
TableReceiver.cc
1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_TableReceiver").c_str()
3 #include "artdaq/DAQrate/detail/TableReceiver.hh"
4 
6 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
7 #include "canvas/Utilities/Exception.h"
8 
9 #include <arpa/inet.h>
10 #include <netinet/in.h>
11 #include <poll.h>
12 #include <sys/socket.h>
13 #include <sys/types.h>
14 #include <chrono>
15 
16 artdaq::TableReceiver::TableReceiver(const fhicl::ParameterSet& pset)
17  : use_routing_manager_(pset.get<bool>("use_routing_manager", false))
18  , should_stop_(false)
19  , table_port_(pset.get<int>("table_update_port", 35556))
20  , table_address_(pset.get<std::string>("routing_manager_hostname", "localhost"))
21  , table_socket_(-1)
22  , routing_table_last_(0)
23  , routing_table_max_size_(pset.get<size_t>("routing_table_max_size", 1000))
24  , routing_wait_time_(0)
25  , routing_wait_time_count_(0)
26  , routing_timeout_ms_((pset.get<size_t>("routing_timeout_ms", 1000)))
27  , highest_sequence_id_routed_(0)
28 {
29  TLOG(TLVL_DEBUG + 32) << "Received pset: " << pset.to_string();
30 
31  if (use_routing_manager_)
32  {
33  startTableReceiverThread_();
34  }
35 }
36 
38 {
39  TLOG(TLVL_DEBUG + 32) << "Shutting down TableReceiver BEGIN";
40  should_stop_ = true;
41  disconnectFromRoutingManager_();
42 
43  if (routing_thread_ != nullptr)
44  {
45  try
46  {
47  if (routing_thread_->joinable())
48  {
49  routing_thread_->join();
50  }
51  }
52  catch (...)
53  { // IGNORED
54  }
55  }
56  TLOG(TLVL_DEBUG + 32) << "Shutting down TableReceiver END.";
57 }
58 
60 {
61  std::lock_guard<std::mutex> lk(routing_mutex_);
62  RoutingTable routing_table_copy(routing_table_);
63  return routing_table_copy;
64 }
65 
67 {
68  std::lock_guard<std::mutex> lk(routing_mutex_);
69  RoutingTable routing_table_copy(routing_table_);
70  routing_table_.clear();
71  return routing_table_copy;
72 }
73 
74 int artdaq::TableReceiver::GetRoutingTableEntry(artdaq::Fragment::sequence_id_t seqID)
75 {
76  if (use_routing_manager_)
77  {
78  sendTableUpdateRequest_(seqID);
79  auto routing_timeout_ms = routing_timeout_ms_;
80  if (routing_timeout_ms == 0)
81  {
82  routing_timeout_ms = 3600 * 1000;
83  }
84  auto condition_wait = routing_timeout_ms > 10 ? std::chrono::milliseconds(10) : std::chrono::milliseconds(routing_timeout_ms);
85  auto start_time = std::chrono::steady_clock::now();
86  while (!should_stop_ && TimeUtils::GetElapsedTimeMilliseconds(start_time) < routing_timeout_ms)
87  {
88  std::unique_lock<std::mutex> lk(routing_mutex_);
89  routing_cv_.wait_for(lk, condition_wait, [&]() { return routing_table_.count(seqID); });
90  if (routing_table_.count(seqID))
91  {
92  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
93  return routing_table_.at(seqID);
94  }
95  }
96  TLOG(TLVL_WARNING) << "Bad Omen: Timeout receiving routing information for " << seqID
97  << " in routing_timeout_ms (" << routing_timeout_ms_ << " ms)!";
98 
99  routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
100  }
101  return ROUTING_FAILED;
102 }
103 
104 void artdaq::TableReceiver::connectToRoutingManager_()
105 {
106  auto start_time = std::chrono::steady_clock::now();
107  while (table_socket_ < 0 && TimeUtils::GetElapsedTime(start_time) < 30)
108  {
109  table_socket_ = TCPConnect(table_address_.c_str(), table_port_);
110  if (table_socket_ < 0)
111  {
112  TLOG(TLVL_DEBUG + 33) << "Waited " << TimeUtils::GetElapsedTime(start_time) << " s for Routing Manager to open table listen socket";
113  usleep(100000);
114  }
115  }
116  if (table_socket_ < 0)
117  {
118  TLOG(TLVL_ERROR) << "Error creating socket for receiving table updates!";
119  exit(1);
120  }
121 
122  detail::RoutingRequest startHdr(my_rank);
123  write(table_socket_, &startHdr, sizeof(startHdr));
124 }
125 
126 void artdaq::TableReceiver::disconnectFromRoutingManager_()
127 {
128  detail::RoutingRequest endHdr(my_rank, detail::RoutingRequest::RequestMode::Disconnect);
129  write(table_socket_, &endHdr, sizeof(endHdr));
130  close(table_socket_);
131  table_socket_ = -1;
132 }
133 
134 void artdaq::TableReceiver::startTableReceiverThread_()
135 {
136  if (routing_thread_ != nullptr && routing_thread_->joinable())
137  {
138  routing_thread_->join();
139  }
140  TLOG(TLVL_INFO) << "Starting Routing Thread";
141  try
142  {
143  routing_thread_.reset(new boost::thread(&TableReceiver::receiveTableUpdatesLoop_, this));
144  char tname[16];
145  snprintf(tname, 16, "%s", "RoutingReceive"); // NOLINT
146  auto handle = routing_thread_->native_handle();
147  pthread_setname_np(handle, tname);
148  }
149  catch (const boost::exception& e)
150  {
151  TLOG(TLVL_ERROR) << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
152  std::cerr << "Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
153  exit(5);
154  }
155 }
156 
157 bool artdaq::TableReceiver::receiveTableUpdate_()
158 {
159  TLOG(TLVL_DEBUG + 33) << __func__ << ": Polling table socket for new routes (address:port = " << table_address_ << ":" << table_port_ << ")";
160  if (table_socket_ == -1)
161  {
162  TLOG(TLVL_DEBUG + 32) << __func__ << ": Opening table socket";
163  connectToRoutingManager_();
164  }
165  if (table_socket_ == -1)
166  {
167  TLOG(TLVL_DEBUG + 32) << __func__ << ": The table socket was not opened successfully.";
168  return false;
169  }
170 
171  struct pollfd fd;
172  fd.fd = table_socket_;
173  fd.events = POLLIN | POLLPRI;
174 
175  auto res = poll(&fd, 1, 1000);
176  if (res > 0)
177  {
178  if (fd.revents & (POLLIN | POLLPRI))
179  {
180  TLOG(TLVL_DEBUG + 32) << __func__ << ": Going to receive RoutingPacketHeader";
182  ssize_t stss = recv(table_socket_, &hdr, sizeof(hdr), MSG_WAITALL);
183  if (stss != sizeof(hdr))
184  {
185  TLOG(TLVL_ERROR) << "Error reading Table Header from Table socket, errno=" << errno << " (" << strerror(errno) << ")";
186  disconnectFromRoutingManager_();
187  return false;
188  }
189 
190  TLOG(TLVL_DEBUG + 32) << "receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.nEntries << " header=" << std::hex << hdr.header;
191  if (hdr.header != ROUTING_MAGIC)
192  {
193  TLOG(TLVL_DEBUG + 33) << __func__ << ": non-RoutingPacket received. No ROUTING_MAGIC.";
194  return false;
195  }
196  if (hdr.nEntries == 0)
197  {
198  TLOG(TLVL_DEBUG + 33) << __func__ << ": Empty Routing Table update received.";
199  return false;
200  }
201 
203  size_t sts = 0;
204  size_t total = sizeof(artdaq::detail::RoutingPacketEntry) * hdr.nEntries;
205  while (sts < total)
206  {
207  stss = read(table_socket_, reinterpret_cast<char*>(&buffer[0]) + sts, total - sts);
208  sts += stss;
209  TLOG(TLVL_DEBUG + 32) << "Read " << stss << " bytes, total " << sts << " / " << total;
210  if (stss < 0)
211  {
212  TLOG(TLVL_ERROR) << "Error reading Table Data from Table socket, errno=" << errno << " (" << strerror(errno) << ")";
213  disconnectFromRoutingManager_();
214  return false;
215  }
216  }
217 
218  auto first = buffer.front().sequence_id;
219  auto last = buffer.back().sequence_id;
220 
221  if (first + hdr.nEntries - 1 != last)
222  {
223  TLOG(TLVL_ERROR) << __func__ << ": Skipping this RoutingPacket because the first (" << first << ") and last (" << last << ") entries are inconsistent (sz=" << hdr.nEntries << ")!";
224  return false;
225  }
226 
227  auto thisSeqID = first;
228 
229  {
230  std::lock_guard<std::mutex> lck(routing_mutex_);
231  if (routing_table_.count(last) == 0)
232  {
233  for (auto entry : buffer)
234  {
235  if (thisSeqID != entry.sequence_id)
236  {
237  TLOG(TLVL_ERROR) << __func__ << ": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id << ", expected=" << thisSeqID << ")!";
238  last = thisSeqID - 1;
239  break;
240  }
241  thisSeqID++;
242  if (routing_table_.count(entry.sequence_id) != 0u)
243  {
244  if (routing_table_[entry.sequence_id] != entry.destination_rank)
245  {
246  TLOG(TLVL_ERROR) << __func__ << ": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
247  << " should go to rank " << entry.destination_rank << ", but I had already been told to send it to " << routing_table_[entry.sequence_id] << "!"
248  << " I will use the original value!";
249  }
250  continue;
251  }
252  if (entry.sequence_id < routing_table_last_)
253  {
254  continue;
255  }
256  routing_table_[entry.sequence_id] = entry.destination_rank;
257  TLOG(TLVL_DEBUG + 32) << __func__ << ": (my_rank=" << my_rank << ") received update: SeqID " << entry.sequence_id
258  << " -> Rank " << entry.destination_rank;
259  }
260  }
261 
262  TLOG(TLVL_DEBUG + 32) << __func__ << ": There are now " << routing_table_.size() << " entries in the Routing Table";
263  if (!routing_table_.empty())
264  {
265  TLOG(TLVL_DEBUG + 32) << __func__ << ": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
266  }
267 
268  auto counter = 0;
269  for (auto& entry : routing_table_)
270  {
271  TLOG(TLVL_DEBUG + 40) << "Routing Table Entry" << counter << ": " << entry.first << " -> " << entry.second;
272  counter++;
273  }
274  }
275  routing_cv_.notify_all();
276 
277  SendMetrics();
278  return true;
279  }
280  else
281  {
282  TLOG(TLVL_DEBUG + 32) << "Poll indicates socket closure. Disconnecting from Routing Manager";
283  disconnectFromRoutingManager_();
284  return false;
285  }
286  }
287  return false;
288 }
289 
290 void artdaq::TableReceiver::receiveTableUpdatesLoop_()
291 {
292  while (true)
293  {
294  if (should_stop_)
295  {
296  TLOG(TLVL_DEBUG + 32) << __func__ << ": should_stop is " << std::boolalpha << should_stop_ << ", stopping";
297  disconnectFromRoutingManager_();
298  return;
299  }
300 
301  receiveTableUpdate_();
302  }
303 }
304 
305 void artdaq::TableReceiver::sendTableUpdateRequest_(Fragment::sequence_id_t seq)
306 {
307  TLOG(TLVL_DEBUG + 33) << "sendTableUpdateRequest_ BEGIN";
308  {
309  std::lock_guard<std::mutex> lck(routing_mutex_);
310  if (routing_table_.count(seq))
311  {
312  TLOG(TLVL_DEBUG + 33) << "sendTableUpdateRequest_ END (no request sent): " << routing_table_.at(seq);
313  return;
314  }
315  }
316  if (table_socket_ == -1)
317  {
318  connectToRoutingManager_();
319  }
320 
321  TLOG(TLVL_DEBUG + 32) << "sendTableUpdateRequest_: Sending table update request for " << my_rank << ", sequence ID " << seq;
322  detail::RoutingRequest pkt(my_rank, seq);
323  write(table_socket_, &pkt, sizeof(pkt));
324 
325  TLOG(TLVL_DEBUG + 33) << "sendTableUpdateRequest_ END";
326 }
327 
329 {
330  std::lock_guard<std::mutex> lck(routing_mutex_);
331  return routing_table_.size();
332 }
333 
335 {
336  std::lock_guard<std::mutex> lck(routing_mutex_);
337  // Find the distance from the next highest sequence ID to the end of the list
338  size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
339  return dist; // If dist == 1, there is one entry left.
340 }
341 
342 void artdaq::TableReceiver::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
343 {
344  TLOG(TLVL_DEBUG + 35) << "RemoveRoutingTableEntry: Removing sequence ID " << seq << " from routing table.";
345  std::lock_guard<std::mutex> lck(routing_mutex_);
346  // while (routing_table_.size() > routing_table_max_size_)
347  // {
348  // routing_table_.erase(routing_table_.begin());
349  // }
350  if (routing_table_.find(seq) != routing_table_.end())
351  {
352  routing_table_.erase(routing_table_.find(seq));
353  }
354 }
355 
357 {
358  if (metricMan)
359  {
360  TLOG(TLVL_DEBUG + 34) << "sending metrics";
361  if (use_routing_manager_)
362  {
363  metricMan->sendMetric("Routing Table Size", GetRoutingTableEntryCount(), "events", 2, MetricMode::LastPoint);
364  if (routing_wait_time_ > 0)
365  {
366  metricMan->sendMetric("Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000, "s", 2, MetricMode::Average);
367  routing_wait_time_ = 0;
368  }
369  }
370  }
371 }
TableReceiver(const fhicl::ParameterSet &ps)
TableReceiver Constructor.
A row of the Routing Table.
int TCPConnect(char const *host_in, int dflt_port, int64_t flags=0, int sndbufsiz=0)
Connect to a host on a given port.
Definition: TCPConnect.cc:377
RoutingTable GetRoutingTable() const
Get a copy of the current RoutingTable.
virtual ~TableReceiver()
TableReceiver Destructor.
void SendMetrics() const
Report metrics to MetricManager.
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
The header of the Routing Table, containing the magic bytes and the number of entries.
int GetRoutingTableEntry(artdaq::Fragment::sequence_id_t seqID)
Get the destination rank for the given sequence ID.
uint64_t nEntries
The number of RoutingPacketEntries in the RoutingPacket.
RoutingTable GetAndClearRoutingTable()
Get the current RoutingTable and remove all entries.
uint32_t header
Magic bytes to make sure the packet wasn't garbled.
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
std::map< artdaq::Fragment::sequence_id_t, int > RoutingTable
Internal representation of a routing table, relating a sequence ID to a destination rank...