1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_TableReceiver").c_str()
3 #include "artdaq/DAQrate/detail/TableReceiver.hh"
6 #include <netinet/in.h>
8 #include <sys/socket.h>
12 #include "canvas/Utilities/Exception.h"
// Constructor member-initializer list and body (the signature line is not part of
// this excerpt). Configurable members are read from the parameter set `pset`; the
// second argument of each pset.get<>() call is the default used when the key is absent.
15 : use_routing_manager_(pset.get<bool>(
"use_routing_manager", false))
// Port and host name that are later handed to TCPConnect() to open the table socket.
17 , table_port_(pset.get<int>(
"table_update_port", 35556))
18 , table_address_(pset.get<std::string>(
"routing_manager_hostname",
"localhost"))
20 , routing_table_last_(0)
21 , routing_table_max_size_(pset.get<size_t>(
"routing_table_max_size", 1000))
// Running total of time spent waiting for routes; the waiting code adds microseconds to it.
22 , routing_wait_time_(0)
23 , routing_wait_time_count_(0)
// Per-lookup timeout in milliseconds; the lookup code substitutes 3600 * 1000 ms when this is 0.
24 , routing_timeout_ms_((pset.get<size_t>(
"routing_timeout_ms", 1000)))
25 , highest_sequence_id_routed_(0)
// Log the complete configuration that was received.
27 TLOG(TLVL_DEBUG + 32) <<
"Received pset: " << pset.to_string();
// The receiver thread is only started when a routing manager is configured.
29 if (use_routing_manager_)
31 startTableReceiverThread_();
// Shutdown sequence (presumably the destructor body; the signature line is not
// part of this excerpt).
37 TLOG(TLVL_DEBUG + 32) <<
"Shutting down TableReceiver BEGIN";
// Send the disconnect request and close the table socket before waiting on the thread.
39 disconnectFromRoutingManager_();
// Join the receiver thread only if one was created and it is still joinable.
41 if (routing_thread_ !=
nullptr)
45 if (routing_thread_->joinable())
47 routing_thread_->join();
54 TLOG(TLVL_DEBUG + 32) <<
"Shutting down TableReceiver END.";
// Hold routing_mutex_ for the remainder of the scope, then return routing_table_copy.
// NOTE(review): the declaration of routing_table_copy is not part of this excerpt;
// presumably it is a copy of routing_table_ made under this lock - confirm.
59 std::lock_guard<std::mutex> lk(routing_mutex_);
61 return routing_table_copy;
// Hold routing_mutex_, empty the live routing table, and return routing_table_copy.
// NOTE(review): the declaration of routing_table_copy is not part of this excerpt;
// presumably it is a copy taken before the clear() below - confirm.
66 std::lock_guard<std::mutex> lk(routing_mutex_);
68 routing_table_.clear();
69 return routing_table_copy;
// Looks up the routing table entry for seqID, waiting for it to arrive if necessary.
// Returns routing_table_.at(seqID) once the entry is present, or ROUTING_FAILED.
// (The signature line and the braces delimiting the scopes are not part of this excerpt.)
74 if (use_routing_manager_)
// Ask for a table update covering seqID (the callee sends nothing if the entry is already known).
76 sendTableUpdateRequest_(seqID);
// Work on a local copy of the timeout so the "0" substitution does not alter the configured value.
77 auto routing_timeout_ms = routing_timeout_ms_;
78 if (routing_timeout_ms == 0)
// A configured timeout of 0 is replaced by 3600 * 1000 ms.
80 routing_timeout_ms = 3600 * 1000;
// Each individual wait lasts at most 10 ms (or the whole timeout, if that is shorter),
// so the loop condition below is re-evaluated between waits.
82 auto condition_wait = routing_timeout_ms > 10 ? std::chrono::milliseconds(10) : std::chrono::milliseconds(routing_timeout_ms);
83 auto start_time = std::chrono::steady_clock::now();
// Keep waiting until a stop is requested or the overall timeout has elapsed.
84 while (!should_stop_ && TimeUtils::GetElapsedTimeMilliseconds(start_time) < routing_timeout_ms)
86 std::unique_lock<std::mutex> lk(routing_mutex_);
// The predicate ends the wait as soon as the table contains seqID.
87 routing_cv_.wait_for(lk, condition_wait, [&]() {
return routing_table_.count(seqID); });
88 if (routing_table_.count(seqID))
// Success: add the elapsed microseconds to the running wait total and return the stored value.
90 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
91 return routing_table_.at(seqID);
// No entry was obtained: warn, still account for the time spent, and report failure.
94 TLOG(TLVL_WARNING) <<
"Bad Omen: Timeout receiving routing information for " << seqID
95 <<
" in routing_timeout_ms (" << routing_timeout_ms_ <<
" ms)!";
97 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
99 return ROUTING_FAILED;
// Opens table_socket_ with TCPConnect(table_address_, table_port_), retrying while the
// socket is still negative, then writes a RoutingRequest built from my_rank to it.
// (Braces and any early returns are not part of this excerpt.)
102 void artdaq::TableReceiver::connectToRoutingManager_()
104 auto start_time = std::chrono::steady_clock::now();
// Retry until a socket is obtained or GetElapsedTime() reaches 30
// (the log message below reports that value in seconds).
105 while (table_socket_ < 0 && TimeUtils::GetElapsedTime(start_time) < 30)
107 table_socket_ =
TCPConnect(table_address_.c_str(), table_port_);
108 if (table_socket_ < 0)
110 TLOG(TLVL_DEBUG + 33) <<
"Waited " << TimeUtils::GetElapsedTime(start_time) <<
" s for Routing Manager to open table listen socket";
// Still no socket after the retry loop: report the failure.
114 if (table_socket_ < 0)
116 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
// Send the initial request carrying this process's rank.
// NOTE(review): the return value of write() is not checked.
120 detail::RoutingRequest startHdr(my_rank);
121 write(table_socket_, &startHdr,
sizeof(startHdr));
// Writes a RoutingRequest in Disconnect mode to table_socket_ and then closes the socket.
// NOTE(review): as shown here, write()/close() are issued without checking that
// table_socket_ is valid, and their return values are ignored.
124 void artdaq::TableReceiver::disconnectFromRoutingManager_()
126 detail::RoutingRequest endHdr(my_rank, detail::RoutingRequest::RequestMode::Disconnect);
127 write(table_socket_, &endHdr,
sizeof(endHdr));
128 close(table_socket_);
// (Re)starts the thread that runs receiveTableUpdatesLoop_() and names it "RoutingReceive".
// (The try keyword, the declaration of tname and the braces are not part of this excerpt.)
132 void artdaq::TableReceiver::startTableReceiverThread_()
// If a previous thread object exists and is joinable, wait for it before replacing it.
134 if (routing_thread_ !=
nullptr && routing_thread_->joinable())
136 routing_thread_->join();
138 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
// Create the new thread, bound to this object's receive loop.
141 routing_thread_.reset(
new boost::thread(&TableReceiver::receiveTableUpdatesLoop_,
this));
// Copy the thread name into tname (at most 16 bytes including the terminator)
// and apply it to the native thread handle.
143 snprintf(tname, 16,
"%s",
"RoutingReceive");
144 auto handle = routing_thread_->native_handle();
145 pthread_setname_np(handle, tname);
// A boost::exception is reported both through TLOG and on std::cerr.
147 catch (
const boost::exception& e)
149 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
150 std::cerr <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Polls the table socket once and, if data is available, reads one routing packet
// (header followed by entries) and merges its entries into routing_table_.
// Returns bool; the return statements, the declarations of fd, hdr, buffer, sts, total
// and counter, and the braces delimiting the scopes are not part of this excerpt.
155 bool artdaq::TableReceiver::receiveTableUpdate_()
157 TLOG(TLVL_DEBUG + 33) << __func__ <<
": Polling table socket for new routes (address:port = " << table_address_ <<
":" << table_port_ <<
")";
// Connect on demand when there is no open socket.
158 if (table_socket_ == -1)
160 TLOG(TLVL_DEBUG + 32) << __func__ <<
": Opening table socket";
161 connectToRoutingManager_();
// The connection attempt may have failed; check again.
163 if (table_socket_ == -1)
165 TLOG(TLVL_DEBUG + 32) << __func__ <<
": The table socket was not opened successfully.";
// Poll the single table socket for normal or priority data, with a timeout argument of 1000 (ms).
170 fd.fd = table_socket_;
171 fd.events = POLLIN | POLLPRI;
173 auto res = poll(&fd, 1, 1000);
// Data is ready to be read.
176 if (fd.revents & (POLLIN | POLLPRI))
178 TLOG(TLVL_DEBUG + 32) << __func__ <<
": Going to receive RoutingPacketHeader";
// MSG_WAITALL asks recv() to block until the full header has been received.
180 ssize_t stss = recv(table_socket_, &hdr,
sizeof(hdr), MSG_WAITALL);
// Anything other than a complete header is treated as an error and the connection is dropped.
181 if (stss !=
sizeof(hdr))
183 TLOG(TLVL_ERROR) <<
"Error reading Table Header from Table socket, errno=" << errno <<
" (" << strerror(errno) <<
")";
184 disconnectFromRoutingManager_();
188 TLOG(TLVL_DEBUG + 32) <<
"receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.
nEntries <<
" header=" << std::hex << hdr.
header;
// The header must carry the ROUTING_MAGIC marker to be accepted as a routing packet.
189 if (hdr.
header != ROUTING_MAGIC)
191 TLOG(TLVL_DEBUG + 33) << __func__ <<
": non-RoutingPacket received. No ROUTING_MAGIC.";
// NOTE(review): the condition guarding this message is not part of this excerpt.
196 TLOG(TLVL_DEBUG + 33) << __func__ <<
": Empty Routing Table update received.";
// Read the entry payload into buffer, continuing at byte offset sts until `total` bytes
// are present (the enclosing loop is not part of this excerpt).
205 stss = read(table_socket_, reinterpret_cast<char*>(&buffer[0]) + sts, total - sts);
207 TLOG(TLVL_DEBUG + 32) <<
"Read " << stss <<
" bytes, total " << sts <<
" / " << total;
// Payload read failure: log and drop the connection.
210 TLOG(TLVL_ERROR) <<
"Error reading Table Data from Table socket, errno=" << errno <<
" (" << strerror(errno) <<
")";
211 disconnectFromRoutingManager_();
// Sanity check: the packet must cover a contiguous run of sequence IDs, i.e. the last
// ID must equal first + nEntries - 1.
216 auto first = buffer.front().sequence_id;
217 auto last = buffer.back().sequence_id;
219 if (first + hdr.
nEntries - 1 != last)
221 TLOG(TLVL_ERROR) << __func__ <<
": Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
// thisSeqID tracks the sequence ID expected for the entry being processed.
225 auto thisSeqID = first;
// The table is modified under routing_mutex_.
228 std::lock_guard<std::mutex> lck(routing_mutex_);
// Entries are only processed if the packet's final sequence ID is not yet in the table.
229 if (routing_table_.count(last) == 0)
231 for (
auto entry : buffer)
// An out-of-sequence entry aborts processing; `last` is pulled back to the final good ID.
233 if (thisSeqID != entry.sequence_id)
235 TLOG(TLVL_ERROR) << __func__ <<
": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
236 last = thisSeqID - 1;
// The sequence ID already has an entry: report a conflict if the destination differs
// (the message states that the original value is kept).
240 if (routing_table_.count(entry.sequence_id) != 0u)
242 if (routing_table_[entry.sequence_id] != entry.destination_rank)
244 TLOG(TLVL_ERROR) << __func__ <<
": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
245 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
246 <<
" I will use the original value!";
// NOTE(review): the statement guarded by this comparison is not part of this excerpt.
250 if (entry.sequence_id < routing_table_last_)
// Record the destination rank for this sequence ID.
254 routing_table_[entry.sequence_id] = entry.destination_rank;
255 TLOG(TLVL_DEBUG + 32) << __func__ <<
": (my_rank=" << my_rank <<
") received update: SeqID " << entry.sequence_id
256 <<
" -> Rank " << entry.destination_rank;
260 TLOG(TLVL_DEBUG + 32) << __func__ <<
": There are now " << routing_table_.size() <<
" entries in the Routing Table";
261 if (!routing_table_.empty())
263 TLOG(TLVL_DEBUG + 32) << __func__ <<
": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
// Dump every table entry at a higher debug level.
267 for (
auto& entry : routing_table_)
269 TLOG(TLVL_DEBUG + 40) <<
"Routing Table Entry" << counter <<
": " << entry.first <<
" -> " << entry.second;
// Wake every thread waiting on routing_cv_ so it can re-check the table.
273 routing_cv_.notify_all();
// NOTE(review): the condition selecting this branch is not part of this excerpt; per the
// message it handles a poll result that indicates the socket was closed.
280 TLOG(TLVL_DEBUG + 32) <<
"Poll indicates socket closure. Disconnecting from Routing Manager";
281 disconnectFromRoutingManager_();
// Body of the receiver thread: calls receiveTableUpdate_() and, once should_stop_ is
// observed, logs, disconnects from the Routing Manager and stops.
// (The loop construct, the should_stop_ test and the braces are not part of this excerpt.)
288 void artdaq::TableReceiver::receiveTableUpdatesLoop_()
294 TLOG(TLVL_DEBUG + 32) << __func__ <<
": should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
295 disconnectFromRoutingManager_();
// Process at most one table update per call.
299 receiveTableUpdate_();
// Sends a RoutingRequest for sequence ID `seq` to the Routing Manager unless the routing
// table already has an entry for it.
// (Braces and any early return are not part of this excerpt.)
303 void artdaq::TableReceiver::sendTableUpdateRequest_(Fragment::sequence_id_t seq)
305 TLOG(TLVL_DEBUG + 33) <<
"sendTableUpdateRequest_ BEGIN";
// Check under routing_mutex_ whether the entry is already known; if so nothing is sent
// (per the log message below).
307 std::lock_guard<std::mutex> lck(routing_mutex_);
308 if (routing_table_.count(seq))
310 TLOG(TLVL_DEBUG + 33) <<
"sendTableUpdateRequest_ END (no request sent): " << routing_table_.at(seq);
// Connect on demand when there is no open socket.
314 if (table_socket_ == -1)
316 connectToRoutingManager_();
319 TLOG(TLVL_DEBUG + 32) <<
"sendTableUpdateRequest_: Sending table update request for " << my_rank <<
", sequence ID " << seq;
// The request carries this process's rank and the wanted sequence ID.
// NOTE(review): the return value of write() is not checked.
320 detail::RoutingRequest pkt(my_rank, seq);
321 write(table_socket_, &pkt,
sizeof(pkt));
323 TLOG(TLVL_DEBUG + 33) <<
"sendTableUpdateRequest_ END";
// Return the number of entries currently in routing_table_, read under routing_mutex_.
328 std::lock_guard<std::mutex> lck(routing_mutex_);
329 return routing_table_.size();
// Under routing_mutex_, count the table entries whose sequence ID is strictly greater
// than highest_sequence_id_routed_ (upper_bound() yields the first such entry).
// NOTE(review): the statement returning dist is not part of this excerpt.
334 std::lock_guard<std::mutex> lck(routing_mutex_);
336 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
// Removes the entry for sequence ID `seq` from routing_table_, if one exists.
// (The signature line and the statements on the omitted lines are not part of this excerpt.)
342 TLOG(TLVL_DEBUG + 35) <<
"RemoveRoutingTableEntry: Removing sequence ID " << seq <<
" from routing table.";
343 std::lock_guard<std::mutex> lck(routing_mutex_);
// Erase only when the key is present; a missing key is silently ignored.
348 if (routing_table_.find(seq) != routing_table_.end())
350 routing_table_.erase(routing_table_.find(seq));
// Reports routing metrics through metricMan.
// (The signature line and the braces delimiting the scopes are not part of this excerpt.)
358 TLOG(TLVL_DEBUG + 34) <<
"sending metrics";
359 if (use_routing_manager_)
// Current number of routing table entries, reported in LastPoint mode.
361 metricMan->sendMetric(
"Routing Table Size", GetRoutingTableEntryCount(),
"events", 2, MetricMode::LastPoint);
// The wait-time metric is only sent when some wait time has accumulated.
362 if (routing_wait_time_ > 0)
// Dividing the accumulated value by 1000000 gives the figure reported with unit "s".
364 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2, MetricMode::Average);
// Restart the accumulation for the next reporting interval.
365 routing_wait_time_ = 0;
TableReceiver(const fhicl::ParameterSet &ps)
TableReceiver Constructor.
A row of the Routing Table.
int TCPConnect(char const *host_in, int dflt_port, int64_t flags=0, int sndbufsiz=0)
Connect to a host on a given port.
RoutingTable GetRoutingTable() const
Get a copy of the current RoutingTable.
virtual ~TableReceiver()
TableReceiver Destructor.
void SendMetrics() const
Report metrics to MetricManager.
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
int GetRoutingTableEntry(artdaq::Fragment::sequence_id_t seqID)
Get the destination rank for the given sequence ID.
RoutingTable GetAndClearRoutingTable()
Get the current RoutingTable and remove all entries.
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to use this information.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network transmission...
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this information.
std::map< artdaq::Fragment::sequence_id_t, int > RoutingTable
Internal representation of a routing table, relating a sequence ID to a destination rank...