1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_TableReceiver").c_str()
3 #include "artdaq/DAQrate/detail/TableReceiver.hh"
6 #include "artdaq/DAQrate/detail/RoutingPacket.hh"
7 #include "canvas/Utilities/Exception.h"
10 #include <netinet/in.h>
12 #include <sys/socket.h>
13 #include <sys/types.h>
// TableReceiver constructor (its signature line is not visible in this listing).
// Each pset.get<T>(name, default) below reads one setting from the ParameterSet
// `pset`, with the second argument supplied as the default.
17 : use_routing_manager_(pset.get<bool>(
"use_routing_manager", false))
// Port and host name later handed to TCPConnect() when the table socket is opened.
19 , table_port_(pset.get<int>(
"table_update_port", 35556))
20 , table_address_(pset.get<std::string>(
"routing_manager_hostname",
"localhost"))
22 , routing_table_last_(0)
23 , routing_table_max_size_(pset.get<size_t>(
"routing_table_max_size", 1000))
// Accumulated wait time; GetElapsedTimeMicroseconds() results are added to it elsewhere in this file.
24 , routing_wait_time_(0)
25 , routing_wait_time_count_(0)
// In milliseconds; where the timeout is applied, a value of 0 is replaced by one hour.
26 , routing_timeout_ms_((pset.get<size_t>(
"routing_timeout_ms", 1000)))
27 , highest_sequence_id_routed_(0)
29 TLOG(TLVL_DEBUG + 32) <<
"Received pset: " << pset.to_string();
// The table receiver thread is started only when a Routing Manager is in use.
31 if (use_routing_manager_)
33 startTableReceiverThread_();
// TableReceiver destructor body (its signature line is not visible in this listing).
39 TLOG(TLVL_DEBUG + 32) <<
"Shutting down TableReceiver BEGIN";
// Send the Disconnect request and close the table socket.
41 disconnectFromRoutingManager_();
// If a receive thread object exists and is joinable, wait for it to finish.
43 if (routing_thread_ !=
nullptr)
47 if (routing_thread_->joinable())
49 routing_thread_->join();
56 TLOG(TLVL_DEBUG + 32) <<
"Shutting down TableReceiver END.";
// GetRoutingTable body (signature line not visible in this listing).
// routing_mutex_ is held until the function returns.
61 std::lock_guard<std::mutex> lk(routing_mutex_);
// NOTE(review): the statement that creates `routing_table_copy` is not visible in this listing.
63 return routing_table_copy;
// GetAndClearRoutingTable body (signature line not visible in this listing).
// routing_mutex_ is held while the table is emptied and the copy is returned.
68 std::lock_guard<std::mutex> lk(routing_mutex_);
// NOTE(review): the statement that creates `routing_table_copy` is not visible in this listing;
// presumably it is taken before the clear() below — confirm.
70 routing_table_.clear();
71 return routing_table_copy;
// GetRoutingTableEntry body (signature line not visible in this listing).
// With a Routing Manager in use: request a route for seqID, then wait for seqID to
// appear in routing_table_. Returns the stored value for seqID, or ROUTING_FAILED
// if the wait loop ends without one.
76 if (use_routing_manager_)
78 sendTableUpdateRequest_(seqID);
79 auto routing_timeout_ms = routing_timeout_ms_;
// A configured timeout of 0 is replaced by one hour (3600 * 1000 ms).
80 if (routing_timeout_ms == 0)
82 routing_timeout_ms = 3600 * 1000;
// Each condition-variable wait lasts at most 10 ms, so the loop condition
// (should_stop_ and the elapsed time) is re-evaluated between waits.
84 auto condition_wait = routing_timeout_ms > 10 ? std::chrono::milliseconds(10) : std::chrono::milliseconds(routing_timeout_ms);
85 auto start_time = std::chrono::steady_clock::now();
86 while (!should_stop_ && TimeUtils::GetElapsedTimeMilliseconds(start_time) < routing_timeout_ms)
88 std::unique_lock<std::mutex> lk(routing_mutex_);
// The wait ends early once the predicate finds seqID in the table;
// receiveTableUpdate_() calls routing_cv_.notify_all() after it updates the table.
89 routing_cv_.wait_for(lk, condition_wait, [&]() {
return routing_table_.count(seqID); });
90 if (routing_table_.count(seqID))
// Add the time spent waiting (in microseconds) to the running total used for metrics.
92 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
93 return routing_table_.at(seqID);
// No entry for seqID was found by the loop above. Note that the message reports
// the configured routing_timeout_ms_, not the one-hour substitute used when it is 0.
96 TLOG(TLVL_WARNING) <<
"Bad Omen: Timeout receiving routing information for " << seqID
97 <<
" in routing_timeout_ms (" << routing_timeout_ms_ <<
" ms)!";
99 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start_time));
101 return ROUTING_FAILED;
104 void artdaq::TableReceiver::connectToRoutingManager_()
106 auto start_time = std::chrono::steady_clock::now();
107 while (table_socket_ < 0 && TimeUtils::GetElapsedTime(start_time) < 30)
109 table_socket_ =
TCPConnect(table_address_.c_str(), table_port_);
110 if (table_socket_ < 0)
112 TLOG(TLVL_DEBUG + 33) <<
"Waited " << TimeUtils::GetElapsedTime(start_time) <<
" s for Routing Manager to open table listen socket";
116 if (table_socket_ < 0)
118 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
122 detail::RoutingRequest startHdr(my_rank);
123 write(table_socket_, &startHdr,
sizeof(startHdr));
126 void artdaq::TableReceiver::disconnectFromRoutingManager_()
128 detail::RoutingRequest endHdr(my_rank, detail::RoutingRequest::RequestMode::Disconnect);
129 write(table_socket_, &endHdr,
sizeof(endHdr));
130 close(table_socket_);
// Start the thread that runs receiveTableUpdatesLoop_() and give it the name "RoutingReceive".
134 void artdaq::TableReceiver::startTableReceiverThread_()
// A previously started thread that is still joinable is joined before it is replaced.
136 if (routing_thread_ !=
nullptr && routing_thread_->joinable())
138 routing_thread_->join();
140 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
// NOTE(review): the `try` that pairs with the catch below is not visible in this listing.
143 routing_thread_.reset(
new boost::thread(&TableReceiver::receiveTableUpdatesLoop_,
this));
// NOTE(review): the declaration of `tname` is not visible in this listing; the
// size argument below implies a buffer of at least 16 bytes.
145 snprintf(tname, 16,
"%s",
"RoutingReceive");
146 auto handle = routing_thread_->native_handle();
147 pthread_setname_np(handle, tname);
149 catch (
const boost::exception& e)
// The same message is written to both the trace log and stderr.
151 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
152 std::cerr <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Poll the table socket once and, if a routing packet is available, read it and
// merge its entries into routing_table_.
// NOTE(review): several statements of this function (declarations, returns and
// loop/branch headers) are not visible in this listing, so the value returned on
// each path cannot be stated from here.
157 bool artdaq::TableReceiver::receiveTableUpdate_()
159 TLOG(TLVL_DEBUG + 33) << __func__ <<
": Polling table socket for new routes (address:port = " << table_address_ <<
":" << table_port_ <<
")";
// Connect on demand: -1 means no table socket is currently open.
160 if (table_socket_ == -1)
162 TLOG(TLVL_DEBUG + 32) << __func__ <<
": Opening table socket";
163 connectToRoutingManager_();
// Still -1 after the attempt above: there is nothing to poll.
165 if (table_socket_ == -1)
167 TLOG(TLVL_DEBUG + 32) << __func__ <<
": The table socket was not opened successfully.";
// `fd` is the single descriptor record handed to poll(); its declaration is not visible in this listing.
172 fd.fd = table_socket_;
173 fd.events = POLLIN | POLLPRI;
// Wait up to 1000 ms for the table socket to become readable.
175 auto res = poll(&fd, 1, 1000);
178 if (fd.revents & (POLLIN | POLLPRI))
180 TLOG(TLVL_DEBUG + 32) << __func__ <<
": Going to receive RoutingPacketHeader";
// MSG_WAITALL requests the complete header in one call; any result other than
// sizeof(hdr) is treated as an error and the connection is dropped.
182 ssize_t stss = recv(table_socket_, &hdr,
sizeof(hdr), MSG_WAITALL);
183 if (stss !=
sizeof(hdr))
185 TLOG(TLVL_ERROR) <<
"Error reading Table Header from Table socket, errno=" << errno <<
" (" << strerror(errno) <<
")";
186 disconnectFromRoutingManager_();
190 TLOG(TLVL_DEBUG + 32) <<
"receiveTableUpdatesLoop_: Checking for valid header with nEntries=" << hdr.
nEntries <<
" header=" << std::hex << hdr.
header;
// The header word must equal ROUTING_MAGIC for the data to be treated as a routing packet.
191 if (hdr.
header != ROUTING_MAGIC)
193 TLOG(TLVL_DEBUG + 33) << __func__ <<
": non-RoutingPacket received. No ROUTING_MAGIC.";
198 TLOG(TLVL_DEBUG + 33) << __func__ <<
": Empty Routing Table update received.";
// Read table data into `buffer` at byte offset `sts`, requesting the `total - sts`
// bytes still outstanding (the declarations of buffer, sts and total are not visible here).
207 stss = read(table_socket_, reinterpret_cast<char*>(&buffer[0]) + sts, total - sts);
209 TLOG(TLVL_DEBUG + 32) <<
"Read " << stss <<
" bytes, total " << sts <<
" / " << total;
212 TLOG(TLVL_ERROR) <<
"Error reading Table Data from Table socket, errno=" << errno <<
" (" << strerror(errno) <<
")";
213 disconnectFromRoutingManager_();
218 auto first = buffer.front().sequence_id;
219 auto last = buffer.back().sequence_id;
// Consistency check: with nEntries consecutive sequence IDs, first + nEntries - 1 equals last.
221 if (first + hdr.
nEntries - 1 != last)
223 TLOG(TLVL_ERROR) << __func__ <<
": Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
// Sequence ID expected for the next entry while walking the packet.
227 auto thisSeqID = first;
// routing_table_ is modified under routing_mutex_, the same mutex used by the readers in this file.
230 std::lock_guard<std::mutex> lck(routing_mutex_);
// The entries are walked only when the packet's last sequence ID is not already in the table.
231 if (routing_table_.count(last) == 0)
233 for (
auto entry : buffer)
// An entry whose sequence ID differs from the expected one aborts processing of this packet.
235 if (thisSeqID != entry.sequence_id)
237 TLOG(TLVL_ERROR) << __func__ <<
": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
// `last` is pulled back to the final entry that matched the expected sequence.
238 last = thisSeqID - 1;
// Sequence ID already present: a different destination rank is reported as corruption;
// per the log message the value already in the table is kept.
242 if (routing_table_.count(entry.sequence_id) != 0u)
244 if (routing_table_[entry.sequence_id] != entry.destination_rank)
246 TLOG(TLVL_ERROR) << __func__ <<
": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
247 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
248 <<
" I will use the original value!";
// NOTE(review): the statement controlled by this comparison is not visible in this listing.
252 if (entry.sequence_id < routing_table_last_)
256 routing_table_[entry.sequence_id] = entry.destination_rank;
257 TLOG(TLVL_DEBUG + 32) << __func__ <<
": (my_rank=" << my_rank <<
") received update: SeqID " << entry.sequence_id
258 <<
" -> Rank " << entry.destination_rank;
262 TLOG(TLVL_DEBUG + 32) << __func__ <<
": There are now " << routing_table_.size() <<
" entries in the Routing Table";
263 if (!routing_table_.empty())
265 TLOG(TLVL_DEBUG + 32) << __func__ <<
": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
// Dump of the whole table at trace level TLVL_DEBUG + 40.
269 for (
auto& entry : routing_table_)
271 TLOG(TLVL_DEBUG + 40) <<
"Routing Table Entry" << counter <<
": " << entry.first <<
" -> " << entry.second;
// Wake any thread blocked in routing_cv_.wait_for() waiting for a sequence ID to appear.
275 routing_cv_.notify_all();
282 TLOG(TLVL_DEBUG + 32) <<
"Poll indicates socket closure. Disconnecting from Routing Manager";
283 disconnectFromRoutingManager_();
// Entry point of the receive thread created in startTableReceiverThread_().
// NOTE(review): the loop and branch headers of this function are not visible in
// this listing; from the log text, the first part runs when should_stop_ is set.
290 void artdaq::TableReceiver::receiveTableUpdatesLoop_()
296 TLOG(TLVL_DEBUG + 32) << __func__ <<
": should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
// Send the Disconnect request and close the table socket on the way out.
297 disconnectFromRoutingManager_();
// Poll for, and process, one table update.
301 receiveTableUpdate_();
305 void artdaq::TableReceiver::sendTableUpdateRequest_(Fragment::sequence_id_t seq)
307 TLOG(TLVL_DEBUG + 33) <<
"sendTableUpdateRequest_ BEGIN";
309 std::lock_guard<std::mutex> lck(routing_mutex_);
310 if (routing_table_.count(seq))
312 TLOG(TLVL_DEBUG + 33) <<
"sendTableUpdateRequest_ END (no request sent): " << routing_table_.at(seq);
316 if (table_socket_ == -1)
318 connectToRoutingManager_();
321 TLOG(TLVL_DEBUG + 32) <<
"sendTableUpdateRequest_: Sending table update request for " << my_rank <<
", sequence ID " << seq;
322 detail::RoutingRequest pkt(my_rank, seq);
323 write(table_socket_, &pkt,
sizeof(pkt));
325 TLOG(TLVL_DEBUG + 33) <<
"sendTableUpdateRequest_ END";
// GetRoutingTableEntryCount body (signature line not visible in this listing).
// Returns the number of entries in routing_table_, read while holding routing_mutex_.
330 std::lock_guard<std::mutex> lck(routing_mutex_);
331 return routing_table_.size();
// GetRemainingRoutingTableEntries body (signature and return lines not visible in this listing).
336 std::lock_guard<std::mutex> lck(routing_mutex_);
// upper_bound() yields the first entry whose sequence ID is greater than
// highest_sequence_id_routed_, so `dist` counts the entries above that ID.
338 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
// RemoveRoutingTableEntry body (signature line not visible in this listing).
// Erases the entry for `seq` from routing_table_ if one exists.
344 TLOG(TLVL_DEBUG + 35) <<
"RemoveRoutingTableEntry: Removing sequence ID " << seq <<
" from routing table.";
345 std::lock_guard<std::mutex> lck(routing_mutex_);
// The lookup is done twice: once to test for presence, once to obtain the iterator to erase.
350 if (routing_table_.find(seq) != routing_table_.end())
352 routing_table_.erase(routing_table_.find(seq));
// SendMetrics body (signature line not visible in this listing).
// Metrics are sent only when a Routing Manager is in use.
360 TLOG(TLVL_DEBUG + 34) <<
"sending metrics";
361 if (use_routing_manager_)
363 metricMan->sendMetric(
"Routing Table Size", GetRoutingTableEntryCount(),
"events", 2, MetricMode::LastPoint);
// The wait time is reported only when some has accumulated since the last reset.
364 if (routing_wait_time_ > 0)
// The accumulated value is in microseconds; dividing by 1000000 reports it in seconds.
366 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2, MetricMode::Average);
// NOTE(review): the load above and this reset are two separate operations, so wait
// time added by another thread between them is discarded; a single exchange(0) would avoid that.
367 routing_wait_time_ = 0;
TableReceiver(const fhicl::ParameterSet &ps)
TableReceiver Constructor.
A row of the Routing Table.
int TCPConnect(char const *host_in, int dflt_port, int64_t flags=0, int sndbufsiz=0)
Connect to a host on a given port.
RoutingTable GetRoutingTable() const
Get a copy of the current RoutingTable.
virtual ~TableReceiver()
TableReceiver Destructor.
void SendMetrics() const
Report metrics to MetricManager.
void RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
Remove the given sequence ID from the routing table and sent_count lists.
int GetRoutingTableEntry(artdaq::Fragment::sequence_id_t seqID)
Get the destination rank for the given sequence ID.
RoutingTable GetAndClearRoutingTable()
Get the current RoutingTable and remove all entries.
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to use this information.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network transmission.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this information.
std::map< artdaq::Fragment::sequence_id_t, int > RoutingTable
Internal representation of a routing table, relating a sequence ID to a destination rank...