1 #include "artdaq/DAQrate/DataSenderManager.hh"
2 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
3 #include "artdaq/DAQdata/Globals.hh"
6 #include <canvas/Utilities/Exception.h>
8 #include <netinet/in.h>
11 #include <sys/socket.h>
12 #include "artdaq/Application/Routing/RoutingPacket.hh"
// DataSenderManager constructor (the signature and some initializers are not visible in this listing).
// Reads its settings from the FHiCL ParameterSet `pset`, registers one transfer plugin per named
// entry of the destinations table, fills enabled_destinations_, and starts the routing table
// receiver thread when use_routing_master is set.
17 , enabled_destinations_()
// broadcast_sends and nonblocking_sends both default to false when absent from pset.
19 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
20 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
// The routing mode starts as INVALID; receiveTableUpdatesLoop_ assigns it from a received header.
21 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
26 TLOG_DEBUG(
"DataSenderManager") <<
"Received pset: " << pset.to_string() << TLOG_ENDL;
// Routing settings live in the nested routing_table_config table; an empty table yields the defaults below.
27 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
28 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
// Port and address used by setupTableListener_ (bind port and multicast group to join).
29 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
30 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
// Port and host resolved into ack_addr_ by receiveTableUpdatesLoop_.
31 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
32 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
// Upper bound, in milliseconds, on how long calcDest_ polls for a routing entry.
33 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
// Each named sub-table of destinations describes one transfer plugin.
36 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
37 for (
auto& d : dests.get_pset_names())
// `transfer` is created on a line not shown in this listing; it is stored keyed by its destination rank.
42 auto destination_rank = transfer->destination_rank();
43 destinations_.emplace( destination_rank, std::move(transfer));
// NOTE(review): both typed handlers below catch by value; catching by const reference avoids
// a copy and any slicing of derived exception types. Confirm and change.
45 catch (std::invalid_argument)
47 TRACE(3,
"Invalid destination specification: " + d);
49 catch (cet::exception ex)
51 TLOG_WARNING(
"DataSenderManager") <<
"Caught cet::exception: " << ex.what() << TLOG_ENDL;
// Presumably inside a catch-all handler (the handler line is not visible) - TODO confirm.
55 TLOG_WARNING(
"DataSenderManager") <<
"Non-cet exception while setting up TransferPlugin: " << d <<
"." << TLOG_ENDL;
// An empty destination map is reported as an error; whatever follows the log is not visible here.
58 if (destinations_.size() == 0)
60 TLOG_ERROR(
"DataSenderManager") <<
"No destinations specified!" << TLOG_ENDL;
// With no enabled_destinations list, every configured destination rank is enabled.
64 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
65 if (enabled_dests.size() == 0)
67 TLOG_INFO(
"DataSenderManager") <<
"enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
68 for (
auto& d : destinations_)
70 enabled_destinations_.insert(d.first);
// Presumably the else branch (not visible): enable exactly the ranks listed in the configuration.
// NOTE(review): the listed ranks are not checked against destinations_ here; the send paths
// check both containers before using a rank.
75 for (
auto& d : enabled_dests)
77 enabled_destinations_.insert(d);
// The receiver thread is only started when a routing master is configured.
81 if (use_routing_master_) startTableReceiverThread_();
// Destructor body (the signature line is not visible in this listing).
// Sends an end-of-data fragment to every enabled destination that has a transfer plugin,
// then waits for the routing thread to finish.
86 TLOG_DEBUG(
"DataSenderManager") <<
"Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
88 for (
auto& dest : enabled_destinations_)
// Enabled ranks without a registered plugin are skipped.
90 if (destinations_.count(dest))
// The end-of-data fragment is built from the per-destination sent count.
92 destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
// NOTE(review): this join only returns once receiveTableUpdatesLoop_ exits. The statement that
// requests the stop is not visible in this listing; confirm should_stop_ is set before this point.
96 if (routing_thread_.joinable()) routing_thread_.join();
97 TLOG_DEBUG(
"DataSenderManager") <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments." << TLOG_ENDL;
// Opens the UDP socket on which routing table updates are received:
// create the socket, enable address reuse, bind to table_port_ on any local interface,
// then join the multicast group named by table_address_.
// Each failure is logged; the conditions guarding some of the logs and whatever follows
// them are on lines not visible in this listing.
101 void artdaq::DataSenderManager::setupTableListener_()
103 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
106 TLOG_ERROR(
"DataSenderManager") <<
"Error creating socket for receiving table updates!" << TLOG_ENDL;
110 struct sockaddr_in si_me_request;
// `yes` is declared on a line not shown here.
// NOTE(review): the two messages below say request socket although this is the table update socket.
113 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
115 TLOG_ERROR(
"DataSenderManager") <<
" Unable to enable port reuse on request socket" << TLOG_ENDL;
// Zero the address structure, then listen on table_port_ on every local interface.
118 memset(&si_me_request, 0,
sizeof(si_me_request));
119 si_me_request.sin_family = AF_INET;
120 si_me_request.sin_port = htons(table_port_);
121 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
// NOTE(review): C-style pointer cast; reinterpret_cast would make the intent explicit.
122 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
124 TLOG_ERROR(
"DataSenderManager") <<
"Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
// `mreq` is declared on a line not shown here; its group address comes from table_address_.
129 int sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
132 TLOG_ERROR(
"DataSenderManager") <<
"Unable to resolve multicast address for table updates" << TLOG_ENDL;
// Join the group on the default interface.
135 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
136 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
138 TLOG_ERROR(
"DataSenderManager") <<
"Unable to join multicast group" << TLOG_ENDL;
142 void artdaq::DataSenderManager::startTableReceiverThread_()
144 if (routing_thread_.joinable()) routing_thread_.join();
145 TLOG_INFO(
"DataSenderManager") <<
"Starting Routing Thread" << TLOG_ENDL;
146 routing_thread_ = std::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
// Body of the routing thread: polls the table socket for routing packets, validates each
// packet, merges its entries into routing_table_, and logs the acknowledgement it sends.
// Loop control, the receive calls and the acknowledgement send are on lines not visible in this listing.
148 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
// Logged on the stop path; presumably guarded by a check of should_stop_ (not visible) - TODO confirm.
154 TLOG_DEBUG(
"DataSenderManager") <<
"receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ <<
", stopping" << TLOG_ENDL;
158 TRACE(4,
"DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
// The table socket is opened on first use; -1 marks it as not open.
159 if (table_socket_ == -1)
161 TLOG_DEBUG(
"DataSenderManager") <<
"Opening table listener socket" << TLOG_ENDL;
162 setupTableListener_();
// Still -1 after setup means the listener could not be opened.
164 if (table_socket_ == -1)
166 TLOG_DEBUG(
"DataSenderManager") <<
"The listen socket was not opened successfully." << TLOG_ENDL;
// The acknowledgement socket is also opened on first use, and ack_addr_ is resolved
// from ack_address_ and ack_port_.
169 if (ack_socket_ == -1)
171 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
172 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
// NOTE(review): the message names routing_master_address, but the value is read from the
// configuration key routing_master_hostname.
175 TLOG_ERROR(
"DataSenderManager") <<
"Unable to resolve routing_master_address" << TLOG_ENDL;
178 TLOG_DEBUG(
"DataSenderManager") <<
"Ack socket is fd " << ack_socket_ << TLOG_ENDL;
// Wait up to 1000 ms for readable data on the table socket.
182 fd.fd = table_socket_;
183 fd.events = POLLIN | POLLPRI;
185 auto res = poll(&fd, 1, 1000);
// first and last track the sequence ID range that gets acknowledged.
187 auto first = artdaq::Fragment::InvalidSequenceID;
188 auto last = artdaq::Fragment::InvalidSequenceID;
191 TLOG_DEBUG(
"DataSenderManager") <<
"Going to receive RoutingPacketHeader" << TLOG_ENDL;
// `stss` is assigned on a line not shown here.
// NOTE(review): the logged text opens a parenthesis that is never closed.
193 TLOG_DEBUG(
"DataSenderManager") <<
"Received " << std::to_string(stss) <<
" bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(
sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
195 TLOG_DEBUG(
"DataSenderManager") <<
"Checking for valid header" << TLOG_ENDL;
// Only packets whose header field matches ROUTING_MAGIC are processed.
196 if (hdr.
header == ROUTING_MAGIC) {
// A mode change after the first accepted table is reported as an error.
197 if(routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
199 TLOG_ERROR(
"DataSenderManager") <<
"Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
202 routing_master_mode_ = hdr.mode;
205 TLOG_DEBUG(
"DataSenderManager") <<
"Receiving data buffer" << TLOG_ENDL;
208 TRACE(6,
"Received a packet of %zu bytes", sts);
// NOTE(review): indexing buffer[0] and buffer[size - 1] assumes a non-empty buffer;
// no emptiness check is visible in this listing.
210 first = buffer[0].sequence_id;
211 last = buffer[buffer.size() - 1].sequence_id;
// The entries must cover a contiguous range: first + nEntries - 1 has to equal last.
213 if (first + hdr.
nEntries - 1 != last)
215 TLOG_ERROR(
"DataSenderManager") <<
"Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!" << TLOG_ENDL;
// thisSeqID is the sequence ID expected for the next entry.
218 auto thisSeqID = first;
// Entries are merged only when the table has no entry yet for the last sequence ID of this packet.
220 if (routing_table_.count(last) == 0) {
221 for (
auto entry : buffer)
// On an out-of-order entry, last is pulled back to the final consistent sequence ID.
223 if (thisSeqID != entry.sequence_id)
225 TLOG_ERROR(
"DataSenderManager") <<
"Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!" << TLOG_ENDL;
226 last = thisSeqID - 1;
// An existing entry with a different destination is reported as corruption.
// NOTE(review): the log text contains the typo Recevied (runtime string, left untouched).
230 if (routing_table_.count(entry.sequence_id))
232 if (routing_table_[entry.sequence_id] != entry.destination_rank)
234 TLOG_ERROR(
"DataSenderManager") <<
"Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
235 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
236 <<
" I will use the original value!" << TLOG_ENDL;
// NOTE(review): the message above promises to keep the original value, but this assignment
// overwrites it unless a skip statement sits on a line not visible here - confirm.
240 routing_table_[entry.sequence_id] = entry.destination_rank;
241 TLOG_DEBUG(
"DataSenderManager") <<
"DataSenderManager " << std::to_string(my_rank) <<
": received update: SeqID " << std::to_string(entry.sequence_id) <<
" -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
// The acknowledgement reports the first and last sequence IDs that were accepted.
250 TLOG_DEBUG(
"DataSenderManager") <<
"Sending RoutingAckPacket with first= " << std::to_string(first) <<
" and last= " << std::to_string(last) <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")"<< TLOG_ENDL;
251 TLOG_DEBUG(
"DataSenderManager") <<
"There are now " << routing_table_.size() <<
" entries in the Routing Table" << TLOG_ENDL;
// Body of the routing table size accessor (the signature line is not visible in this listing).
// Takes routing_mutex_ for the duration of the read and returns the number of table entries.
260 std::unique_lock<std::mutex> lck(routing_mutex_);
261 return routing_table_.size();
// Chooses the destination rank for a fragment.
// sequence_id: sequence ID of the fragment being routed.
// Returns the chosen rank; the return statements of the later paths are on lines not visible in this listing.
264 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
// With exactly one enabled destination there is nothing to decide.
267 if (enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
269 if (use_routing_master_)
// Poll the routing table until an entry appears or routing_timeout_ms_ elapses.
// A timeout value <= 0 makes the loop condition always true, so the loop has no deadline.
271 auto start = std::chrono::steady_clock::now();
272 while (routing_timeout_ms_ <= 0 || std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count() < routing_timeout_ms_)
274 std::unique_lock<std::mutex> lck(routing_mutex_);
// RouteBySequenceID looks the fragment up by its sequence ID; the time spent waiting,
// in nanoseconds, is added to routing_wait_time_.
275 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
276 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
277 return routing_table_.at(sequence_id);
// RouteBySendCount looks up by the total number of fragments sent so far.
279 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
281 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
282 return routing_table_.at(sent_frag_count_.count());
// Sleeps for one hundredth of the timeout (milliseconds * 10 gives microseconds) between polls.
// NOTE(review): the scoping braces are not visible; if lck is still in scope here, the mutex is
// held while sleeping, which delays table updates from the receiver thread - confirm.
// NOTE(review): with routing_timeout_ms_ <= 0 the argument is non-positive - confirm intended.
284 usleep(routing_timeout_ms_ * 10);
// Timed out: record the wait and report which lookup failed.
286 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
287 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
288 TLOG_ERROR(
"DataSenderManager") <<
"Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
289 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!" << TLOG_ENDL;
// Presumably the else branch for the remaining modes (the else line is not visible) - TODO confirm.
293 TLOG_ERROR(
"DataSenderManager") <<
"Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
294 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!" << TLOG_ENDL;
// Table-free selection: index into the enabled set by sequence ID modulo its size.
// Presumably only reached when no routing master is in use (branch lines are not visible) - TODO confirm.
298 auto index = sequence_id % enabled_destinations_.size();
299 auto it = enabled_destinations_.begin();
// The statement that advances `it` inside the loop is not visible in this listing.
300 for (; index > 0; --index)
303 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
// Body of sendFragment (the signature line is not visible in this listing).
// Sends `frag` either to every enabled destination (broadcast) or to the single rank chosen
// by calcDest_, prunes routing table entries that are no longer needed, and publishes send metrics.
// Retry loops, several declarations and the return statement are on lines not visible here.
316 auto start_time = std::chrono::steady_clock::now();
// End-of-data fragments must not pass through this path.
317 if (frag.type() == Fragment::EndOfDataFragmentType)
319 throw cet::exception(
"LogicError")
320 <<
"EOD fragments should not be sent on as received: "
321 <<
"use sendEODFrag() instead.";
// Captured up front because frag may be moved from further down.
323 size_t seqID = frag.sequenceID();
324 size_t fragSize = frag.sizeBytes();
325 TLOG_ARB(13,
"DataSenderManager") <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << std::to_string(fragSize) <<
", seqID=" << std::to_string(seqID) << TLOG_ENDL;
// Broadcast path: taken when broadcast_sends_ is set, and always for EndOfRun,
// EndOfSubrun and Init fragments.
327 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
329 for (
auto& bdest : enabled_destinations_)
331 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d (broadcast)", seqID, bdest);
// Each destination gets its own copy so the original stays intact for the next one.
// NOTE(review): destinations_[bdest] default-inserts when bdest has no plugin; no guard
// like the count() checks in the other paths is visible here - confirm.
333 Fragment fragCopy(frag);
334 auto sts = destinations_[bdest]->copyFragment(fragCopy);
// Presumably a retry inside a loop whose condition is not visible - TODO confirm.
337 sts = destinations_[bdest]->copyFragment(fragCopy);
339 sent_frag_count_.incSlot(bdest);
// Non-blocking path: the fragment is copied to the destination rather than moved.
342 else if (non_blocking_mode_)
345 dest = calcDest_(seqID);
348 TLOG_WARNING(
"DataSenderManager") <<
"Could not get destination for seqID " << std::to_string(seqID) <<
", retrying." << TLOG_ENDL;
// The rank must both have a plugin and be enabled.
351 if (destinations_.count(dest) && enabled_destinations_.count(dest))
353 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
// lastWarnTime is refreshed after each failure log; presumably it rate-limits the message
// via a check that is not visible - TODO confirm.
355 auto lastWarnTime = std::chrono::steady_clock::now();
358 sts = destinations_[dest]->copyFragment(frag);
361 TLOG_ERROR(
"DataSenderManager") <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying..." << TLOG_ENDL;
362 lastWarnTime = std::chrono::steady_clock::now();
366 sent_frag_count_.incSlot(dest);
// Reached when the rank from calcDest_ fails the check above; the fragment is not sent.
370 TLOG_WARNING(
"DataSenderManager") <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID << TLOG_ENDL;
// Presumably the default (blocking) path: same flow, but the fragment is moved - TODO confirm.
376 dest = calcDest_(seqID);
379 TLOG_WARNING(
"DataSenderManager") <<
"Could not get destination for seqID " << std::to_string(seqID) <<
", send number " << sent_frag_count_.count() <<
", retrying." << TLOG_ENDL;
382 if (destinations_.count(dest) && enabled_destinations_.count(dest))
384 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
386 auto lastWarnTime = std::chrono::steady_clock::now();
// NOTE(review): if this statement sits in a retry loop, frag has already been handed to
// std::move on the second attempt - confirm moveFragment leaves it intact on failure.
389 sts = destinations_[dest]->moveFragment(std::move(frag));
392 TLOG_ERROR(
"DataSenderManager") <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying..." << TLOG_ENDL;
393 lastWarnTime = std::chrono::steady_clock::now();
397 sent_frag_count_.incSlot(dest);
401 TLOG_WARNING(
"DataSenderManager") <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID << TLOG_ENDL;
// Prune table entries that precede the one just used.
// NOTE(review): the find() in this condition reads routing_table_ before routing_mutex_ is
// taken, while the receiver thread may be writing to it.
404 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID - 1) != routing_table_.end())
406 std::unique_lock<std::mutex> lck(routing_mutex_);
407 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
// NOTE(review): if find() returns end() here, this erase removes every entry - confirm intended.
409 else if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
411 std::unique_lock<std::mutex> lck(routing_mutex_);
412 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
// Metrics: elapsed time in seconds (double), size in bytes, and their ratio as the rate.
416 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
417 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), delta_t,
"s", 1);
418 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), fragSize,
"B", 1);
419 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t,
"B/s", 1);
420 if (use_routing_master_) {
// NOTE(review): size() is read here with no visible lock; GetRoutingTableEntryCount takes
// routing_mutex_ for the same read.
421 metricMan->sendMetric(
"Routing Table Size", routing_table_.size(),
"events", 1);
422 if (routing_wait_time_ > 0)
// NOTE(review): read-then-reset is two separate operations, so time added by calcDest_ in
// between is lost; a single exchange(0) would avoid that - confirm.
424 size_t wttemp = routing_wait_time_;
425 routing_wait_time_ = 0;
// NOTE(review): integer division truncates the nanosecond total to whole seconds, so waits
// under one second are reported as 0; dividing by 1000000000.0 would keep the fraction.
426 metricMan->sendMetric(
"Routing Wait Time", wttemp / 1000000000,
"s", 1);
430 TRACE(5,
"DataSenderManager::sendFragment: Done sending fragment %zu", seqID);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
int sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.