1 #define TRACE_NAME "DataSenderManager"
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
7 #include "canvas/Utilities/Exception.h"
9 #include <netinet/in.h>
10 #include <sys/types.h>
12 #include <sys/socket.h>
17 , enabled_destinations_()
19 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
20 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
21 , send_timeout_us_(pset.get<size_t>(
"send_timeout_usec", 0))
22 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
27 TLOG_DEBUG(
"DataSenderManager") <<
"Received pset: " << pset.to_string() << TLOG_ENDL;
28 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
29 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
30 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
31 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
32 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
33 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
34 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
35 routing_retry_count_ = rmConfig.get<
int>(
"routing_retry_count", 5);
38 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
39 for (
auto& d : dests.get_pset_names())
44 auto destination_rank = transfer->destination_rank();
45 destinations_.emplace(destination_rank, std::move(transfer));
47 catch (std::invalid_argument)
49 TRACE(3,
"Invalid destination specification: " + d);
51 catch (cet::exception ex)
53 TLOG_WARNING(
"DataSenderManager") <<
"Caught cet::exception: " << ex.what() << TLOG_ENDL;
57 TLOG_WARNING(
"DataSenderManager") <<
"Non-cet exception while setting up TransferPlugin: " << d <<
"." << TLOG_ENDL;
60 if (destinations_.size() == 0)
62 TLOG_ERROR(
"DataSenderManager") <<
"No destinations specified!" << TLOG_ENDL;
66 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
67 if (enabled_dests.size() == 0)
69 TLOG_INFO(
"DataSenderManager") <<
"enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
70 for (
auto& d : destinations_)
72 enabled_destinations_.insert(d.first);
77 for (
auto& d : enabled_dests)
79 enabled_destinations_.insert(d);
83 if (use_routing_master_) startTableReceiverThread_();
88 TLOG_DEBUG(
"DataSenderManager") <<
"Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
90 for (
auto& dest : enabled_destinations_)
92 if (destinations_.count(dest))
94 auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
99 if (routing_thread_.joinable()) routing_thread_.join();
100 TLOG_DEBUG(
"DataSenderManager") <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments." << TLOG_ENDL;
104 void artdaq::DataSenderManager::setupTableListener_()
106 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
107 if (table_socket_ < 0)
109 TLOG_ERROR(
"DataSenderManager") <<
"Error creating socket for receiving table updates!" << TLOG_ENDL;
113 struct sockaddr_in si_me_request;
116 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
118 TLOG_ERROR(
"DataSenderManager") <<
" Unable to enable port reuse on request socket" << TLOG_ENDL;
121 memset(&si_me_request, 0,
sizeof(si_me_request));
122 si_me_request.sin_family = AF_INET;
123 si_me_request.sin_port = htons(table_port_);
124 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
125 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
127 TLOG_ERROR(
"DataSenderManager") <<
"Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
132 int sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
135 TLOG_ERROR(
"DataSenderManager") <<
"Unable to resolve multicast address for table updates" << TLOG_ENDL;
138 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
139 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
141 TLOG_ERROR(
"DataSenderManager") <<
"Unable to join multicast group" << TLOG_ENDL;
145 void artdaq::DataSenderManager::startTableReceiverThread_()
147 if (routing_thread_.joinable()) routing_thread_.join();
148 TLOG_INFO(
"DataSenderManager") <<
"Starting Routing Thread" << TLOG_ENDL;
149 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
151 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
157 TLOG_DEBUG(
"DataSenderManager") <<
"receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ <<
", stopping" << TLOG_ENDL;
161 TRACE(4,
"DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
162 if (table_socket_ == -1)
164 TLOG_DEBUG(
"DataSenderManager") <<
"Opening table listener socket" << TLOG_ENDL;
165 setupTableListener_();
167 if (table_socket_ == -1)
169 TLOG_DEBUG(
"DataSenderManager") <<
"The listen socket was not opened successfully." << TLOG_ENDL;
172 if (ack_socket_ == -1)
174 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
175 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
178 TLOG_ERROR(
"DataSenderManager") <<
"Unable to resolve routing_master_address" << TLOG_ENDL;
181 TLOG_DEBUG(
"DataSenderManager") <<
"Ack socket is fd " << ack_socket_ << TLOG_ENDL;
185 fd.fd = table_socket_;
186 fd.events = POLLIN | POLLPRI;
188 auto res = poll(&fd, 1, 1000);
190 auto first = artdaq::Fragment::InvalidSequenceID;
191 auto last = artdaq::Fragment::InvalidSequenceID;
194 TLOG_DEBUG(
"DataSenderManager") <<
"Going to receive RoutingPacketHeader" << TLOG_ENDL;
196 TLOG_DEBUG(
"DataSenderManager") <<
"Received " << std::to_string(stss) <<
" bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(
sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
198 TLOG_DEBUG(
"DataSenderManager") <<
"Checking for valid header" << TLOG_ENDL;
199 if (hdr.
header == ROUTING_MAGIC) {
200 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.
mode)
202 TLOG_ERROR(
"DataSenderManager") <<
"Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
205 routing_master_mode_ = hdr.
mode;
208 TLOG_DEBUG(
"DataSenderManager") <<
"Receiving data buffer" << TLOG_ENDL;
211 TRACE(6,
"Received a packet of %zu bytes", sts);
213 first = buffer[0].sequence_id;
214 last = buffer[buffer.size() - 1].sequence_id;
216 if (first + hdr.
nEntries - 1 != last)
218 TLOG_ERROR(
"DataSenderManager") <<
"Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!" << TLOG_ENDL;
221 auto thisSeqID = first;
223 if (routing_table_.count(last) == 0) {
224 for (
auto entry : buffer)
226 if (thisSeqID != entry.sequence_id)
228 TLOG_ERROR(
"DataSenderManager") <<
"Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!" << TLOG_ENDL;
229 last = thisSeqID - 1;
233 if (routing_table_.count(entry.sequence_id))
235 if (routing_table_[entry.sequence_id] != entry.destination_rank)
237 TLOG_ERROR(
"DataSenderManager") <<
"Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
238 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
239 <<
" I will use the original value!" << TLOG_ENDL;
243 routing_table_[entry.sequence_id] = entry.destination_rank;
244 TLOG_DEBUG(
"DataSenderManager") <<
"DataSenderManager " << std::to_string(my_rank) <<
": received update: SeqID " << std::to_string(entry.sequence_id) <<
" -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
253 TLOG_DEBUG(
"DataSenderManager") <<
"Sending RoutingAckPacket with first= " << std::to_string(first) <<
" and last= " << std::to_string(last) <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")" << TLOG_ENDL;
254 TLOG_DEBUG(
"DataSenderManager") <<
"There are now " << routing_table_.size() <<
" entries in the Routing Table" << TLOG_ENDL;
263 std::unique_lock<std::mutex> lck(routing_mutex_);
264 return routing_table_.size();
267 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
270 if (enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
272 if (use_routing_master_)
274 auto start = std::chrono::steady_clock::now();
275 while (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))
277 std::unique_lock<std::mutex> lck(routing_mutex_);
279 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
280 return routing_table_.at(sequence_id);
284 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
285 return routing_table_.at(sent_frag_count_.count());
287 usleep(routing_timeout_ms_ * 10);
289 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
291 TLOG_ERROR(
"DataSenderManager") <<
"Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
292 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!" << TLOG_ENDL;
296 TLOG_ERROR(
"DataSenderManager") <<
"Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
297 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!" << TLOG_ENDL;
301 auto index = sequence_id % enabled_destinations_.size();
302 auto it = enabled_destinations_.begin();
303 for (; index > 0; --index)
306 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
319 auto start_time = std::chrono::steady_clock::now();
320 if (frag.type() == Fragment::EndOfDataFragmentType)
322 throw cet::exception(
"LogicError")
323 <<
"EOD fragments should not be sent on as received: "
324 <<
"use sendEODFrag() instead.";
326 size_t seqID = frag.sequenceID();
327 size_t fragSize = frag.sizeBytes();
328 TLOG_ARB(13,
"DataSenderManager") <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << std::to_string(fragSize)
329 <<
", seqID=" << std::to_string(seqID) <<
", type=" << frag.typeString() << TLOG_ENDL;
331 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
333 for (
auto& bdest : enabled_destinations_)
335 TLOG_TRACE(
"DataSenderManager") <<
"sendFragment: Sending fragment with seqId " << std::to_string(seqID) <<
" to destination " << bdest <<
" (broadcast)" << TLOG_ENDL;
337 Fragment fragCopy(frag);
338 auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
341 sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
343 sent_frag_count_.incSlot(bdest);
346 else if (non_blocking_mode_)
348 auto count = routing_retry_count_;
350 dest = calcDest_(seqID);
354 TLOG_WARNING(
"DataSenderManager") <<
"Could not get destination for seqID " << std::to_string(seqID) << (count > 0 ?
", retrying." :
".") << TLOG_ENDL;
359 TLOG_TRACE(
"DataSenderManager") <<
"sendFragment: Sending fragment with seqId " << std::to_string(seqID) <<
" to destination " << dest << TLOG_ENDL;
361 auto lastWarnTime = std::chrono::steady_clock::now();
364 sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
367 TLOG_ERROR(
"DataSenderManager") <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying..." << TLOG_ENDL;
368 lastWarnTime = std::chrono::steady_clock::now();
372 sent_frag_count_.incSlot(dest);
376 TLOG_WARNING(
"DataSenderManager") <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID << TLOG_ENDL;
380 auto count = routing_retry_count_;
382 dest = calcDest_(seqID);
385 TLOG_WARNING(
"DataSenderManager") <<
"Could not get destination for seqID "
386 << std::to_string(seqID) <<
", send number " << sent_frag_count_.count()
387 << (count > 0 ?
", retrying." :
".") << TLOG_ENDL;
391 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
394 sts = destinations_[dest]->moveFragment(std::move(frag), send_timeout_us_);
396 TLOG_ERROR(
"DataSenderManager") <<
"sendFragment: Sending fragment " << seqID <<
" to destination "
397 << dest <<
" failed! Data has been lost!" << TLOG_ENDL;
400 sent_frag_count_.incSlot(dest);
403 TLOG_WARNING(
"DataSenderManager") <<
"calcDest returned invalid destination rank " << dest
404 <<
"! This event has been lost: " << seqID << TLOG_ENDL;
407 && routing_table_.find(seqID - 1) != routing_table_.end()) {
408 std::unique_lock<std::mutex> lck(routing_mutex_);
409 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
412 std::unique_lock<std::mutex> lck(routing_mutex_);
413 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
416 TRACE(5,
"sendFragment: sending metrics");
417 auto delta_t = TimeUtils::GetElapsedTime(start_time);
418 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), delta_t,
"s", 3, MetricMode::Accumulate);
419 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), fragSize,
"B", 3, MetricMode::Accumulate);
420 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t,
"B/s", 3, MetricMode::Average);
421 metricMan->sendMetric(
"Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
422 "fragments", 3, MetricMode::LastPoint);
423 if (use_routing_master_) {
424 metricMan->sendMetric(
"Routing Table Size", routing_table_.size(),
"events", 1, MetricMode::LastPoint);
425 if (routing_wait_time_ > 0)
426 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 1,
427 MetricMode::Average);
430 TRACE(5,
"sendFragment: Done sending fragment %zu", seqID);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
int sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent.
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this information.
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network transmission.
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.