1 #define TRACE_NAME "DataSenderManager"
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "canvas/Utilities/Exception.h"
10 #include <netinet/in.h>
11 #include <sys/types.h>
13 #include <sys/socket.h>
// DataSenderManager constructor (fragment: the signature, the braces and some
// statements are not present in this listing).
// From the lines that are visible it: reads its settings from `pset`, merges
// the host maps of all destinations, creates one transfer object per
// destination, fills enabled_destinations_, and optionally starts the routing
// table receiver thread.
//
// Member initializers: each value comes from `pset` with the default shown.
18 , enabled_destinations_()
20 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
21 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
22 , send_timeout_us_(pset.get<size_t>(
"send_timeout_usec", 5000000))
23 , send_retry_count_(pset.get<size_t>(
"send_retry_count", 2))
24 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
29 TLOG(TLVL_DEBUG) <<
"Received pset: " << pset.to_string();
// A configured timeout of 0 is replaced by the largest size_t value.
32 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
// Routing settings live in the nested "routing_table_config" parameter set;
// an empty set (and therefore every default below) is used when it is absent.
34 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
35 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
36 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
37 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
38 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
// Note the key name: ack_address_ is filled from "routing_master_hostname".
39 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
40 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
41 routing_retry_count_ = rmConfig.get<
int>(
"routing_retry_count", 5);
// Start from the host map of the top-level pset, then fold in the host map of
// every entry under "destinations".
44 hostMap_t host_map = MakeHostMap(pset);
46 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
47 for (
auto& d : dests.get_pset_names())
49 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
50 host_map = MakeHostMap(dest_pset, 0, host_map);
// Rewrite each destination pset so that its "host_map" entry is the merged
// map; the rewritten psets are collected in dests_mod.
52 auto host_map_pset = MakeHostMapPset(host_map);
53 fhicl::ParameterSet dests_mod;
54 for (
auto& d : dests.get_pset_names())
56 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
57 dest_pset.erase(
"host_map");
58 dest_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
59 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
// Per-destination setup. The statement that creates `transfer` (and the `try`
// that the handlers below belong to) is not visible in this listing.
62 for (
auto& d : dests.get_pset_names())
67 auto destination_rank = transfer->destination_rank();
68 destinations_.emplace(destination_rank, std::move(transfer));
// NOTE(review): both handlers below catch by value; catching by const
// reference would avoid the copy (and any slicing of derived exceptions).
70 catch (std::invalid_argument)
72 TLOG(TLVL_DEBUG) <<
"Invalid destination specification: " << d;
74 catch (cet::exception ex)
76 TLOG(TLVL_WARNING) <<
"Caught cet::exception: " << ex.what();
80 TLOG(TLVL_WARNING) <<
"Non-cet exception while setting up TransferPlugin: " << d <<
".";
// An empty destination map is reported as an error; whether construction
// continues afterwards is not visible here.
83 if (destinations_.size() == 0)
85 TLOG(TLVL_ERROR) <<
"No destinations specified!";
// "enabled_destinations" is optional: when it is empty every key of
// destinations_ is enabled, otherwise the listed values are inserted as given.
89 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
90 if (enabled_dests.size() == 0)
92 TLOG(TLVL_INFO) <<
"enabled_destinations not specified, assuming all destinations enabled.";
93 for (
auto& d : destinations_)
95 enabled_destinations_.insert(d.first);
100 for (
auto& d : enabled_dests)
102 enabled_destinations_.insert(d);
// The table receiver thread is only started when routing is enabled.
106 if (use_routing_master_) startTableReceiverThread_();
// DataSenderManager destructor (fragment: the signature, the braces and some
// statements are not present in this listing).
111 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager BEGIN";
// For every enabled destination that has an entry in destinations_, move an
// end-of-data fragment to it. The fragment is built by Fragment::eodFrag from
// the per-destination count in sent_frag_count_. What is done with `sts` is
// not visible here.
113 for (
auto& dest : enabled_destinations_)
115 if (destinations_.count(dest))
117 auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
// Wait for the routing thread, if one is running, before finishing.
122 if (routing_thread_.joinable()) routing_thread_.join();
123 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments.";
// Opens table_socket_ as a UDP socket, binds it to table_port_ on any local
// address, and joins the multicast group named by table_address_.
// Fragment: the braces, the declarations of `yes` and `mreq`, and whatever
// follows each error log (return / cleanup) are not present in this listing.
127 void artdaq::DataSenderManager::setupTableListener_()
129 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
130 if (table_socket_ < 0)
132 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
136 struct sockaddr_in si_me_request;
// Enable SO_REUSEADDR on the socket before binding.
139 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
141 TLOG(TLVL_ERROR) <<
" Unable to enable port reuse on request socket";
// Bind to INADDR_ANY:table_port_ (port and address converted to network order).
144 memset(&si_me_request, 0,
sizeof(si_me_request));
145 si_me_request.sin_family = AF_INET;
146 si_me_request.sin_port = htons(table_port_);
147 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
148 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
150 TLOG(TLVL_ERROR) <<
"Cannot bind request socket to port " << table_port_;
// Resolve the multicast address into mreq and join the group on INADDR_ANY.
// The check on `sts` that guards the error log below is not visible here.
155 int sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
158 TLOG(TLVL_ERROR) <<
"Unable to resolve multicast address for table updates";
161 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
162 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
164 TLOG(TLVL_ERROR) <<
"Unable to join multicast group";
// Starts (or restarts) the thread that runs receiveTableUpdatesLoop_ on this
// object. The braces are not present in this listing.
168 void artdaq::DataSenderManager::startTableReceiverThread_()
// If routing_thread_ is still joinable, wait for it to finish before it is
// replaced by the new thread below.
170 if (routing_thread_.joinable()) routing_thread_.join();
171 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
172 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
// Body of the routing thread: polls table_socket_ for routing packets,
// validates them, merges their entries into routing_table_, and logs the
// acknowledgement that goes back to the routing master.
// Fragment: the loop construct, the braces, the receive calls, the ack send
// and any locking are not present in this listing, so the comments below
// describe only the visible statements.
174 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
180 TLOG(TLVL_DEBUG) <<
"receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
184 TLOG(TLVL_TRACE) <<
"DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests";
// The listener socket is opened on demand; -1 marks "not open".
185 if (table_socket_ == -1)
187 TLOG(TLVL_DEBUG) <<
"Opening table listener socket";
188 setupTableListener_();
190 if (table_socket_ == -1)
192 TLOG(TLVL_DEBUG) <<
"The listen socket was not opened successfully.";
// The acknowledgement socket is also created on demand, and the master's
// address is resolved into ack_addr_. No check of the socket() result is
// visible in this listing.
195 if (ack_socket_ == -1)
197 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
198 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
// NOTE(review): the message below says "routing_master_address", but the
// constructor reads ack_address_ from the key "routing_master_hostname".
201 TLOG(TLVL_ERROR) <<
"Unable to resolve routing_master_address";
204 TLOG(TLVL_DEBUG) <<
"Ack socket is fd " << ack_socket_;
// Wait up to 1000 ms for readable (normal or priority) data on the socket.
208 fd.fd = table_socket_;
209 fd.events = POLLIN | POLLPRI;
211 auto res = poll(&fd, 1, 1000);
214 auto first = artdaq::Fragment::InvalidSequenceID;
215 auto last = artdaq::Fragment::InvalidSequenceID;
218 TLOG(TLVL_DEBUG) <<
"Going to receive RoutingPacketHeader";
// NOTE(review): the logged text opens "(sizeof(RoutingPacketHeader) == " but
// never emits the closing ")".
220 TLOG(TLVL_DEBUG) <<
"Received " << std::to_string(stss) <<
" bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(
sizeof(detail::RoutingPacketHeader));
222 TLOG(TLVL_DEBUG) <<
"Checking for valid header";
// Only headers carrying ROUTING_MAGIC are processed.
223 if (hdr.
header == ROUTING_MAGIC)
// Once a mode has been learned (anything other than INVALID), a packet with a
// different mode is reported as an error; otherwise the mode is adopted.
225 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.
mode)
227 TLOG(TLVL_ERROR) <<
"Received table has different RoutingMasterMode than expected!";
230 routing_master_mode_ = hdr.
mode;
233 TLOG(TLVL_DEBUG) <<
"Receiving data buffer";
236 TLOG(6) <<
"Received a packet of " << sts <<
" bytes";
// NOTE(review): buffer[0] and buffer[buffer.size() - 1] require a non-empty
// buffer; no emptiness check is visible in this listing - confirm one exists.
238 first = buffer[0].sequence_id;
239 last = buffer[buffer.size() - 1].sequence_id;
// The packet must cover a contiguous range: first + nEntries - 1 == last.
241 if (first + hdr.
nEntries - 1 != last)
243 TLOG(TLVL_ERROR) <<
"Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
246 auto thisSeqID = first;
// Entries are merged only when the packet's last sequence ID is not already
// present in routing_table_.
248 if (routing_table_.count(last) == 0)
// NOTE(review): `auto entry` copies each element; `const auto&` would avoid
// the copy. Where thisSeqID is advanced is not visible in this listing.
250 for (
auto entry : buffer)
// An out-of-sequence entry is logged and `last` is pulled back to the last
// sequence ID that was consistent.
252 if (thisSeqID != entry.sequence_id)
254 TLOG(TLVL_ERROR) <<
"Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
255 last = thisSeqID - 1;
// A sequence ID that is already routed to a different rank is reported; per
// the message, the existing assignment is the one that is kept.
259 if (routing_table_.count(entry.sequence_id))
261 if (routing_table_[entry.sequence_id] != entry.destination_rank)
263 TLOG(TLVL_ERROR) <<
"Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
264 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
265 <<
" I will use the original value!";
269 routing_table_[entry.sequence_id] = entry.destination_rank;
270 TLOG(TLVL_DEBUG) <<
"DataSenderManager " << std::to_string(my_rank) <<
": received update: SeqID " << std::to_string(entry.sequence_id) <<
" -> Rank " << std::to_string(entry.destination_rank);
// The acknowledgement reports the [first, last] range handled above; the
// actual send call is not visible in this listing.
279 TLOG(TLVL_DEBUG) <<
"Sending RoutingAckPacket with first= " << std::to_string(first) <<
" and last= " << std::to_string(last) <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")";
280 TLOG(TLVL_DEBUG) <<
"There are now " << routing_table_.size() <<
" entries in the Routing Table";
// Fragment of a function whose signature is not present in this listing.
// Returns the number of entries in routing_table_, read while holding
// routing_mutex_.
289 std::unique_lock<std::mutex> lck(routing_mutex_);
290 return routing_table_.size();
// Chooses the destination rank for `sequence_id`.
// Fragment: the braces, several conditions and the final return are not
// present in this listing; the comments describe only the visible statements.
293 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
// With exactly one enabled destination there is nothing to choose.
296 if (enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
298 if (use_routing_master_)
300 auto start = std::chrono::steady_clock::now();
// Keep looking until routing_timeout_ms_ has elapsed; a non-positive timeout
// makes the first half of the condition true, so the loop is not time-limited.
301 while (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))
303 std::unique_lock<std::mutex> lck(routing_mutex_);
// Two lookups are visible: by sequence ID, and by the total sent-fragment
// count. The conditions selecting between them are not visible here
// (presumably the routing master mode - TODO confirm). The time waited is
// added to routing_wait_time_ before each return.
306 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
307 return routing_table_.at(sequence_id);
311 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
312 return routing_table_.at(sent_frag_count_.count());
// Sleep for routing_timeout_ms_ * 10 microseconds (1/100 of the timeout)
// between attempts.
// NOTE(review): when routing_timeout_ms_ <= 0 this argument is zero or
// negative - confirm that is intended.
314 usleep(routing_timeout_ms_ * 10);
316 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
319 TLOG(TLVL_ERROR) <<
"Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
320 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!";
324 TLOG(TLVL_ERROR) <<
"Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
325 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!";
// Selection by position: sequence_id modulo the number of enabled
// destinations, then a walk of that many steps from begin(). The iterator
// increment is not visible in this listing.
// NOTE(review): the modulo divides by enabled_destinations_.size(); no guard
// against an empty set is visible in this listing.
330 auto index = sequence_id % enabled_destinations_.size();
331 auto it = enabled_destinations_.begin();
332 for (; index > 0; --index)
335 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
// Fragment of the function that sends one Fragment (`frag`) and returns the
// pair (dest, outsts). The signature, the braces, the retry loops and the
// declarations of `dest`, `sts` and `outsts` are not present in this listing;
// the comments describe only the visible statements.
346 auto start_time = std::chrono::steady_clock::now();
// End-of-data fragments are rejected with a cet::exception ("LogicError").
347 if (frag.type() == Fragment::EndOfDataFragmentType)
349 throw cet::exception(
"LogicError")
350 <<
"EOD fragments should not be sent on as received: "
351 <<
"use sendEODFrag() instead.";
// Captured up front; one of the branches below moves `frag` away.
353 size_t seqID = frag.sequenceID();
354 size_t fragSize = frag.sizeBytes();
355 TLOG(13) <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << std::to_string(fragSize)
356 <<
", seqID=" << std::to_string(seqID) <<
", type=" << frag.typeString();
// Branch 1 - broadcast: taken when broadcast_sends_ is set or the fragment is
// an EndOfRun, EndOfSubrun or Init fragment. Every enabled destination gets
// its own copy.
359 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
361 for (
auto& bdest : enabled_destinations_)
363 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << std::to_string(seqID) <<
" to destination " << bdest <<
" (broadcast)";
365 Fragment fragCopy(frag);
// copyFragment appears twice; the loop condition around the second call is
// not visible in this listing.
366 auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
370 sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
374 sent_frag_count_.incSlot(bdest);
// Branch 2 - non-blocking: the destination comes from calcDest_, with
// routing_retry_count_ as the starting value of `count`, and the fragment is
// sent with copyFragment so `frag` stays available for another attempt.
377 else if (non_blocking_mode_)
379 auto count = routing_retry_count_;
382 dest = calcDest_(seqID);
386 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << std::to_string(seqID) << (count > 0 ?
", retrying." :
".");
391 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << std::to_string(seqID) <<
" to destination " << dest;
393 auto lastWarnTime = std::chrono::steady_clock::now();
397 sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
400 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying...";
401 lastWarnTime = std::chrono::steady_clock::now();
406 sent_frag_count_.incSlot(dest);
410 TLOG(TLVL_WARNING) <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID;
// Branch 3 - default: same destination lookup, but the fragment is handed
// over with moveFragment; per the log text a failure here loses the data.
415 auto count = routing_retry_count_;
418 dest = calcDest_(seqID);
422 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID "
423 << std::to_string(seqID) <<
", send number " << sent_frag_count_.count()
424 << (count > 0 ?
", retrying." :
".");
429 TLOG(5) <<
"DataSenderManager::sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
432 sts = destinations_[dest]->moveFragment(std::move(frag));
434 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination "
435 << dest <<
" failed! Data has been lost!";
438 sent_frag_count_.incSlot(dest);
442 TLOG(TLVL_WARNING) <<
"calcDest returned invalid destination rank " << dest
443 <<
"! This event has been lost: " << seqID;
// Routing table pruning: entries ordered before seqID - 1 (or before the
// current send count) are erased under routing_mutex_.
// NOTE(review): the find() in the condition below is listed before the lock
// on the following line, so that lookup appears to run unlocked - confirm.
// NOTE(review): if find() returns end(), erase(begin(), end()) empties the
// whole table; a guard is visible for the seqID - 1 case only.
446 && routing_table_.find(seqID - 1) != routing_table_.end())
448 std::unique_lock<std::mutex> lck(routing_mutex_);
449 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
453 std::unique_lock<std::mutex> lck(routing_mutex_);
454 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
// Per-destination metrics: elapsed time, size, rate and running count.
458 TLOG(5) <<
"sendFragment: sending metrics";
459 auto delta_t = TimeUtils::GetElapsedTime(start_time);
460 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), delta_t,
"s", 5, MetricMode::Accumulate);
461 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), fragSize,
"B", 5, MetricMode::Accumulate);
// NOTE(review): the rate divides by delta_t; presumably a floating-point
// number of seconds - confirm it cannot be zero.
462 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t,
"B/s", 5, MetricMode::Average);
463 metricMan->sendMetric(
"Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
464 "fragments", 3, MetricMode::LastPoint);
// Routing metrics are only reported when routing is enabled.
// NOTE(review): routing_table_.size() is read here with no lock visible in
// this listing, while the routing thread updates the table - confirm.
465 if (use_routing_master_)
467 metricMan->sendMetric(
"Routing Table Size", routing_table_.size(),
"events", 2, MetricMode::LastPoint);
// routing_wait_time_ is accumulated in microseconds; it is reported in seconds.
468 if (routing_wait_time_ > 0)
469 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2,
470 MetricMode::Average);
473 TLOG(5) <<
"sendFragment: Done sending fragment " << seqID;
474 return std::make_pair(dest, outsts);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
Value to be returned upon receive timeout.
A row of the Routing Table.
The send operation timed out.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.