1 #include "artdaq/DAQdata/Globals.hh"
2 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "canvas/Utilities/Exception.h"
10 #include <netinet/in.h>
11 #include <sys/types.h>
13 #include <sys/socket.h>
18 , enabled_destinations_()
20 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
21 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
22 , send_timeout_us_(pset.get<size_t>(
"send_timeout_usec", 5000000))
23 , send_retry_count_(pset.get<size_t>(
"send_retry_count", 2))
24 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
28 , routing_table_last_(0)
29 , routing_table_max_size_(pset.get<size_t>(
"routing_table_max_size", 1000))
30 , highest_sequence_id_routed_(0)
32 TLOG(TLVL_DEBUG) <<
"Received pset: " << pset.to_string();
35 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
37 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
38 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
39 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
40 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
41 table_multicast_interface_ = rmConfig.get<std::string>(
"table_update_multicast_interface",
"localhost");
42 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
43 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
44 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
45 routing_retry_count_ = rmConfig.get<
int>(
"routing_retry_count", 5);
48 size_t tcp_send_buffer_size = pset.get<
size_t>(
"tcp_send_buffer_size", 0);
49 size_t max_fragment_size_words = pset.get<
size_t>(
"max_fragment_size_words", 0);
51 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
52 for (
auto& d : dests.get_pset_names())
54 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
58 fhicl::ParameterSet dests_mod;
59 for (
auto& d : dests.get_pset_names())
61 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
62 dest_pset.erase(
"host_map");
63 dest_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
65 if (tcp_send_buffer_size != 0 && !dest_pset.has_key(
"tcp_send_buffer_size"))
67 dest_pset.put<
size_t>(
"tcp_send_buffer_size", tcp_send_buffer_size);
69 if (max_fragment_size_words != 0 && !dest_pset.has_key(
"max_fragment_size_words"))
71 dest_pset.put<
size_t>(
"max_fragment_size_words", max_fragment_size_words);
74 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
77 for (
auto& d : dests_mod.get_pset_names())
82 auto destination_rank = transfer->destination_rank();
83 destinations_.emplace(destination_rank, std::move(transfer));
85 catch (
const std::invalid_argument&)
87 TLOG(TLVL_DEBUG) <<
"Invalid destination specification: " << d;
89 catch (
const cet::exception& ex)
91 TLOG(TLVL_WARNING) <<
"Caught cet::exception: " << ex.what();
95 TLOG(TLVL_WARNING) <<
"Non-cet exception while setting up TransferPlugin: " << d <<
".";
98 if (destinations_.size() == 0)
100 TLOG(TLVL_ERROR) <<
"No destinations specified!";
104 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
105 if (enabled_dests.size() == 0)
107 TLOG(TLVL_INFO) <<
"enabled_destinations not specified, assuming all destinations enabled.";
108 for (
auto& d : destinations_)
110 enabled_destinations_.insert(d.first);
115 for (
auto& d : enabled_dests)
117 enabled_destinations_.insert(d);
121 if (use_routing_master_) startTableReceiverThread_();
126 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager BEGIN";
128 for (
auto& dest : enabled_destinations_)
130 if (destinations_.count(dest))
132 auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
137 if (routing_thread_.joinable()) routing_thread_.join();
138 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments.";
// Opens the UDP socket on which routing table updates are received.
// Visible steps: create table_socket_, enable SO_REUSEADDR, bind to
// table_address_ / table_port_, then join a multicast group via
// IP_ADD_MEMBERSHIP. Every failing step is logged at TLVL_ERROR.
// NOTE: this listing omits several original lines (the embedded line numbers
// jump), so what happens after each error message (return, close, exit) is
// not visible here.
142 void artdaq::DataSenderManager::setupTableListener_()
145 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
146 if (table_socket_ < 0)
148 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
152 struct sockaddr_in si_me_request;
// Allow the port to be reused; `yes` is declared on a line not shown here.
155 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
157 TLOG(TLVL_ERROR) <<
" Unable to enable port reuse on request socket";
// Build the local bind address: IPv4, table_port_ in network byte order.
160 memset(&si_me_request, 0,
sizeof(si_me_request));
161 si_me_request.sin_family = AF_INET;
162 si_me_request.sin_port = htons(table_port_);
// table_address_ is parsed as a dotted-quad string; the check of `sts` that
// guards the error message below is on a line not shown here.
164 struct in_addr in_addr_s;
165 sts = inet_aton(table_address_.c_str(), &in_addr_s);
168 TLOG(TLVL_ERROR) <<
"inet_aton says table_address " << table_address_ <<
" is invalid";
170 si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
171 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
173 TLOG(TLVL_ERROR) <<
"Cannot bind request socket to port " << table_port_;
// Fill the multicast membership request: the group address comes from
// table_address_. The call that resolves the interface is not visible; only
// its error message (mentioning table_multicast_interface_) is.
178 sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
181 TLOG(TLVL_ERROR) <<
"Unable to resolve multicast address for table updates";
// NOTE(review): this message has no separator between "updates" and the
// interface name, so the two run together in the log output.
187 TLOG(TLVL_ERROR) <<
"Unable to resolve multicast interface for table updates" << table_multicast_interface_;
190 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
192 TLOG(TLVL_ERROR) <<
"Unable to join multicast group";
// Starts (or restarts) the thread that runs receiveTableUpdatesLoop_().
// A previously started, still-joinable routing thread is joined first.
// A boost::exception raised while creating the thread is reported both
// through TLOG and on std::cerr, together with the current errno.
// NOTE: the `try` line and braces are missing from this listing; any action
// taken after the two error reports is not visible here.
196 void artdaq::DataSenderManager::startTableReceiverThread_()
198 if (routing_thread_.joinable()) routing_thread_.join();
199 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
// The new thread runs the member function on this object.
201 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
203 catch (
const boost::exception& e)
205 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
206 std::cerr <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Body of the routing thread created by startTableReceiverThread_().
// Visible behaviour: poll table_socket_ for routing table updates, read one
// datagram, validate its header and entries, merge the entries into
// routing_table_ while holding routing_mutex_, and log whether a
// RoutingAckPacket is sent or skipped.
// NOTE: this listing omits many original lines (loop header, braces, the
// declarations of fd / hdr / buffer / counter, and the actual send of the
// acknowledgement), so the control flow between the visible statements is
// only partly shown.
210 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
216 TLOG(TLVL_DEBUG) << __func__ <<
": should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
220 TLOG(TLVL_TRACE) << __func__ <<
": Polling table socket for new routes (interface,address,port = "
221 << table_multicast_interface_ <<
"," << table_address_ <<
"," << table_port_ <<
")";
// Open the listener socket on demand; -1 marks "not open".
222 if (table_socket_ == -1)
224 TLOG(TLVL_DEBUG) << __func__ <<
": Opening table listener socket";
225 setupTableListener_();
227 if (table_socket_ == -1)
229 TLOG(TLVL_DEBUG) << __func__ <<
": The listen socket was not opened successfully.";
// Create the UDP socket used for acknowledgements and resolve
// ack_address_ / ack_port_ into ack_addr_.
232 if (ack_socket_ == -1)
234 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
235 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
// NOTE(review): the message names "routing_master_address", while the
// constructor reads ack_address_ from the key "routing_master_hostname".
238 TLOG(TLVL_ERROR) << __func__ <<
": Unable to resolve routing_master_address";
241 TLOG(TLVL_DEBUG) << __func__ <<
": Ack socket is fd " << ack_socket_;
// Wait up to 1000 ms for readable (normal or priority) data on the table socket.
245 fd.fd = table_socket_;
246 fd.events = POLLIN | POLLPRI;
248 auto res = poll(&fd, 1, 1000);
// first / last hold the sequence ID range of the received packet; both start
// out as the invalid marker.
251 auto first = artdaq::Fragment::InvalidSequenceID;
252 auto last = artdaq::Fragment::InvalidSequenceID;
253 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
256 TLOG(TLVL_DEBUG) << __func__ <<
": Going to receive RoutingPacketHeader";
// Read one datagram of at most MAX_ROUTING_TABLE_SIZE bytes; the sender's
// address is stored in `from`.
257 struct sockaddr_in from;
258 socklen_t len =
sizeof(from);
259 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (
struct sockaddr*)&from, &len);
// NOTE(review): from.sin_port is in network byte order and is logged here
// without ntohs() -- confirm whether the raw value is intended.
260 TLOG(TLVL_DEBUG) << __func__ <<
": Received " << stss <<
" bytes from " << inet_ntoa(from.sin_addr) <<
":" << from.sin_port;
// The datagram must be larger than the header alone to be considered.
262 if (stss > static_cast<ssize_t>(
sizeof(hdr)))
268 TLOG(TLVL_TRACE) << __func__ <<
": Incorrect size received. Discarding.";
272 TRACE(TLVL_DEBUG,
"receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx", hdr.
nEntries, ((
unsigned long*)&hdr)[0], ((
unsigned long*)&hdr)[1]);
// Packets whose header field is not ROUTING_MAGIC are reported as non-routing packets.
273 if (hdr.
header != ROUTING_MAGIC)
275 TLOG(TLVL_TRACE) << __func__ <<
": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)=" << stss;
// routing_master_mode_ is initialised to INVALID in the constructor; once it
// holds a real mode, a packet carrying a different mode is logged as an error.
279 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.
mode)
281 TLOG(TLVL_ERROR) << __func__ <<
": Received table has different RoutingMasterMode than expected!";
284 routing_master_mode_ = hdr.
mode;
289 TRACE(6,
"receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx", stss, ((
unsigned long*)&buffer[0])[0], ((
unsigned long*)&buffer[0])[1]);
291 first = buffer[0].sequence_id;
292 last = buffer[buffer.size() - 1].sequence_id;
// The packet is rejected unless first + nEntries - 1 equals last.
294 if (first + hdr.
nEntries - 1 != last)
296 TLOG(TLVL_ERROR) << __func__ <<
": Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
// thisSeqID is the sequence ID expected for the entry being examined; the
// statement that advances it is not visible in this listing.
299 auto thisSeqID = first;
// routing_table_ is modified only while routing_mutex_ is held.
302 std::unique_lock<std::mutex> lck(routing_mutex_);
// Entries are processed only when the packet's last sequence ID is not yet in the table.
303 if (routing_table_.count(last) == 0)
305 for (
auto entry : buffer)
307 if (thisSeqID != entry.sequence_id)
309 TLOG(TLVL_ERROR) << __func__ <<
": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
// Pull `last` back to just before the unexpected entry (presumably so that
// later bookkeeping covers only the consistent part -- confirm).
310 last = thisSeqID - 1;
// An existing entry that maps to a different rank is reported as corruption.
314 if (routing_table_.count(entry.sequence_id))
316 if (routing_table_[entry.sequence_id] != entry.destination_rank)
318 TLOG(TLVL_ERROR) << __func__ <<
": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
319 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
320 <<
" I will use the original value!";
// Entries below routing_table_last_ (the highest `last` recorded so far) are ignored.
324 if (entry.sequence_id < routing_table_last_)
continue;
325 routing_table_[entry.sequence_id] = entry.destination_rank;
326 TLOG(TLVL_DEBUG) << __func__ <<
": (my_rank=" << my_rank <<
") received update: SeqID " << entry.sequence_id
327 <<
" -> Rank " << entry.destination_rank;
331 TLOG(TLVL_DEBUG) << __func__ <<
": There are now " << routing_table_.size() <<
" entries in the Routing Table";
332 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ <<
": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
// Dump of every table entry at trace level 45; `counter` is declared and
// advanced on lines not shown here.
335 for (
auto& entry : routing_table_)
337 TLOG(45) <<
"Routing Table Entry" << counter <<
": " << entry.first <<
" -> " << entry.second;
// routing_table_last_ only ever moves forward.
347 if (last > routing_table_last_) routing_table_last_ = last;
// The already-acknowledged bit for my_rank is tested only when my_rank is
// smaller than 8 * sizeof(hdr.already_acknowledged_ranks).
349 if (my_rank < static_cast<int>(8*
sizeof(hdr.already_acknowledged_ranks)) && hdr.already_acknowledged_ranks.test(my_rank))
351 TLOG(TLVL_DEBUG) << __func__ <<
": Skipping RoutingAckPacket since this Routing Table Update has already been acknowledged (my_rank = " << my_rank <<
")";
355 TLOG(TLVL_DEBUG) << __func__ <<
": Sending RoutingAckPacket with first= " << first <<
" and last= " << last <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")";
365 std::unique_lock<std::mutex> lck(routing_mutex_);
366 return routing_table_.size();
371 std::unique_lock<std::mutex> lck(routing_mutex_);
373 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
// Chooses the destination rank for the fragment with the given sequence ID.
//  - Without a routing master and with exactly one enabled destination, that
//    destination is returned immediately.
//  - With a routing master, the routing table is consulted repeatedly until
//    should_stop_ is set or routing_timeout_ms_ has elapsed (a value <= 0
//    disables the timeout); the time spent is added to routing_wait_time_.
//  - Otherwise the destination is picked from enabled_destinations_ using
//    sequence_id modulo the number of enabled destinations.
// NOTE: this listing omits several original lines (braces, the conditions
// that select between the two table lookups and between the two warnings,
// the iterator increment, and the final return statements).
377 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
380 if (!use_routing_master_ && enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
382 if (use_routing_master_)
384 auto start = std::chrono::steady_clock::now();
385 TLOG(15) <<
"calcDest_ use_routing_master check for routing info for seqID=" << sequence_id <<
" routing_timeout_ms=" << routing_timeout_ms_ <<
" should_stop_=" << should_stop_;
386 while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
// The routing table is read only while routing_mutex_ is held.
389 std::unique_lock<std::mutex> lck(routing_mutex_);
// Lookup keyed by the fragment's sequence ID (the guarding condition is not
// visible; presumably the route-by-sequence-ID mode -- confirm).
392 if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
393 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
394 return routing_table_.at(sequence_id);
// Lookup keyed by the number of fragments sent so far plus one (the guarding
// condition is not visible; presumably the route-by-send-count mode -- confirm).
398 if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
399 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
400 return routing_table_.at(sent_frag_count_.count() + 1);
// Pause between lookups: routing_timeout_ms_ * 10 microseconds is 1/100 of
// the configured timeout.
// NOTE(review): the loop condition accepts routing_timeout_ms_ <= 0 as
// "no timeout"; in that case this argument is zero or negative -- confirm
// the intended behaviour.
403 usleep(routing_timeout_ms_ * 10);
405 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
408 TLOG(TLVL_WARNING) <<
"Bad Omen: I don't have routing information for seqID " << sequence_id
409 <<
" and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ <<
" ms)!";
413 TLOG(TLVL_WARNING) <<
"Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
414 <<
" and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ <<
" ms)!";
// Fallback selection: index = sequence_id modulo the number of enabled
// destinations, then step an iterator that many times, wrapping at end().
// NOTE(review): if enabled_destinations_ is empty this modulo divides by
// zero; whether an earlier guard exists is not visible here.
419 auto index = sequence_id % enabled_destinations_.size();
420 auto it = enabled_destinations_.begin();
421 for (; index > 0; --index)
424 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
// Removes the given sequence ID from routing_table_ and from
// sent_sequence_id_count_, if present in each.
// seq: the sequence ID whose bookkeeping should be dropped.
// NOTE: braces and a few lines are missing from this listing.
431 void artdaq::DataSenderManager::RemoveRoutingTableEntry(Fragment::sequence_id_t seq)
// This log statement runs before routing_mutex_ is taken below;
// GetSentSequenceIDCount() locks the same mutex itself.
433 TLOG(15) <<
"RemoveRoutingTableEntry: Removing sequence ID " << seq <<
" from routing table. Sent " << GetSentSequenceIDCount(seq) <<
" Fragments with this Sequence ID.";
434 std::unique_lock<std::mutex> lck(routing_mutex_);
// Erase only when the key exists in the routing table.
439 if (routing_table_.find(seq) != routing_table_.end())
440 routing_table_.erase(routing_table_.find(seq));
// Same pattern for the per-sequence-ID send counter.
442 if (sent_sequence_id_count_.find(seq) != sent_sequence_id_count_.end())
444 sent_sequence_id_count_.erase(sent_sequence_id_count_.find(seq));
// Returns the value stored in sent_sequence_id_count_ for the given sequence
// ID, or 0 when there is no entry for it.
// seq: the sequence ID to look up.
// The lookup is done while holding routing_mutex_.
448 size_t artdaq::DataSenderManager::GetSentSequenceIDCount(Fragment::sequence_id_t seq)
450 std::unique_lock<std::mutex> lck(routing_mutex_);
// Check with count() first so that operator[] below never inserts a new entry.
451 if (!sent_sequence_id_count_.count(seq))
return 0;
452 return sent_sequence_id_count_[seq];
459 auto start_time = std::chrono::steady_clock::now();
460 if (frag.type() == Fragment::EndOfDataFragmentType)
462 throw cet::exception(
"LogicError")
463 <<
"EOD fragments should not be sent on as received: "
464 <<
"use sendEODFrag() instead.";
466 size_t seqID = frag.sequenceID();
467 size_t fragSize = frag.sizeBytes();
468 auto latency_s = frag.getLatency(
true);
469 double latency = latency_s.tv_sec + (latency_s.tv_nsec / 1000000000.0);
470 TLOG(13) <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << fragSize
471 <<
", seqID=" << seqID <<
", fragID=" << frag.fragmentID() <<
", type=" << frag.typeString();
474 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
476 for (
auto& bdest : enabled_destinations_)
478 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << bdest <<
" (broadcast)";
484 if (!non_blocking_mode_)
486 sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
490 sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
495 sent_frag_count_.incSlot(bdest);
498 else if (non_blocking_mode_)
500 auto count = routing_retry_count_;
503 dest = calcDest_(seqID);
507 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID << (count > 0 ?
", retrying." :
".");
512 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
514 auto lastWarnTime = std::chrono::steady_clock::now();
518 sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
521 TLOG(TLVL_WARNING) <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying...";
522 lastWarnTime = std::chrono::steady_clock::now();
528 sent_frag_count_.incSlot(dest);
530 else if (!should_stop_)
531 TLOG(TLVL_ERROR) <<
"(in non_blocking) calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID
532 <<
". enabled_destinantions_.size()=" << enabled_destinations_.size();
536 auto start = std::chrono::steady_clock::now();
539 dest = calcDest_(seqID);
542 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID <<
", send number " << sent_frag_count_.count() <<
", retrying. Waited " << TimeUtils::GetElapsedTime(start) <<
" s for routing information.";
548 TLOG(5) <<
"DataSenderManager::sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
551 sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
553 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination "
554 << dest <<
" failed! Data has been lost!";
557 sent_frag_count_.incSlot(dest);
560 else if (!should_stop_)
561 TLOG(TLVL_ERROR) <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID
562 <<
". enabled_destinantions_.size()=" << enabled_destinations_.size();
566 std::unique_lock<std::mutex> lck(routing_mutex_);
567 sent_sequence_id_count_[seqID]++;
570 auto delta_t = TimeUtils::GetElapsedTime(start_time);
574 TLOG(5) <<
"sendFragment: sending metrics";
575 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), delta_t,
"s", 5, MetricMode::Accumulate);
576 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), fragSize,
"B", 5, MetricMode::Accumulate);
577 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t,
"B/s", 5, MetricMode::Average);
578 metricMan->sendMetric(
"Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
"fragments", 3, MetricMode::LastPoint);
579 metricMan->sendMetric(
"Fragment Latency at Send", latency,
"s", 4, MetricMode::Average | MetricMode::Maximum);
581 if (use_routing_master_)
583 metricMan->sendMetric(
"Routing Table Size", GetRoutingTableEntryCount(),
"events", 2, MetricMode::LastPoint);
584 if (routing_wait_time_ > 0)
586 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2, MetricMode::Average);
587 routing_wait_time_ = 0;
591 TLOG(5) <<
"sendFragment: Done sending fragment " << seqID <<
" to dest="<<dest;
592 return std::make_pair(dest, outsts);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
int GetInterfaceForNetwork(char const *host_in, in_addr &addr)
Convert an IP address to the network address of the interface sharing the subnet mask.
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with hostname strings.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.