1 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "canvas/Utilities/Exception.h"
10 #include <netinet/in.h>
11 #include <sys/types.h>
13 #include <sys/socket.h>
// DataSenderManager constructor: member-initializer list followed by the constructor body.
// NOTE(review): the embedded listing numbers skip values (e.g. 16-17, 56-57, 80-82), so the
// signature and some statements are not visible here; the comments below describe only the
// lines that are visible.
18 , destination_metric_data_()
19 , destination_metric_send_time_()
20 , enabled_destinations_()
// Send behaviour is read from the top-level ParameterSet; every key has a default value.
22 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
23 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
24 , send_timeout_us_(pset.get<size_t>(
"send_timeout_usec", 5000000))
25 , send_retry_count_(pset.get<size_t>(
"send_retry_count", 2))
// The routing mode starts as INVALID; receiveTableUpdatesLoop_ assigns it from a received header.
26 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
30 , routing_table_last_(0)
31 , routing_table_max_size_(pset.get<size_t>(
"routing_table_max_size", 1000))
32 , highest_sequence_id_routed_(0)
34 TLOG(TLVL_DEBUG) <<
"Received pset: " << pset.to_string();
// A configured timeout of 0 is replaced by the largest size_t value.
37 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
// Routing settings come from the nested "routing_table_config" ParameterSet
// (an empty ParameterSet is used when the key is absent, so all defaults below apply).
39 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
40 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
41 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
42 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
43 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
44 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
45 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
46 routing_retry_count_ = rmConfig.get<
int>(
"routing_retry_count", 5);
// Optional top-level values; when non-zero they are copied into each destination below
// unless that destination already sets the same key.
49 size_t tcp_send_buffer_size = pset.get<
size_t>(
"tcp_send_buffer_size", 0);
50 size_t max_fragment_size_words = pset.get<
size_t>(
"max_fragment_size_words", 0);
// First pass over the "destinations" table.
// NOTE(review): the statement that consumes dest_pset in this loop is not visible in this listing.
52 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
53 for (
auto& d : dests.get_pset_names())
55 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
// Second pass: build dests_mod, a copy of every destination with its "host_map" entry replaced
// by host_map_pset (defined on a line not visible here) and the optional defaults filled in.
59 fhicl::ParameterSet dests_mod;
60 for (
auto& d : dests.get_pset_names())
62 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
63 dest_pset.erase(
"host_map");
64 dest_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
66 if (tcp_send_buffer_size != 0 && !dest_pset.has_key(
"tcp_send_buffer_size"))
68 dest_pset.put<
size_t>(
"tcp_send_buffer_size", tcp_send_buffer_size);
70 if (max_fragment_size_words != 0 && !dest_pset.has_key(
"max_fragment_size_words"))
72 dest_pset.put<
size_t>(
"max_fragment_size_words", max_fragment_size_words);
75 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
// Third pass over dests_mod: each transfer is stored under its destination rank and the
// per-destination metric accumulators are initialised.
// NOTE(review): the creation of 'transfer' is on lines not visible in this listing.
78 for (
auto& d : dests_mod.get_pset_names())
83 auto destination_rank = transfer->destination_rank();
84 destinations_.emplace(destination_rank, std::move(transfer));
85 destination_metric_data_[destination_rank] = std::pair<size_t, double>();
86 destination_metric_send_time_[destination_rank] = std::chrono::steady_clock::now();
// Failures are only logged.
// NOTE(review): both handlers below catch by value; catching by const reference would avoid
// copying/slicing the exception object - confirm against the full source.
88 catch (std::invalid_argument)
90 TLOG(TLVL_DEBUG) <<
"Invalid destination specification: " << d;
92 catch (cet::exception ex)
94 TLOG(TLVL_WARNING) <<
"Caught cet::exception: " << ex.what();
// Warning for any other exception type (the handler line itself is not visible here).
98 TLOG(TLVL_WARNING) <<
"Non-cet exception while setting up TransferPlugin: " << d <<
".";
// Ending up with no destinations is reported as an error.
101 if (destinations_.size() == 0)
103 TLOG(TLVL_ERROR) <<
"No destinations specified!";
// "enabled_destinations" lists the ranks to enable; an empty list enables every configured destination.
107 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
108 if (enabled_dests.size() == 0)
110 TLOG(TLVL_INFO) <<
"enabled_destinations not specified, assuming all destinations enabled.";
111 for (
auto& d : destinations_)
113 enabled_destinations_.insert(d.first);
118 for (
auto& d : enabled_dests)
120 enabled_destinations_.insert(d);
// The routing-table receiver thread is started only when a routing master is configured.
124 if (use_routing_master_) startTableReceiverThread_();
// Shutdown sequence of DataSenderManager (presumably the destructor body; the signature line is
// not visible in this listing).
129 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager BEGIN";
// For every enabled destination that has a transfer object, send an end-of-data fragment in
// reliable mode; the fragment is built from the number of fragments sent to that destination.
131 for (
auto& dest : enabled_destinations_)
133 if (destinations_.count(dest))
// NOTE(review): how 'sts' is used afterwards is not visible in this listing.
135 auto sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
// Wait for the routing-table receiver thread to finish, if one is running.
140 if (routing_thread_.joinable()) routing_thread_.join();
141 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments.";
// Creates the UDP socket on which routing table updates are received: enables address reuse,
// binds to table_address_:table_port_ and joins the multicast group at table_address_.
// Every failure is logged at TLVL_ERROR.
// NOTE(review): what follows each error message (return/exit/cleanup) is not visible in this listing.
145 void artdaq::DataSenderManager::setupTableListener_()
148 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
149 if (table_socket_ < 0)
151 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
155 struct sockaddr_in si_me_request;
// Set SO_REUSEADDR on the socket ('yes' is declared on a line not visible here).
158 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
160 TLOG(TLVL_ERROR) <<
" Unable to enable port reuse on request socket";
// Zero the local address structure, then fill in family and port (converted to network byte order).
163 memset(&si_me_request, 0,
sizeof(si_me_request));
164 si_me_request.sin_family = AF_INET;
165 si_me_request.sin_port = htons(table_port_);
// Parse table_address_ as a dotted-quad address and use it as the bind address.
167 struct in_addr in_addr_s;
168 sts = inet_aton(table_address_.c_str(), &in_addr_s );
171 TLOG(TLVL_ERROR) <<
"inet_aton says table_address " << table_address_ <<
" is invalid";
173 si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
174 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
176 TLOG(TLVL_ERROR) <<
"Cannot bind request socket to port " << table_port_;
// Resolve the multicast group address into mreq ('mreq' is declared on a line not visible here),
// let the system pick the interface (INADDR_ANY), and join the group.
181 sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
184 TLOG(TLVL_ERROR) <<
"Unable to resolve multicast address for table updates";
187 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
188 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
190 TLOG(TLVL_ERROR) <<
"Unable to join multicast group";
// Starts the thread that runs receiveTableUpdatesLoop_() on this object.
194 void artdaq::DataSenderManager::startTableReceiverThread_()
// Join any previously started routing thread before replacing it.
196 if (routing_thread_.joinable()) routing_thread_.join();
197 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
199 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
// A boost::exception is reported both through TLOG and on std::cerr, together with errno.
// NOTE(review): what follows the two messages is not visible in this listing.
201 catch (
const boost::exception& e)
203 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
204 std::cerr <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Body of the routing thread: polls table_socket_ for routing packets, validates them, merges
// their entries into routing_table_ and logs the acknowledgement that is sent back.
// NOTE(review): loop headers, several conditions and some declarations (hdr, buffer, fd, counter)
// are on lines not visible in this listing; comments describe only the visible statements.
208 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
// Message logged when should_stop_ is observed.
214 TLOG(TLVL_DEBUG) << __func__ <<
": should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
218 TLOG(TLVL_TRACE) << __func__ <<
": Polling table socket for new routes";
// The table listener socket is opened on demand; -1 marks "not open".
219 if (table_socket_ == -1)
221 TLOG(TLVL_DEBUG) << __func__ <<
": Opening table listener socket";
222 setupTableListener_();
224 if (table_socket_ == -1)
226 TLOG(TLVL_DEBUG) << __func__ <<
": The listen socket was not opened successfully.";
// The acknowledgement socket is likewise created on demand, and the routing master address
// is resolved into ack_addr_.
229 if (ack_socket_ == -1)
231 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
232 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
235 TLOG(TLVL_ERROR) << __func__ <<
": Unable to resolve routing_master_address";
238 TLOG(TLVL_DEBUG) << __func__ <<
": Ack socket is fd " << ack_socket_;
// Wait up to 1000 ms for readable (normal or priority) data on the table socket.
242 fd.fd = table_socket_;
243 fd.events = POLLIN | POLLPRI;
245 auto res = poll(&fd, 1, 1000);
// first/last start out as the invalid sequence ID; buf is the raw receive buffer.
248 auto first = artdaq::Fragment::InvalidSequenceID;
249 auto last = artdaq::Fragment::InvalidSequenceID;
250 std::vector<uint8_t> buf(MAX_ROUTING_TABLE_SIZE);
253 TLOG(TLVL_DEBUG) << __func__ <<
": Going to receive RoutingPacketHeader";
// Read one datagram of at most MAX_ROUTING_TABLE_SIZE bytes and record the sender address.
254 struct sockaddr_in from;
255 socklen_t len=
sizeof(from);
256 auto stss = recvfrom(table_socket_, &buf[0], MAX_ROUTING_TABLE_SIZE, 0, (
struct sockaddr*)&from, &len );
// NOTE(review): from.sin_port is printed without ntohs(), so the logged value is presumably in
// network byte order - confirm whether that is intended.
257 TLOG(TLVL_DEBUG) << __func__ <<
": Received " << stss <<
" bytes from " << inet_ntoa(from.sin_addr) <<
":" << from.sin_port;
// Only datagrams larger than the header are considered further.
259 if (stss > static_cast<ssize_t>(
sizeof(hdr)))
265 TLOG(TLVL_TRACE) << __func__ <<
": Incorrect size received. Discarding.";
// Trace nEntries and the first two unsigned-long words of the header.
269 TRACE(TLVL_DEBUG,
"receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx",hdr.
nEntries,((
unsigned long*)&hdr)[0],((
unsigned long*)&hdr)[1]);
// The header must carry ROUTING_MAGIC to be treated as a routing packet.
270 if (hdr.
header != ROUTING_MAGIC)
272 TLOG(TLVL_TRACE) << __func__ <<
": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)="<<stss;
// Once a routing mode is known, a packet announcing a different mode is reported as an error.
276 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.
mode)
278 TLOG(TLVL_ERROR) << __func__ <<
": Received table has different RoutingMasterMode than expected!";
281 routing_master_mode_ = hdr.
mode;
// Trace the packet size and the first 16 bytes of the entry buffer.
286 TRACE(6,
"receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx",stss,((
unsigned long*)&buffer[0])[0],((
unsigned long*)&buffer[0])[1]);
// Sequence IDs of the first and last entries in the packet.
288 first = buffer[0].sequence_id;
289 last = buffer[buffer.size() - 1].sequence_id;
// The packet must span exactly nEntries consecutive sequence IDs.
291 if (first + hdr.
nEntries - 1 != last)
293 TLOG(TLVL_ERROR) << __func__ <<
": Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
// thisSeqID is the sequence ID expected for the entry being examined.
296 auto thisSeqID = first;
// The routing table is modified while holding routing_mutex_.
299 std::unique_lock<std::mutex> lck(routing_mutex_);
// Entries are merged only when the packet's last sequence ID is not yet in the table.
300 if (routing_table_.count(last) == 0)
// NOTE(review): 'auto entry' copies each element; 'const auto&' would avoid the copy.
302 for (
auto entry : buffer)
// An unexpected sequence ID aborts processing; 'last' is pulled back to the previous expected ID.
304 if (thisSeqID != entry.sequence_id)
306 TLOG(TLVL_ERROR) << __func__ <<
": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
307 last = thisSeqID - 1;
// A sequence ID already in the table with a different destination is reported as corruption;
// per the log message the original value is kept.
311 if (routing_table_.count(entry.sequence_id))
313 if (routing_table_[entry.sequence_id] != entry.destination_rank)
315 TLOG(TLVL_ERROR) << __func__ <<
": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
316 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
317 <<
" I will use the original value!";
// Entries below routing_table_last_ are skipped; everything else is stored.
321 if (entry.sequence_id < routing_table_last_)
continue;
322 routing_table_[entry.sequence_id] = entry.destination_rank;
323 TLOG(TLVL_DEBUG) << __func__ <<
": (my_rank=" << my_rank <<
") received update: SeqID " << entry.sequence_id
324 <<
" -> Rank " << entry.destination_rank;
328 TLOG(TLVL_DEBUG) << __func__ <<
": There are now " << routing_table_.size() <<
" entries in the Routing Table";
329 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ <<
": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
// Dump every table entry at TLOG level 45.
332 for (
auto& entry : routing_table_)
334 TLOG(45) <<
"Routing Table Entry" << counter <<
": " << entry.first <<
" -> " << entry.second;
// Remember the highest 'last' sequence ID seen so far.
344 if (last > routing_table_last_) routing_table_last_ = last;
// Log the acknowledgement; the statements that build and send it are not visible in this listing.
346 TLOG(TLVL_DEBUG) << __func__ <<
": Sending RoutingAckPacket with first= " << first <<
" and last= " << last <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")";
// Returns the number of entries in routing_table_, read while holding routing_mutex_.
// NOTE(review): the signature line is not visible in this listing; presumably this is
// GetRoutingTableEntryCount() - confirm against the full source.
355 std::unique_lock<std::mutex> lck(routing_mutex_);
356 return routing_table_.size();
// Counts, while holding routing_mutex_, the routing table entries whose key is strictly greater
// than highest_sequence_id_routed_ (from upper_bound to end()).
// NOTE(review): the signature and the return statement are not visible in this listing;
// presumably this is GetRemainingRoutingTableEntries() - confirm against the full source.
361 std::unique_lock<std::mutex> lck(routing_mutex_);
363 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
// Determines the destination rank for the given sequence ID.
// @param sequence_id Sequence ID of the fragment to be routed.
// @return The destination rank.
// NOTE(review): several conditions, braces and return statements of this function are on lines
// not visible in this listing, so the exact branch structure cannot be confirmed here.
// NOTE(review): the method is const but assigns highest_sequence_id_routed_ and updates
// routing_wait_time_; presumably those members are declared mutable - confirm.
367 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
// Without a routing master and with exactly one enabled destination there is nothing to choose.
370 if (!use_routing_master_ && enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
372 if (use_routing_master_)
374 auto start = std::chrono::steady_clock::now();
375 TLOG(15) <<
"calcDest_ use_routing_master check for routing info for seqID="<<sequence_id<<
" routing_timeout_ms="<<routing_timeout_ms_<<
" should_stop_="<<should_stop_;
// Keep looking until should_stop_ is set or routing_timeout_ms_ has elapsed; a timeout <= 0
// removes the time limit.
376 while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
// The table is read while holding routing_mutex_.
379 std::unique_lock<std::mutex> lck(routing_mutex_);
// Lookup keyed by sequence ID: track the highest ID routed, account the wait time, return the rank.
382 if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
383 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
384 return routing_table_.at(sequence_id);
// Lookup keyed by send count (sent_frag_count_.count() + 1) with the same bookkeeping.
// NOTE(review): the condition selecting between the two lookups is not visible here.
388 if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
389 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
390 return routing_table_.at(sent_frag_count_.count() + 1);
// Sleep for routing_timeout_ms_ * 10 microseconds (1/100 of the timeout) between attempts.
393 usleep(routing_timeout_ms_ * 10);
// Account the time spent waiting even when no route was found.
395 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
398 TLOG(TLVL_WARNING) <<
"Bad Omen: I don't have routing information for seqID " << sequence_id
399 <<
" and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ <<
" ms)!";
403 TLOG(TLVL_WARNING) <<
"Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
404 <<
" and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ <<
" ms)!";
// Modulo-based selection: index = sequence_id mod the number of enabled destinations, then the
// iterator is walked 'index' steps, wrapping to begin() when it reaches end() (the increment of
// 'it' is presumably on a line not visible here).
// NOTE(review): the modulo assumes enabled_destinations_ is non-empty - confirm a guard exists.
409 auto index = sequence_id % enabled_destinations_.size();
410 auto it = enabled_destinations_.begin();
411 for (; index > 0; --index)
414 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
// Body of sendFragment: sends 'frag' to one destination, or to all enabled destinations in the
// broadcast case, updates per-destination metrics and returns (destination rank, status).
// NOTE(review): the signature, several declarations (dest, sts, outsts) and loop/branch lines are
// not visible in this listing; comments describe only the visible statements.
425 auto start_time = std::chrono::steady_clock::now();
// End-of-data fragments must not be passed through this function.
426 if (frag.type() == Fragment::EndOfDataFragmentType)
428 throw cet::exception(
"LogicError")
429 <<
"EOD fragments should not be sent on as received: "
430 <<
"use sendEODFrag() instead.";
// Sequence ID and size are captured up front; the blocking path later moves from 'frag'.
432 size_t seqID = frag.sequenceID();
433 size_t fragSize = frag.sizeBytes();
434 TLOG(13) <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << fragSize
435 <<
", seqID=" << seqID <<
", type=" << frag.typeString();
// Broadcast path: taken when broadcast_sends_ is set or the fragment is an EndOfRun,
// EndOfSubrun or Init fragment; the fragment goes to every enabled destination.
438 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
440 for (
auto& bdest : enabled_destinations_)
442 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << bdest <<
" (broadcast)";
// Blocking mode sends a copy of the fragment in reliable mode; otherwise the min-blocking
// send is used with send_timeout_us_.
448 if (!non_blocking_mode_)
450 sts = destinations_[bdest]->transfer_fragment_reliable_mode(Fragment(frag));
454 sts = destinations_[bdest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
459 sent_frag_count_.incSlot(bdest);
// Non-blocking single-destination path: the destination lookup starts with a retry budget of
// routing_retry_count_.
462 else if (non_blocking_mode_)
464 auto count = routing_retry_count_;
467 dest = calcDest_(seqID);
471 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID << (count > 0 ?
", retrying." :
".");
476 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
// lastWarnTime records when the last "failed! Retrying..." warning was issued.
478 auto lastWarnTime = std::chrono::steady_clock::now();
482 sts = destinations_[dest]->transfer_fragment_min_blocking_mode(frag, send_timeout_us_);
485 TLOG(TLVL_WARNING) <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying...";
486 lastWarnTime = std::chrono::steady_clock::now();
491 sent_frag_count_.incSlot(dest);
// An invalid destination is reported as an error unless should_stop_ is set.
493 else if (!should_stop_)
494 TLOG(TLVL_ERROR) <<
"(in non_blocking) calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID
495 <<
". enabled_destinantions_.size()="<<enabled_destinations_.size();
// Blocking single-destination path: 'start' measures how long routing information was awaited.
499 auto start = std::chrono::steady_clock::now();
502 dest = calcDest_(seqID);
505 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID <<
", send number " << sent_frag_count_.count() <<
", retrying. Waited " << TimeUtils::GetElapsedTime(start) <<
" s for routing information.";
511 TLOG(5) <<
"DataSenderManager::sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
// The reliable-mode send takes the fragment by move.
514 sts = destinations_[dest]->transfer_fragment_reliable_mode(std::move(frag));
516 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination "
517 << dest <<
" failed! Data has been lost!";
520 sent_frag_count_.incSlot(dest);
523 else if (!should_stop_)
524 TLOG(TLVL_ERROR) <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID
525 <<
". enabled_destinantions_.size()="<<enabled_destinations_.size();
// While holding routing_mutex_, remove the routing entry keyed by seqID, or else the one keyed
// by the current send count.
// NOTE(review): the condition guarding the first erase is on a line not visible here.
529 std::unique_lock<std::mutex> lck(routing_mutex_);
535 routing_table_.erase(routing_table_.find(seqID));
536 else if(routing_table_.find(sent_frag_count_.count()) != routing_table_.end())
537 routing_table_.erase(routing_table_.find(sent_frag_count_.count()));
// Accumulate bytes sent and elapsed time for this destination.
541 auto delta_t = TimeUtils::GetElapsedTime(start_time);
542 destination_metric_data_[dest].first += fragSize;
543 destination_metric_data_[dest].second += delta_t;
// Metrics for a destination are sent only when more than 1 (in GetElapsedTime units, logged
// elsewhere in this file as seconds) has passed since they were last sent.
545 if (metricMan && TimeUtils::GetElapsedTime(destination_metric_send_time_[dest]) > 1)
547 TLOG(5) <<
"sendFragment: sending metrics";
548 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), destination_metric_data_[dest].second,
"s", 5, MetricMode::Accumulate);
549 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), destination_metric_data_[dest].first,
"B", 5, MetricMode::Accumulate);
550 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), destination_metric_data_[dest].first / destination_metric_data_[dest].second,
"B/s", 5, MetricMode::Average);
551 metricMan->sendMetric(
"Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
"fragments", 3, MetricMode::LastPoint);
// Reset the accumulators and the send timestamp for the next reporting interval.
553 destination_metric_send_time_[dest] = std::chrono::steady_clock::now();
554 destination_metric_data_[dest].first = 0;
555 destination_metric_data_[dest].second = 0.0;
// Routing metrics: table size, plus the accumulated wait time (microseconds converted to
// seconds), which is cleared after being reported.
557 if (use_routing_master_)
559 metricMan->sendMetric(
"Routing Table Size", GetRoutingTableEntryCount(),
"events", 2, MetricMode::LastPoint);
560 if (routing_wait_time_ > 0)
562 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2, MetricMode::Average);
563 routing_wait_time_ = 0;
567 TLOG(5) <<
"sendFragment: Done sending fragment " << seqID <<
" to dest="<<dest;
// Result: the destination rank together with the send status.
568 return std::make_pair(dest, outsts);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.