1 #define TRACE_NAME (app_name + "_DataSenderManager").c_str()
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "canvas/Utilities/Exception.h"
10 #include <netinet/in.h>
11 #include <sys/types.h>
13 #include <sys/socket.h>
// DataSenderManager constructor: member-initializer list followed by the constructor body.
// NOTE(review): the signature line and a number of brace-only / statement lines are absent
// from this listing; the comments below describe only the statements that are visible here.
18 , destination_metric_data_()
19 , destination_metric_send_time_()
20 , enabled_destinations_()
// Send-behaviour options are read from the top-level ParameterSet; the second argument of
// each get<>() is the default used when the key is absent.
22 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
23 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
24 , send_timeout_us_(pset.get<size_t>(
"send_timeout_usec", 5000000))
25 , send_retry_count_(pset.get<size_t>(
"send_retry_count", 2))
// The routing mode starts as INVALID; the table-receive loop assigns it from a received header.
26 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
30 , routing_table_last_(0)
31 , routing_table_max_size_(pset.get<size_t>(
"routing_table_max_size", 1000))
32 , highest_sequence_id_routed_(0)
34 TLOG(TLVL_DEBUG) <<
"Received pset: " << pset.to_string();
// A configured timeout of 0 is replaced by the largest size_t value
// (presumably meaning "no effective timeout" -- confirm against the transfer plugins).
37 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
// Routing settings live in the nested "routing_table_config" ParameterSet (empty if absent).
39 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
40 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
41 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
42 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
43 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
// Note: the acknowledge address is read from the "routing_master_hostname" key.
44 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
45 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
46 routing_retry_count_ = rmConfig.get<
int>(
"routing_retry_count", 5);
// Top-level defaults that may be copied into each destination below; 0 means "not set"
// (see the != 0 checks further down).
49 size_t tcp_send_buffer_size = pset.get<
size_t>(
"tcp_send_buffer_size", 0);
50 size_t max_fragment_size_words = pset.get<
size_t>(
"max_fragment_size_words", 0);
// First pass over the "destinations" table: each destination's ParameterSet is fetched.
// NOTE(review): the statement that consumes dest_pset here is not visible in this listing;
// presumably it accumulates the host map used to build host_map_pset below -- confirm.
52 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
53 for (
auto& d : dests.get_pset_names())
55 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
// Second pass: build a modified copy (dests_mod) of every destination ParameterSet.
59 fhicl::ParameterSet dests_mod;
60 for (
auto& d : dests.get_pset_names())
62 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
// Replace whatever host_map the destination carried with host_map_pset
// (its declaration is on a line not shown here).
63 dest_pset.erase(
"host_map");
64 dest_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
// Apply the top-level defaults only when they were set and the destination has no own value.
66 if (tcp_send_buffer_size != 0 && !dest_pset.has_key(
"tcp_send_buffer_size"))
68 dest_pset.put<
size_t>(
"tcp_send_buffer_size", tcp_send_buffer_size);
70 if (max_fragment_size_words != 0 && !dest_pset.has_key(
"max_fragment_size_words"))
72 dest_pset.put<
size_t>(
"max_fragment_size_words", max_fragment_size_words);
75 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
// Third pass: register one transfer object per modified destination.
// NOTE(review): the line that creates `transfer` is not visible in this listing.
78 for (
auto& d : dests_mod.get_pset_names())
// Index the transfer by its destination rank and start that rank's metric accumulators
// (value-initialised pair, "last metric send" timestamp = now).
83 auto destination_rank = transfer->destination_rank();
84 destinations_.emplace(destination_rank, std::move(transfer));
85 destination_metric_data_[destination_rank] = std::pair<size_t, double>();
86 destination_metric_send_time_[destination_rank] = std::chrono::steady_clock::now();
// Failures while setting up a destination are logged in the visible handlers.
// NOTE(review): both typed handlers catch by value; catching by const reference would
// avoid copying (and potentially slicing) the exception object.
88 catch (std::invalid_argument)
90 TLOG(TLVL_DEBUG) <<
"Invalid destination specification: " << d;
92 catch (cet::exception ex)
94 TLOG(TLVL_WARNING) <<
"Caught cet::exception: " << ex.what();
98 TLOG(TLVL_WARNING) <<
"Non-cet exception while setting up TransferPlugin: " << d <<
".";
// Having no usable destination at all is reported at ERROR level.
101 if (destinations_.size() == 0)
103 TLOG(TLVL_ERROR) <<
"No destinations specified!";
// "enabled_destinations" optionally restricts which ranks are used; an empty or missing
// list enables every destination that was registered above.
107 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
108 if (enabled_dests.size() == 0)
110 TLOG(TLVL_INFO) <<
"enabled_destinations not specified, assuming all destinations enabled.";
111 for (
auto& d : destinations_)
113 enabled_destinations_.insert(d.first);
// Otherwise the listed ranks are inserted as given (no visible check against destinations_).
118 for (
auto& d : enabled_dests)
120 enabled_destinations_.insert(d);
// The table-receiver thread is only started when a routing master is configured.
124 if (use_routing_master_) startTableReceiverThread_();
// Shutdown sequence. NOTE(review): the function header is not visible in this listing;
// presumably this is the body of ~DataSenderManager() -- confirm.
129 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager BEGIN";
// For every enabled destination that actually has a transfer object, move the fragment
// produced by Fragment::eodFrag() for that destination's slot count to it
// (presumably an end-of-data marker carrying the number of fragments sent -- confirm).
131 for (
auto& dest : enabled_destinations_)
133 if (destinations_.count(dest))
135 auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
// Wait for the routing thread, if one is running, before reporting the total count.
140 if (routing_thread_.joinable()) routing_thread_.join();
141 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments.";
// Opens table_socket_ as a UDP socket, enables address reuse on it, binds it to
// table_address_:table_port_ and joins the multicast group at table_address_.
// Each failing step logs at ERROR level.
// NOTE(review): brace lines and some declarations (e.g. `sts`, `yes`, `mreq`) are not
// visible in this listing, so what follows each error log cannot be confirmed from here.
145 void artdaq::DataSenderManager::setupTableListener_()
148 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
149 if (table_socket_ < 0)
151 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
155 struct sockaddr_in si_me_request;
// SO_REUSEADDR is set before bind().
158 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
160 TLOG(TLVL_ERROR) <<
" Unable to enable port reuse on request socket";
// Build the local bind address: zeroed, IPv4, port in network byte order.
163 memset(&si_me_request, 0,
sizeof(si_me_request));
164 si_me_request.sin_family = AF_INET;
165 si_me_request.sin_port = htons(table_port_);
// The bind address is the dotted-quad in table_address_, parsed with inet_aton().
167 struct in_addr in_addr_s;
168 sts = inet_aton(table_address_.c_str(), &in_addr_s );
171 TLOG(TLVL_ERROR) <<
"inet_aton says table_address " << table_address_ <<
" is invalid";
// NOTE(review): in_addr_s is read here even on the path that logged the inet_aton error
// above; unless a hidden line bails out first, it would be uninitialised -- confirm.
173 si_me_request.sin_addr.s_addr = in_addr_s.s_addr;
174 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
176 TLOG(TLVL_ERROR) <<
"Cannot bind request socket to port " << table_port_;
// Join the multicast group: group address resolved from table_address_, any local interface.
181 sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
184 TLOG(TLVL_ERROR) <<
"Unable to resolve multicast address for table updates";
187 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
188 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
190 TLOG(TLVL_ERROR) <<
"Unable to join multicast group";
// (Re)starts routing_thread_ so that it runs receiveTableUpdatesLoop_() on this object.
// NOTE(review): the try-block braces and anything following the error output in the
// handler are not visible in this listing.
194 void artdaq::DataSenderManager::startTableReceiverThread_()
// A previously started thread is joined first so the member can be reassigned safely.
196 if (routing_thread_.joinable()) routing_thread_.join();
197 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
199 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
// Thread-creation failures are reported both through TLOG and on std::cerr, including errno.
201 catch (
const boost::exception& e)
203 TLOG(TLVL_ERROR) <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno;
204 std::cerr <<
"Caught boost::exception starting Routing Table Receive thread: " << boost::diagnostic_information(e) <<
", errno=" << errno << std::endl;
// Body of the routing thread: polls table_socket_ for routing packets, merges valid entries
// into routing_table_ under routing_mutex_, and logs the acknowledgement it sends back.
// NOTE(review): loop/brace lines, the receive calls that fill `hdr`, `buffer`, `stss`, `sts`
// and `from`, and the acknowledgement send itself are not visible in this listing; comments
// below describe only the visible statements.
208 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
// Logged when should_stop_ is observed (the check itself is on a hidden line).
214 TLOG(TLVL_DEBUG) << __func__ <<
": should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
218 TLOG(TLVL_TRACE) << __func__ <<
": Polling table socket for new routes";
// The listener socket is opened lazily; -1 marks "not open".
219 if (table_socket_ == -1)
221 TLOG(TLVL_DEBUG) << __func__ <<
": Opening table listener socket";
222 setupTableListener_();
// Still -1 after the setup attempt means the listener could not be opened.
224 if (table_socket_ == -1)
226 TLOG(TLVL_DEBUG) << __func__ <<
": The listen socket was not opened successfully.";
// The acknowledgement socket is also created lazily, and ack_addr_ is resolved from
// ack_address_ / ack_port_.
229 if (ack_socket_ == -1)
231 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
232 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
235 TLOG(TLVL_ERROR) << __func__ <<
": Unable to resolve routing_master_address";
238 TLOG(TLVL_DEBUG) << __func__ <<
": Ack socket is fd " << ack_socket_;
// Wait up to 1000 ms for readable (normal or priority) data on the table socket.
242 fd.fd = table_socket_;
243 fd.events = POLLIN | POLLPRI;
245 auto res = poll(&fd, 1, 1000);
// first/last describe the sequence-ID range of the packet being processed.
248 auto first = artdaq::Fragment::InvalidSequenceID;
249 auto last = artdaq::Fragment::InvalidSequenceID;
252 TLOG(TLVL_DEBUG) << __func__ <<
": Going to receive RoutingPacketHeader";
253 struct sockaddr_in from;
254 socklen_t len=
sizeof(from);
// NOTE(review): sin_port is stored in network byte order; it is logged here (and again
// below) without ntohs(), so the printed port may be byte-swapped -- confirm intent.
256 TLOG(TLVL_DEBUG) << __func__ <<
": Received " << stss <<
" hdr bytes. (sizeof(RoutingPacketHeader) == " <<
sizeof(detail::RoutingPacketHeader)
257 <<
" from " << inet_ntoa(from.sin_addr) <<
":" << from.sin_port;
// Dumps nEntries and the first two unsigned-long words of the header for debugging.
259 TRACE(TLVL_DEBUG,
"receiveTableUpdatesLoop_: Checking for valid header with nEntries=%lu headerData:0x%016lx%016lx",hdr.
nEntries,((
unsigned long*)&hdr)[0],((
unsigned long*)&hdr)[1]);
// Datagrams whose header word is not ROUTING_MAGIC are reported as non-routing packets.
260 if (hdr.
header != ROUTING_MAGIC)
262 TLOG(TLVL_TRACE) << __func__ <<
": non-RoutingPacket received. No ROUTING_MAGIC. size(bytes)="<<stss;
// A second rejection path (its condition is on a hidden line) reports only the size.
266 TLOG(TLVL_TRACE) << __func__ <<
": non-RoutingPacket received. size(bytes)="<<stss;
// Once a mode has been stored, a header carrying a different mode is logged as an error.
270 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.
mode)
272 TLOG(TLVL_ERROR) << __func__ <<
": Received table has different RoutingMasterMode than expected!";
275 routing_master_mode_ = hdr.
mode;
278 TLOG(TLVL_DEBUG) << __func__ <<
": Receiving data buffer";
281 TLOG(TLVL_DEBUG) << __func__ <<
": Received " << sts <<
" pkt bytes from " << inet_ntoa(from.sin_addr) <<
":" << from.sin_port;
282 TRACE(6,
"receiveTableUpdatesLoop_: Received a packet of %ld bytes. 1st 16 bytes: 0x%016lx%016lx",sts,((
unsigned long*)&buffer[0])[0],((
unsigned long*)&buffer[0])[1]);
// NOTE(review): buffer[0] / buffer[size()-1] assume a non-empty buffer; confirm that a
// hidden check rejects headers with nEntries == 0 before this point.
284 first = buffer[0].sequence_id;
285 last = buffer[buffer.size() - 1].sequence_id;
// The packet must describe a contiguous range: first + nEntries - 1 == last.
287 if (first + hdr.
nEntries - 1 != last)
289 TLOG(TLVL_ERROR) << __func__ <<
": Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
// thisSeqID is the sequence ID expected for the next entry (its increment is on a hidden line).
292 auto thisSeqID = first;
// routing_table_ is read and modified only while routing_mutex_ is held.
295 std::unique_lock<std::mutex> lck(routing_mutex_);
// The entries are only walked when the packet's last sequence ID is not in the table yet.
296 if (routing_table_.count(last) == 0)
298 for (
auto entry : buffer)
// An out-of-order entry aborts processing; `last` is pulled back to the final entry that
// was still consistent, which is also the value reported in the acknowledgement log below.
300 if (thisSeqID != entry.sequence_id)
302 TLOG(TLVL_ERROR) << __func__ <<
": Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
303 last = thisSeqID - 1;
// A sequence ID that is already routed to a different rank is reported as corruption;
// the log message states that the existing value is kept.
307 if (routing_table_.count(entry.sequence_id))
309 if (routing_table_[entry.sequence_id] != entry.destination_rank)
311 TLOG(TLVL_ERROR) << __func__ <<
": Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
312 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
313 <<
" I will use the original value!";
// Entries below the highest sequence ID already accepted (routing_table_last_) are skipped.
317 if (entry.sequence_id < routing_table_last_)
continue;
318 routing_table_[entry.sequence_id] = entry.destination_rank;
319 TLOG(TLVL_DEBUG) << __func__ <<
": (my_rank=" << my_rank <<
") received update: SeqID " << entry.sequence_id
320 <<
" -> Rank " << entry.destination_rank;
324 TLOG(TLVL_DEBUG) << __func__ <<
": There are now " << routing_table_.size() <<
" entries in the Routing Table";
325 if (routing_table_.size() > 0) TLOG(TLVL_DEBUG) << __func__ <<
": Last routing table entry is seqID=" << routing_table_.rbegin()->first;
// Full table dump at trace level 45 (`counter` is maintained on lines not shown).
328 for (
auto& entry : routing_table_)
330 TLOG(45) <<
"Routing Table Entry" << counter <<
": " << entry.first <<
" -> " << entry.second;
// Remember the highest sequence ID accepted so far.
340 if (last > routing_table_last_) routing_table_last_ = last;
342 TLOG(TLVL_DEBUG) << __func__ <<
": Sending RoutingAckPacket with first= " << first <<
" and last= " << last <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")";
// Returns the current number of entries in routing_table_, read while holding routing_mutex_.
// NOTE(review): the function header is not visible in this listing; presumably this is the
// body of GetRoutingTableEntryCount() -- confirm.
351 std::unique_lock<std::mutex> lck(routing_mutex_);
352 return routing_table_.size();
// Counts, under routing_mutex_, the routing_table_ entries from
// upper_bound(highest_sequence_id_routed_) to the end, i.e. those whose key is greater
// than the highest sequence ID routed so far.
// NOTE(review): the function header and the return statement are not visible in this
// listing; presumably this is GetRemainingRoutingTableEntries() returning `dist` -- confirm.
357 std::unique_lock<std::mutex> lck(routing_mutex_);
359 size_t dist = std::distance(routing_table_.upper_bound(highest_sequence_id_routed_), routing_table_.end());
// Chooses the destination rank for a fragment with the given sequence ID.
// Without a routing master, the only enabled destination is returned directly when there is
// exactly one; otherwise the rank is picked from enabled_destinations_ using
// sequence_id modulo the set size. With a routing master, routing_table_ is consulted,
// waiting for an entry until should_stop_ is set or routing_timeout_ms_ elapses.
// NOTE(review): brace lines, the branch conditions selecting between the two table look-ups,
// the iterator increment and the final return statements are not visible in this listing.
// NOTE(review): the method is declared const but assigns highest_sequence_id_routed_ and
// updates routing_wait_time_; presumably those members are declared mutable -- confirm.
363 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
366 if (!use_routing_master_ && enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
368 if (use_routing_master_)
370 auto start = std::chrono::steady_clock::now();
371 TLOG(15) <<
"calcDest_ use_routing_master check for routing info for seqID="<<sequence_id<<
" routing_timeout_ms="<<routing_timeout_ms_<<
" should_stop_="<<should_stop_;
// A routing_timeout_ms_ <= 0 disables the time limit; only should_stop_ then ends the wait.
372 while (!should_stop_ && (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_)))
// The table is inspected under routing_mutex_ on every iteration.
375 std::unique_lock<std::mutex> lck(routing_mutex_);
// Look-up keyed by the fragment's sequence ID: track the highest ID routed, add the time
// spent waiting to routing_wait_time_, and return the table entry.
378 if (sequence_id > highest_sequence_id_routed_) highest_sequence_id_routed_ = sequence_id;
379 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
380 return routing_table_.at(sequence_id);
// Look-up keyed by send number instead: the key is sent_frag_count_.count() + 1.
384 if (sent_frag_count_.count() + 1 > highest_sequence_id_routed_) highest_sequence_id_routed_ = sent_frag_count_.count() + 1;
385 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
386 return routing_table_.at(sent_frag_count_.count() + 1);
// Sleep between polls: routing_timeout_ms_ * 10 microseconds, i.e. 1/100 of the timeout.
389 usleep(routing_timeout_ms_ * 10);
// The wait time is also recorded when the loop ends without finding an entry.
391 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
394 TLOG(TLVL_WARNING) <<
"Bad Omen: I don't have routing information for seqID " << sequence_id
395 <<
" and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ <<
" ms)!";
399 TLOG(TLVL_WARNING) <<
"Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
400 <<
" and the Routing Master did not send a table update in routing_timeout_ms window (" << routing_timeout_ms_ <<
" ms)!";
// Table-less selection: step `index` positions through enabled_destinations_, wrapping to
// the beginning when the end is reached.
// NOTE(review): the modulo divides by enabled_destinations_.size(); confirm that a hidden
// guard prevents reaching this with an empty set.
405 auto index = sequence_id % enabled_destinations_.size();
406 auto it = enabled_destinations_.begin();
407 for (; index > 0; --index)
410 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
// Sends one fragment and returns the pair (destination rank, send status).
// Three paths are visible: broadcast to every enabled destination, non-blocking send to a
// calculated destination (copyFragment with retries), and blocking send (moveFragment).
// Afterwards the used routing-table entry is erased and per-destination metrics are updated.
// NOTE(review): the function header, brace lines, the retry-loop conditions and the
// declarations of `dest`, `sts` and `outsts` are not visible in this listing; presumably
// this is the body of sendFragment(Fragment&& frag) -- confirm.
421 auto start_time = std::chrono::steady_clock::now();
// End-of-data fragments are rejected with a cet::exception of category "LogicError".
422 if (frag.type() == Fragment::EndOfDataFragmentType)
424 throw cet::exception(
"LogicError")
425 <<
"EOD fragments should not be sent on as received: "
426 <<
"use sendEODFrag() instead.";
// Sequence ID and size are captured up front; the blocking path below moves `frag` away.
428 size_t seqID = frag.sequenceID();
429 size_t fragSize = frag.sizeBytes();
430 TLOG(13) <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << fragSize
431 <<
", seqID=" << seqID <<
", type=" << frag.typeString();
// Broadcast path: taken when broadcast_sends_ is set, and always for EndOfRun,
// EndOfSubrun and Init fragments.
434 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
436 for (
auto& bdest : enabled_destinations_)
438 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << bdest <<
" (broadcast)";
// Each destination gets its own copy; the second copyFragment call is the retry
// (its loop condition is on a hidden line).
440 Fragment fragCopy(frag);
441 auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
445 sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
449 sent_frag_count_.incSlot(bdest);
// Non-blocking path: the destination look-up is retried using a counter that starts at
// routing_retry_count_.
452 else if (non_blocking_mode_)
454 auto count = routing_retry_count_;
457 dest = calcDest_(seqID);
461 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID << (count > 0 ?
", retrying." :
".");
466 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
// lastWarnTime is refreshed each time the retry warning is emitted
// (presumably to rate-limit that warning -- confirm against the hidden condition).
468 auto lastWarnTime = std::chrono::steady_clock::now();
472 sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
475 TLOG(TLVL_WARNING) <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying...";
476 lastWarnTime = std::chrono::steady_clock::now();
481 sent_frag_count_.incSlot(dest);
// An invalid rank is only reported while the manager is not being stopped.
483 else if (!should_stop_)
484 TLOG(TLVL_ERROR) <<
"(in non_blocking) calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID
485 <<
". enabled_destinantions_.size()="<<enabled_destinations_.size();
// Blocking path: `start` times how long the destination look-up has been retried.
489 auto start = std::chrono::steady_clock::now();
492 dest = calcDest_(seqID);
495 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID <<
", send number " << sent_frag_count_.count() <<
", retrying. Waited " << TimeUtils::GetElapsedTime(start) <<
" s for routing information.";
501 TLOG(5) <<
"DataSenderManager::sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
// Ownership of the fragment is handed to the transfer; a failure here is logged as data loss.
504 sts = destinations_[dest]->moveFragment(std::move(frag));
506 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination "
507 << dest <<
" failed! Data has been lost!";
510 sent_frag_count_.incSlot(dest);
513 else if (!should_stop_)
514 TLOG(TLVL_ERROR) <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID
515 <<
". enabled_destinantions_.size()="<<enabled_destinations_.size();
// Routing-table cleanup under routing_mutex_: the entry keyed by seqID is erased (its guard
// condition is on a hidden line), otherwise the entry keyed by the send count if present.
519 std::unique_lock<std::mutex> lck(routing_mutex_);
525 routing_table_.erase(routing_table_.find(seqID));
526 else if(routing_table_.find(sent_frag_count_.count()) != routing_table_.end())
527 routing_table_.erase(routing_table_.find(sent_frag_count_.count()));
// Per-destination accumulators: bytes sent (.first) and time spent in this call (.second).
531 auto delta_t = TimeUtils::GetElapsedTime(start_time);
532 destination_metric_data_[dest].first += fragSize;
533 destination_metric_data_[dest].second += delta_t;
// Metrics for a destination are published only when more than 1 (seconds, per the "s" unit
// used elsewhere with GetElapsedTime) has passed since its last report.
535 if (metricMan && TimeUtils::GetElapsedTime(destination_metric_send_time_[dest]) > 1)
537 TLOG(5) <<
"sendFragment: sending metrics";
538 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), destination_metric_data_[dest].second,
"s", 5, MetricMode::Accumulate);
539 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), destination_metric_data_[dest].first,
"B", 5, MetricMode::Accumulate);
// Rate = accumulated bytes / accumulated seconds for this reporting window.
540 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), destination_metric_data_[dest].first / destination_metric_data_[dest].second,
"B/s", 5, MetricMode::Average);
541 metricMan->sendMetric(
"Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
"fragments", 3, MetricMode::LastPoint);
// Start a new reporting window for this destination.
543 destination_metric_send_time_[dest] = std::chrono::steady_clock::now();
544 destination_metric_data_[dest].first = 0;
545 destination_metric_data_[dest].second = 0.0;
// Routing metrics: table size, and the accumulated wait time converted from microseconds
// to seconds; the wait-time counter is reset once reported.
// NOTE(review): metricMan is dereferenced below; confirm from the hidden braces that this
// block sits inside the `if (metricMan && ...)` above.
547 if (use_routing_master_)
549 metricMan->sendMetric(
"Routing Table Size", GetRoutingTableEntryCount(),
"events", 2, MetricMode::LastPoint);
550 if (routing_wait_time_ > 0)
552 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2, MetricMode::Average);
553 routing_wait_time_ = 0;
557 TLOG(5) <<
"sendFragment: Done sending fragment " << seqID;
558 return std::make_pair(dest, outsts);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
size_t GetRemainingRoutingTableEntries() const
Gets the number of sends remaining in the routing table, in case other parts of the system want to us...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, std::string > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
Value to be returned upon receive timeout.
std::map< int, std::string > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.