1 #define TRACE_NAME "DataSenderManager"
2 #include "artdaq/DAQdata/Globals.hh"
3 #include "artdaq/DAQrate/DataSenderManager.hh"
4 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
5 #include "artdaq/TransferPlugins/detail/HostMap.hh"
8 #include "canvas/Utilities/Exception.h"
10 #include <netinet/in.h>
11 #include <sys/types.h>
13 #include <sys/socket.h>
// DataSenderManager constructor: tail of the member-initializer list followed by the body.
// This listing omits some original lines (including the signature and the braces), so the
// comments below describe only the statements that are visible here.
18 , destination_metric_data_()
19 , destination_metric_send_time_()
20 , enabled_destinations_()
// Send behaviour is read from the ParameterSet; the second argument of each get<> is the default.
22 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
23 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
24 , send_timeout_us_(pset.get<size_t>(
"send_timeout_usec", 5000000))
25 , send_retry_count_(pset.get<size_t>(
"send_retry_count", 2))
// The routing mode starts out INVALID.
26 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
31 TLOG(TLVL_DEBUG) <<
"Received pset: " << pset.to_string();
// A configured timeout of 0 is replaced by the largest size_t value.
34 if (send_timeout_us_ == 0) send_timeout_us_ = std::numeric_limits<size_t>::max();
// Routing settings live in the optional "routing_table_config" sub-ParameterSet.
36 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
37 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
38 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
39 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
40 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
// Note that ack_address_ is filled from the "routing_master_hostname" key.
41 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
42 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
43 routing_retry_count_ = rmConfig.get<
int>(
"routing_retry_count", 5);
// First pass over the "destinations" table: each named destination ParameterSet is fetched.
// (What is done with dest_pset in this pass is not visible in this listing.)
48 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
49 for (
auto& d : dests.get_pset_names())
51 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
// Second pass: every destination's "host_map" entry is replaced with host_map_pset and the
// modified ParameterSet is stored under the same name in dests_mod.
55 fhicl::ParameterSet dests_mod;
56 for (
auto& d : dests.get_pset_names())
58 auto dest_pset = dests.get<fhicl::ParameterSet>(d);
59 dest_pset.erase(
"host_map");
60 dest_pset.put<std::vector<fhicl::ParameterSet>>(
"host_map", host_map_pset);
61 dests_mod.put<fhicl::ParameterSet>(d, dest_pset);
// Third pass: each transfer object is registered under its destination rank and the
// per-destination metric accumulators are initialised.
64 for (
auto& d : dests.get_pset_names())
69 auto destination_rank = transfer->destination_rank();
70 destinations_.emplace(destination_rank, std::move(transfer));
71 destination_metric_data_[destination_rank] = std::pair<size_t, double>();
72 destination_metric_send_time_[destination_rank] = std::chrono::steady_clock::now();
// Failures while setting up a destination are logged by the handlers below.
// NOTE(review): both typed handlers catch by value; catching by const reference would avoid
// copying (and possible slicing of) the exception object.
74 catch (std::invalid_argument)
76 TLOG(TLVL_DEBUG) <<
"Invalid destination specification: " << d;
78 catch (cet::exception ex)
80 TLOG(TLVL_WARNING) <<
"Caught cet::exception: " << ex.what();
84 TLOG(TLVL_WARNING) <<
"Non-cet exception while setting up TransferPlugin: " << d <<
".";
// An empty destination map is reported as an error.
87 if (destinations_.size() == 0)
89 TLOG(TLVL_ERROR) <<
"No destinations specified!";
// "enabled_destinations" is optional: when it is empty every configured destination rank is enabled.
93 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
94 if (enabled_dests.size() == 0)
96 TLOG(TLVL_INFO) <<
"enabled_destinations not specified, assuming all destinations enabled.";
97 for (
auto& d : destinations_)
99 enabled_destinations_.insert(d.first);
// Presumably the else-branch (the branch keyword is not visible here): the listed ranks are enabled.
104 for (
auto& d : enabled_dests)
106 enabled_destinations_.insert(d);
// The table receiver thread is only started when a routing master is in use.
110 if (use_routing_master_) startTableReceiverThread_();
// DataSenderManager destructor body (the signature is not part of this listing).
115 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager BEGIN";
// Each enabled destination that has a transfer object is sent an end-of-data Fragment built
// from the number of fragments sent to that destination (sent_frag_count_.slotCount(dest)).
117 for (
auto& dest : enabled_destinations_)
// NOTE(review): count(dest) followed by destinations_[dest] looks the key up twice.
119 if (destinations_.count(dest))
121 auto sts = destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
// Wait for the routing thread, if one is running, before finishing.
126 if (routing_thread_.joinable()) routing_thread_.join();
127 TLOG(TLVL_DEBUG) <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments.";
// Opens table_socket_ as a UDP socket bound to table_port_ and joins the multicast group
// given by table_address_. Each failing step logs an error; what happens after the log
// (early return, socket cleanup, ...) is not visible in this listing.
131 void artdaq::DataSenderManager::setupTableListener_()
133 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
134 if (table_socket_ < 0)
136 TLOG(TLVL_ERROR) <<
"Error creating socket for receiving table updates!";
140 struct sockaddr_in si_me_request;
// Enable SO_REUSEADDR on the socket before binding.
143 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
145 TLOG(TLVL_ERROR) <<
" Unable to enable port reuse on request socket";
// Bind to table_port_ on any local interface (INADDR_ANY).
148 memset(&si_me_request, 0,
sizeof(si_me_request));
149 si_me_request.sin_family = AF_INET;
150 si_me_request.sin_port = htons(table_port_);
151 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
// NOTE(review): C-style cast to sockaddr*; a named cast would be easier to search for.
152 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
154 TLOG(TLVL_ERROR) <<
"Cannot bind request socket to port " << table_port_;
// Resolve the configured table address into the multicast group address of mreq.
159 int sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
162 TLOG(TLVL_ERROR) <<
"Unable to resolve multicast address for table updates";
// Join the multicast group on any local interface.
165 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
166 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
168 TLOG(TLVL_ERROR) <<
"Unable to join multicast group";
// (Re)starts routing_thread_, which runs receiveTableUpdatesLoop_() on this object.
172 void artdaq::DataSenderManager::startTableReceiverThread_()
// A previously started thread is joined first, so this call blocks until that thread has exited.
174 if (routing_thread_.joinable()) routing_thread_.join();
175 TLOG(TLVL_INFO) <<
"Starting Routing Thread";
176 routing_thread_ = boost::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
// Thread function started by startTableReceiverThread_(): receives routing table updates on
// table_socket_ and stores them in routing_table_. Many original lines (loop header, receive
// calls, locking, the acknowledgement send) are missing from this listing, so the comments
// below cover only the visible statements.
178 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
// Logged when should_stop_ is observed; the message states that the loop is stopping.
184 TLOG(TLVL_DEBUG) <<
"receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ <<
", stopping";
188 TLOG(TLVL_TRACE) <<
"DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests";
// The table listener socket is opened on demand; -1 marks "not open".
189 if (table_socket_ == -1)
191 TLOG(TLVL_DEBUG) <<
"Opening table listener socket";
192 setupTableListener_();
// Still -1 after the setup attempt means the listener could not be opened.
194 if (table_socket_ == -1)
196 TLOG(TLVL_DEBUG) <<
"The listen socket was not opened successfully.";
// The acknowledgement socket (UDP) is also created on demand, and the routing master's
// address and port are resolved into ack_addr_.
199 if (ack_socket_ == -1)
201 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
202 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
205 TLOG(TLVL_ERROR) <<
"Unable to resolve routing_master_address";
208 TLOG(TLVL_DEBUG) <<
"Ack socket is fd " << ack_socket_;
// Wait up to 1000 ms for readable data on the table socket.
212 fd.fd = table_socket_;
213 fd.events = POLLIN | POLLPRI;
215 auto res = poll(&fd, 1, 1000);
// first/last track the sequence ID range of the received packet; both start out invalid.
218 auto first = artdaq::Fragment::InvalidSequenceID;
219 auto last = artdaq::Fragment::InvalidSequenceID;
222 TLOG(TLVL_DEBUG) <<
"Going to receive RoutingPacketHeader";
224 TLOG(TLVL_DEBUG) <<
"Received " << stss <<
" bytes. (sizeof(RoutingPacketHeader) == " <<
sizeof(detail::RoutingPacketHeader);
// Only packets whose header carries ROUTING_MAGIC are processed.
226 TLOG(TLVL_DEBUG) <<
"Checking for valid header";
227 if (hdr.
header == ROUTING_MAGIC)
// Once a routing mode has been recorded, a packet with a different mode is reported as an error.
229 if (routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.
mode)
231 TLOG(TLVL_ERROR) <<
"Received table has different RoutingMasterMode than expected!";
// Record the mode announced by the header.
234 routing_master_mode_ = hdr.
mode;
237 TLOG(TLVL_DEBUG) <<
"Receiving data buffer";
240 TLOG(6) <<
"Received a packet of " << sts <<
" bytes";
// The packet is expected to cover a contiguous range: first + nEntries - 1 must equal last.
242 first = buffer[0].sequence_id;
243 last = buffer[buffer.size() - 1].sequence_id;
245 if (first + hdr.
nEntries - 1 != last)
247 TLOG(TLVL_ERROR) <<
"Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!";
// thisSeqID is the sequence ID the next entry is expected to carry.
250 auto thisSeqID = first;
// Entries are only examined when the packet's last sequence ID is not yet in routing_table_.
252 if (routing_table_.count(last) == 0)
// NOTE(review): 'auto entry' copies each element; 'const auto&' would avoid the copy.
254 for (
auto entry : buffer)
// An entry with an unexpected sequence ID is reported, and 'last' is pulled back to the
// last consistent sequence ID.
256 if (thisSeqID != entry.sequence_id)
258 TLOG(TLVL_ERROR) <<
"Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!";
259 last = thisSeqID - 1;
// A sequence ID that is already routed to a different rank is reported as table corruption.
// NOTE(review): the log text below contains the typo "Recevied"; it is runtime text and is left as is.
263 if (routing_table_.count(entry.sequence_id))
265 if (routing_table_[entry.sequence_id] != entry.destination_rank)
267 TLOG(TLVL_ERROR) <<
"Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
268 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
269 <<
" I will use the original value!";
// Store the sequence ID -> destination rank mapping.
273 routing_table_[entry.sequence_id] = entry.destination_rank;
274 TLOG(TLVL_DEBUG) <<
"DataSenderManager " << my_rank <<
": received update: SeqID " << entry.sequence_id <<
" -> Rank " << entry.destination_rank;
// Log the acknowledgement parameters and the resulting table size (the send call itself is
// not visible in this listing).
283 TLOG(TLVL_DEBUG) <<
"Sending RoutingAckPacket with first= " << first <<
" and last= " << last <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")";
284 TLOG(TLVL_DEBUG) <<
"There are now " << routing_table_.size() <<
" entries in the Routing Table";
// Body of GetRoutingTableEntryCount() (the signature is not part of this listing): returns
// the number of entries in routing_table_, read while holding routing_mutex_.
293 std::unique_lock<std::mutex> lck(routing_mutex_);
294 return routing_table_.size();
// Chooses the destination rank for a fragment with the given sequence ID. Several original
// lines (branch conditions, the failure return, the iterator increment) are missing from this
// listing; the comments below describe the visible statements only.
297 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
// With exactly one enabled destination there is nothing to choose.
300 if (enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
302 if (use_routing_master_)
304 auto start = std::chrono::steady_clock::now();
// Keep looking until routing_timeout_ms_ has elapsed; a timeout <= 0 never expires.
305 while (routing_timeout_ms_ <= 0 || TimeUtils::GetElapsedTimeMilliseconds(start) < static_cast<size_t>(routing_timeout_ms_))
// routing_table_ is read under routing_mutex_.
307 std::unique_lock<std::mutex> lck(routing_mutex_);
// Two lookups are visible: by sequence ID and by the number of fragments sent so far.
// Presumably the choice depends on routing_master_mode_ - TODO confirm, the conditions are
// not visible here. The elapsed time is added to routing_wait_time_ before returning.
310 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
311 return routing_table_.at(sequence_id);
315 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
316 return routing_table_.at(sent_frag_count_.count());
// Sleep between attempts; usleep takes microseconds, so this is 1/100 of the timeout.
318 usleep(routing_timeout_ms_ * 10);
// Reached when the wait loop ends without returning: record the wait and report the failure.
320 routing_wait_time_.fetch_add(TimeUtils::GetElapsedTimeMicroseconds(start));
323 TLOG(TLVL_ERROR) <<
"Bad Omen: I don't have routing information for seqID " << sequence_id
324 <<
" and the Routing Master did not send a table update in routing_timeout (" << routing_timeout_ms_ <<
")!";
328 TLOG(TLVL_ERROR) <<
"Bad Omen: I don't have routing information for send number " << sent_frag_count_.count()
329 <<
" and the Routing Master did not send a table update in routing_timeout (" << routing_timeout_ms_ <<
")!";
// Selection by position: take sequence_id modulo the number of enabled destinations and walk
// that many steps through enabled_destinations_, wrapping to the beginning at the end.
334 auto index = sequence_id % enabled_destinations_.size();
335 auto it = enabled_destinations_.begin();
336 for (; index > 0; --index)
339 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
// Body of sendFragment() (the signature is not part of this listing): sends one Fragment via
// the broadcast, non-blocking or moving path, updates per-destination metrics and returns the
// pair (dest, outsts). Declarations, loop headers and several branches are missing from this
// listing, so the comments below describe the visible statements only.
350 auto start_time = std::chrono::steady_clock::now();
// End-of-data fragments are rejected here with a cet::exception in category "LogicError".
351 if (frag.type() == Fragment::EndOfDataFragmentType)
353 throw cet::exception(
"LogicError")
354 <<
"EOD fragments should not be sent on as received: "
355 <<
"use sendEODFrag() instead.";
// Sequence ID and size are captured up front; frag is moved from in one of the paths below.
357 size_t seqID = frag.sequenceID();
358 size_t fragSize = frag.sizeBytes();
359 TLOG(13) <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << fragSize
360 <<
", seqID=" << seqID <<
", type=" << frag.typeString();
// Broadcast path: taken when broadcast_sends_ is set, or for EndOfRun, EndOfSubrun and Init
// fragments. A copy of the fragment is sent to every enabled destination.
363 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
365 for (
auto& bdest : enabled_destinations_)
367 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << bdest <<
" (broadcast)";
369 Fragment fragCopy(frag);
370 auto sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
// Presumably a retry of the same copy - TODO confirm, the surrounding loop/condition is not visible.
374 sts = destinations_[bdest]->copyFragment(fragCopy, send_timeout_us_);
378 sent_frag_count_.incSlot(bdest);
// Non-blocking path: the destination comes from calcDest_ and the fragment is copied with a timeout.
381 else if (non_blocking_mode_)
// 'count' starts from routing_retry_count_; the warning below reports whether a retry follows.
383 auto count = routing_retry_count_;
386 dest = calcDest_(seqID);
390 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID " << seqID << (count > 0 ?
", retrying." :
".");
395 TLOG(TLVL_TRACE) <<
"sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
397 auto lastWarnTime = std::chrono::steady_clock::now();
401 sts = destinations_[dest]->copyFragment(frag, send_timeout_us_);
404 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying...";
405 lastWarnTime = std::chrono::steady_clock::now();
// Count the fragment against the destination's slot.
410 sent_frag_count_.incSlot(dest);
414 TLOG(TLVL_WARNING) <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID;
// Remaining path (presumably the else-branch): the fragment is moved into the transfer
// instead of copied.
419 auto count = routing_retry_count_;
422 dest = calcDest_(seqID);
426 TLOG(TLVL_WARNING) <<
"Could not get destination for seqID "
427 << seqID <<
", send number " << sent_frag_count_.count()
428 << (count > 0 ?
", retrying." :
".");
433 TLOG(5) <<
"DataSenderManager::sendFragment: Sending fragment with seqId " << seqID <<
" to destination " << dest;
436 sts = destinations_[dest]->moveFragment(std::move(frag));
// According to the message below, a failed move means the data is lost.
438 TLOG(TLVL_ERROR) <<
"sendFragment: Sending fragment " << seqID <<
" to destination "
439 << dest <<
" failed! Data has been lost!";
442 sent_frag_count_.incSlot(dest);
446 TLOG(TLVL_WARNING) <<
"calcDest returned invalid destination rank " << dest
447 <<
"! This event has been lost: " << seqID;
// Routing table pruning: entries before seqID - 1, or before the current send count, are erased.
// NOTE(review): the find() in the condition below appears to run before routing_mutex_ is
// taken on the next visible line - confirm against the full source.
450 && routing_table_.find(seqID - 1) != routing_table_.end())
452 std::unique_lock<std::mutex> lck(routing_mutex_);
453 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
457 std::unique_lock<std::mutex> lck(routing_mutex_);
458 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
// Accumulate bytes sent and time spent for this destination.
462 auto delta_t = TimeUtils::GetElapsedTime(start_time);
463 destination_metric_data_[dest].first += fragSize;
464 destination_metric_data_[dest].second += delta_t;
// Metrics are only sent when a metric manager exists and the elapsed time since the last
// report for this destination exceeds 1 (presumably seconds - the time metric below uses "s").
466 if (metricMan && TimeUtils::GetElapsedTime(destination_metric_send_time_[dest]) > 1)
468 TLOG(5) <<
"sendFragment: sending metrics";
469 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), destination_metric_data_[dest].second,
"s", 5, MetricMode::Accumulate);
470 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), destination_metric_data_[dest].first,
"B", 5, MetricMode::Accumulate);
471 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), destination_metric_data_[dest].first / destination_metric_data_[dest].second,
"B/s", 5, MetricMode::Average);
472 metricMan->sendMetric(
"Data Send Count to Rank " + std::to_string(dest), sent_frag_count_.slotCount(dest),
"fragments", 3, MetricMode::LastPoint);
// Restart the reporting interval and reset the accumulators for this destination.
474 destination_metric_send_time_[dest] = std::chrono::steady_clock::now();
475 destination_metric_data_[dest].first = 0;
476 destination_metric_data_[dest].second = 0.0;
// Routing metrics are reported only when a routing master is in use.
// NOTE(review): routing_table_.size() is read here with no lock visible, whereas
// GetRoutingTableEntryCount() takes routing_mutex_ for the same read - confirm.
478 if (use_routing_master_)
480 metricMan->sendMetric(
"Routing Table Size", routing_table_.size(),
"events", 2, MetricMode::LastPoint);
// routing_wait_time_ is accumulated in microseconds; dividing by 1000000 reports seconds.
481 if (routing_wait_time_ > 0)
482 metricMan->sendMetric(
"Routing Wait Time", static_cast<double>(routing_wait_time_.load()) / 1000000,
"s", 2, MetricMode::Average);
485 TLOG(5) <<
"sendFragment: Done sending fragment " << seqID;
486 return std::make_pair(dest, outsts);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
Events should be routed by sequence ID (BR -> EB)
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
std::pair< int, TransferInterface::CopyStatus > sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
std::vector< fhicl::ParameterSet > MakeHostMapPset(std::map< int, DestinationInfo > input)
Create a list of HostMap::HostConfig ParameterSets from a hostMap_t map
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
std::map< int, DestinationInfo > hostMap_t
The host_map is a map associating ranks with artdaq::DestinationInfo objects.
The send operation completed successfully.
hostMap_t MakeHostMap(fhicl::ParameterSet pset, int masterPortOffset=0, hostMap_t map=hostMap_t())
Make a hostMap_t from a HostMap::Config ParameterSet
Value to be returned upon receive timeout.
Events should be routed by send count (EB -> Agg)
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.