1 #include "artdaq/DAQrate/DataSenderManager.hh"
2 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
3 #include "artdaq/DAQdata/Globals.hh"
6 #include <canvas/Utilities/Exception.h>
8 #include <netinet/in.h>
11 #include <sys/socket.h>
12 #include "artdaq/Application/Routing/RoutingPacket.hh"
17 , enabled_destinations_()
19 , broadcast_sends_(pset.get<bool>(
"broadcast_sends", false))
20 , non_blocking_mode_(pset.get<bool>(
"nonblocking_sends", false))
21 , routing_master_mode_(detail::RoutingMasterMode::INVALID)
26 TLOG_DEBUG(
"DataSenderManager") <<
"Received pset: " << pset.to_string() << TLOG_ENDL;
27 auto rmConfig = pset.get<fhicl::ParameterSet>(
"routing_table_config", fhicl::ParameterSet());
28 use_routing_master_ = rmConfig.get<
bool>(
"use_routing_master",
false);
29 table_port_ = rmConfig.get<
int>(
"table_update_port", 35556);
30 table_address_ = rmConfig.get<std::string>(
"table_update_address",
"227.128.12.28");
31 ack_port_ = rmConfig.get<
int>(
"table_acknowledge_port", 35557);
32 ack_address_ = rmConfig.get<std::string>(
"routing_master_hostname",
"localhost");
33 routing_timeout_ms_ = (rmConfig.get<
int>(
"routing_timeout_ms", 1000));
36 auto dests = pset.get<fhicl::ParameterSet>(
"destinations", fhicl::ParameterSet());
37 for (
auto& d : dests.get_pset_names())
41 auto dd = dests.get<fhicl::ParameterSet>(d).get<int>(
"destination_rank");
44 catch (std::invalid_argument)
46 TRACE(3,
"Invalid destination specification: " + d);
48 catch (cet::exception ex)
50 TLOG_WARNING(
"DataSenderManager") <<
"Caught cet::exception: " << ex.what() << TLOG_ENDL;
54 TLOG_WARNING(
"DataSenderManager") <<
"Non-cet exception while setting up TransferPlugin: " << d <<
"." << TLOG_ENDL;
57 if (destinations_.size() == 0)
59 TLOG_ERROR(
"DataSenderManager") <<
"No destinations specified!" << TLOG_ENDL;
63 auto enabled_dests = pset.get<std::vector<size_t>>(
"enabled_destinations", std::vector<size_t>());
64 if (enabled_dests.size() == 0)
66 TLOG_INFO(
"DataSenderManager") <<
"enabled_destinations not specified, assuming all destinations enabled." << TLOG_ENDL;
67 for (
auto& d : destinations_)
69 enabled_destinations_.insert(d.first);
74 for (
auto& d : enabled_dests)
76 enabled_destinations_.insert(d);
80 if (use_routing_master_) startTableReceiverThread_();
85 TLOG_DEBUG(
"DataSenderManager") <<
"Shutting down DataSenderManager BEGIN" << TLOG_ENDL;
87 for (
auto& dest : enabled_destinations_)
89 if (destinations_.count(dest))
91 destinations_[dest]->moveFragment(std::move(*Fragment::eodFrag(sent_frag_count_.slotCount(dest))));
95 if (routing_thread_.joinable()) routing_thread_.join();
96 TLOG_DEBUG(
"DataSenderManager") <<
"Shutting down DataSenderManager END. Sent " << count() <<
" fragments." << TLOG_ENDL;
100 void artdaq::DataSenderManager::setupTableListener_()
102 table_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
105 TLOG_ERROR(
"DataSenderManager") <<
"Error creating socket for receiving table updates!" << TLOG_ENDL;
109 struct sockaddr_in si_me_request;
112 if (setsockopt(table_socket_, SOL_SOCKET, SO_REUSEADDR, &yes,
sizeof(yes)) < 0)
114 TLOG_ERROR(
"DataSenderManager") <<
" Unable to enable port reuse on request socket" << TLOG_ENDL;
117 memset(&si_me_request, 0,
sizeof(si_me_request));
118 si_me_request.sin_family = AF_INET;
119 si_me_request.sin_port = htons(table_port_);
120 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
121 if (bind(table_socket_, (
struct sockaddr *)&si_me_request,
sizeof(si_me_request)) == -1)
123 TLOG_ERROR(
"DataSenderManager") <<
"Cannot bind request socket to port " << table_port_ << TLOG_ENDL;
128 int sts =
ResolveHost(table_address_.c_str(), mreq.imr_multiaddr);
131 TLOG_ERROR(
"DataSenderManager") <<
"Unable to resolve multicast address for table updates" << TLOG_ENDL;
134 mreq.imr_interface.s_addr = htonl(INADDR_ANY);
135 if (setsockopt(table_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq,
sizeof(mreq)) < 0)
137 TLOG_ERROR(
"DataSenderManager") <<
"Unable to join multicast group" << TLOG_ENDL;
141 void artdaq::DataSenderManager::startTableReceiverThread_()
143 if (routing_thread_.joinable()) routing_thread_.join();
144 TLOG_INFO(
"DataSenderManager") <<
"Starting Routing Thread" << TLOG_ENDL;
145 routing_thread_ = std::thread(&DataSenderManager::receiveTableUpdatesLoop_,
this);
147 void artdaq::DataSenderManager::receiveTableUpdatesLoop_()
153 TLOG_DEBUG(
"DataSenderManager") <<
"receiveTableUpdatesLoop: should_stop is " << std::boolalpha << should_stop_ <<
", stopping" << TLOG_ENDL;
157 TRACE(4,
"DataSenderManager::receiveTableUpdatesLoop: Polling Request socket for new requests");
158 if (table_socket_ == -1)
160 TLOG_DEBUG(
"DataSenderManager") <<
"Opening table listener socket" << TLOG_ENDL;
161 setupTableListener_();
163 if (table_socket_ == -1)
165 TLOG_DEBUG(
"DataSenderManager") <<
"The listen socket was not opened successfully." << TLOG_ENDL;
168 if (ack_socket_ == -1)
170 ack_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
171 auto sts =
ResolveHost(ack_address_.c_str(), ack_port_, ack_addr_);
174 TLOG_ERROR(
"DataSenderManager") <<
"Unable to resolve routing_master_address" << TLOG_ENDL;
177 TLOG_DEBUG(
"DataSenderManager") <<
"Ack socket is fd " << ack_socket_ << TLOG_ENDL;
181 fd.fd = table_socket_;
182 fd.events = POLLIN | POLLPRI;
184 auto res = poll(&fd, 1, 1000);
186 auto first = artdaq::Fragment::InvalidSequenceID;
187 auto last = artdaq::Fragment::InvalidSequenceID;
190 TLOG_DEBUG(
"DataSenderManager") <<
"Going to receive RoutingPacketHeader" << TLOG_ENDL;
192 TLOG_DEBUG(
"DataSenderManager") <<
"Received " << std::to_string(stss) <<
" bytes. (sizeof(RoutingPacketHeader) == " << std::to_string(
sizeof(detail::RoutingPacketHeader)) << TLOG_ENDL;
194 TLOG_DEBUG(
"DataSenderManager") <<
"Checking for valid header" << TLOG_ENDL;
195 if (hdr.
header == ROUTING_MAGIC) {
196 if(routing_master_mode_ != detail::RoutingMasterMode::INVALID && routing_master_mode_ != hdr.mode)
198 TLOG_ERROR(
"DataSenderManager") <<
"Received table has different RoutingMasterMode than expected!" << TLOG_ENDL;
201 routing_master_mode_ = hdr.mode;
204 TLOG_DEBUG(
"DataSenderManager") <<
"Receiving data buffer" << TLOG_ENDL;
207 TRACE(6,
"Received a packet of %zu bytes", sts);
209 first = buffer[0].sequence_id;
210 last = buffer[buffer.size() - 1].sequence_id;
212 if (first + hdr.
nEntries - 1 != last)
214 TLOG_ERROR(
"DataSenderManager") <<
"Skipping this RoutingPacket because the first (" << first <<
") and last (" << last <<
") entries are inconsistent (sz=" << hdr.
nEntries <<
")!" << TLOG_ENDL;
217 auto thisSeqID = first;
219 if (routing_table_.count(last) == 0) {
220 for (
auto entry : buffer)
222 if (thisSeqID != entry.sequence_id)
224 TLOG_ERROR(
"DataSenderManager") <<
"Aborting processing of this RoutingPacket because I encountered an inconsistent entry (seqid=" << entry.sequence_id <<
", expected=" << thisSeqID <<
")!" << TLOG_ENDL;
225 last = thisSeqID - 1;
229 if (routing_table_.count(entry.sequence_id))
231 if (routing_table_[entry.sequence_id] != entry.destination_rank)
233 TLOG_ERROR(
"DataSenderManager") <<
"Detected routing table corruption! Recevied update specifying that sequence ID " << entry.sequence_id
234 <<
" should go to rank " << entry.destination_rank <<
", but I had already been told to send it to " << routing_table_[entry.sequence_id] <<
"!"
235 <<
" I will use the original value!" << TLOG_ENDL;
239 routing_table_[entry.sequence_id] = entry.destination_rank;
240 TLOG_DEBUG(
"DataSenderManager") <<
"DataSenderManager " << std::to_string(my_rank) <<
": received update: SeqID " << std::to_string(entry.sequence_id) <<
" -> Rank " << std::to_string(entry.destination_rank) << TLOG_ENDL;
249 TLOG_DEBUG(
"DataSenderManager") <<
"Sending RoutingAckPacket with first= " << std::to_string(first) <<
" and last= " << std::to_string(last) <<
" to " << ack_address_ <<
", port " << ack_port_ <<
" (my_rank = " << my_rank <<
")"<< TLOG_ENDL;
250 TLOG_DEBUG(
"DataSenderManager") <<
"There are now " << routing_table_.size() <<
" entries in the Routing Table" << TLOG_ENDL;
259 std::unique_lock<std::mutex> lck(routing_mutex_);
260 return routing_table_.size();
263 int artdaq::DataSenderManager::calcDest_(Fragment::sequence_id_t sequence_id)
const
266 if (enabled_destinations_.size() == 1)
return *enabled_destinations_.begin();
268 if (use_routing_master_)
270 auto start = std::chrono::steady_clock::now();
271 while (routing_timeout_ms_ <= 0 || std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count() < routing_timeout_ms_)
273 std::unique_lock<std::mutex> lck(routing_mutex_);
274 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.count(sequence_id)) {
275 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
276 return routing_table_.at(sequence_id);
278 else if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount && routing_table_.count(sent_frag_count_.count()))
280 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
281 return routing_table_.at(sent_frag_count_.count());
283 usleep(routing_timeout_ms_ * 10);
285 routing_wait_time_.fetch_add(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count());
286 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID) {
287 TLOG_ERROR(
"DataSenderManager") <<
"Bad Omen: I don't have routing information for seqID " << std::to_string(sequence_id)
288 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!" << TLOG_ENDL;
292 TLOG_ERROR(
"DataSenderManager") <<
"Bad Omen: I don't have routing information for send number " << std::to_string(sent_frag_count_.count())
293 <<
" and the Routing Master did not send a table update in routing_timeout (" << std::to_string(routing_timeout_ms_) <<
")!" << TLOG_ENDL;
297 auto index = sequence_id % enabled_destinations_.size();
298 auto it = enabled_destinations_.begin();
299 for (; index > 0; --index)
302 if (it == enabled_destinations_.end()) it = enabled_destinations_.begin();
315 auto start_time = std::chrono::steady_clock::now();
316 if (frag.type() == Fragment::EndOfDataFragmentType)
318 throw cet::exception(
"LogicError")
319 <<
"EOD fragments should not be sent on as received: "
320 <<
"use sendEODFrag() instead.";
322 size_t seqID = frag.sequenceID();
323 size_t fragSize = frag.sizeBytes();
324 TLOG_ARB(13,
"DataSenderManager") <<
"sendFragment start frag.fragmentHeader()=" << std::hex << (
void*)(frag.headerBeginBytes()) <<
", szB=" << std::dec << std::to_string(fragSize) <<
", seqID=" << std::to_string(seqID) << TLOG_ENDL;
326 if (broadcast_sends_ || frag.type() == Fragment::EndOfRunFragmentType || frag.type() == Fragment::EndOfSubrunFragmentType || frag.type() == Fragment::InitFragmentType)
328 for (
auto& bdest : enabled_destinations_)
330 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d (broadcast)", seqID, bdest);
332 Fragment fragCopy(frag);
333 auto sts = destinations_[bdest]->copyFragment(fragCopy);
336 sts = destinations_[bdest]->copyFragment(fragCopy);
338 sent_frag_count_.incSlot(bdest);
341 else if (non_blocking_mode_)
344 dest = calcDest_(seqID);
347 TLOG_WARNING(
"DataSenderManager") <<
"Could not get destination for seqID " << std::to_string(seqID) <<
", retrying." << TLOG_ENDL;
350 if (destinations_.count(dest) && enabled_destinations_.count(dest))
352 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
354 auto lastWarnTime = std::chrono::steady_clock::now();
357 sts = destinations_[dest]->copyFragment(frag);
360 TLOG_ERROR(
"DataSenderManager") <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying..." << TLOG_ENDL;
361 lastWarnTime = std::chrono::steady_clock::now();
365 sent_frag_count_.incSlot(dest);
369 TLOG_WARNING(
"DataSenderManager") <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID << TLOG_ENDL;
375 dest = calcDest_(seqID);
378 TLOG_WARNING(
"DataSenderManager") <<
"Could not get destination for seqID " << std::to_string(seqID) <<
", send number " << sent_frag_count_.count() <<
", retrying." << TLOG_ENDL;
381 if (destinations_.count(dest) && enabled_destinations_.count(dest))
383 TRACE(5,
"DataSenderManager::sendFragment: Sending fragment with seqId %zu to destination %d", seqID, dest);
385 auto lastWarnTime = std::chrono::steady_clock::now();
388 sts = destinations_[dest]->moveFragment(std::move(frag));
391 TLOG_ERROR(
"DataSenderManager") <<
"sendFragment: Sending fragment " << seqID <<
" to destination " << dest <<
" failed! Retrying..." << TLOG_ENDL;
392 lastWarnTime = std::chrono::steady_clock::now();
396 sent_frag_count_.incSlot(dest);
400 TLOG_WARNING(
"DataSenderManager") <<
"calcDest returned invalid destination rank " << dest <<
"! This event has been lost: " << seqID << TLOG_ENDL;
403 if (routing_master_mode_ == detail::RoutingMasterMode::RouteBySequenceID && routing_table_.find(seqID - 1) != routing_table_.end())
405 std::unique_lock<std::mutex> lck(routing_mutex_);
406 routing_table_.erase(routing_table_.begin(), routing_table_.find(seqID - 1));
408 else if(routing_master_mode_ == detail::RoutingMasterMode::RouteBySendCount)
410 std::unique_lock<std::mutex> lck(routing_mutex_);
411 routing_table_.erase(routing_table_.begin(), routing_table_.find(sent_frag_count_.count()));
415 auto delta_t = std::chrono::duration_cast<std::chrono::duration<double, std::ratio<1>>>(std::chrono::steady_clock::now() - start_time).count();
416 metricMan->sendMetric(
"Data Send Time to Rank " + std::to_string(dest), delta_t,
"s", 1);
417 metricMan->sendMetric(
"Data Send Size to Rank " + std::to_string(dest), fragSize,
"B", 1);
418 metricMan->sendMetric(
"Data Send Rate to Rank " + std::to_string(dest), fragSize / delta_t,
"B/s", 1);
419 if (use_routing_master_) {
420 metricMan->sendMetric(
"Routing Table Size", routing_table_.size(),
"events", 1);
421 if (routing_wait_time_ > 0)
423 size_t wttemp = routing_wait_time_;
424 routing_wait_time_ = 0;
425 metricMan->sendMetric(
"Routing Wait Time", wttemp / 1000000000,
"s", 1);
429 TRACE(5,
"DataSenderManager::sendFragment: Done sending fragment %zu", seqID);
int ResolveHost(char const *host_in, in_addr &addr)
Convert a string hostname to an in_addr suitable for socket communication.
A row of the Routing Table.
The send operation timed out.
virtual ~DataSenderManager()
DataSenderManager Destructor.
A RoutingAckPacket contains the rank of the table receiver, plus the first and last sequence IDs in t...
static const int RECV_TIMEOUT
Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
std::unique_ptr< artdaq::TransferInterface > MakeTransferPlugin(const fhicl::ParameterSet &pset, std::string plugin_label, TransferInterface::Role role)
Load a TransferInterface plugin.
int sendFragment(Fragment &&frag)
Send the given Fragment. Return the rank of the destination to which the Fragment was sent...
DataSenderManager(const fhicl::ParameterSet &ps)
DataSenderManager Constructor.
Fragment::sequence_id_t first_sequence_id
The first sequence ID in the received RoutingPacket.
size_t GetRoutingTableEntryCount() const
Gets the current size of the Routing Table, in case other parts of the system want to use this inform...
This TransferInterface is a Sender.
Fragment::sequence_id_t last_sequence_id
The last sequence ID in the received RoutingPacket.
std::vector< RoutingPacketEntry > RoutingPacket
A RoutingPacket is simply a vector of RoutingPacketEntry objects. It is not suitable for network tran...
int rank
The rank from which the RoutingAckPacket came.
Some error occurred, but no exception was thrown.
The send operation completed successfully.
CopyStatus
Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.