#define TRACE_NAME "MPITransfer"

#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"
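
// MPITransfer is a TransferInterface implementation plugin that transfers data
// using MPI. Each Fragment travels as two MPI messages between the same rank
// pair: the fixed-size RawFragmentHeader first (tag MPI_TAG_HEADER), then the
// payload words (tag MPI_TAG_DATA). sendFragment() posts both sends; the
// receiver matches them with receiveFragmentHeader() and receiveFragmentData().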
#define MPI_TAG_HEADER 0x8E  // 142
#define MPI_TAG_DATA 0xDA    // 218

std::mutex artdaq::MPITransfer::mpi_mutex_;
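
// MPITransfer Constructor: allocates one payload buffer and one pair of MPI
// requests (header + data) per configured buffer.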
artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role), reqs_(2 * buffer_count_, MPI_REQUEST_NULL), payload_(buffer_count_), pos_() {
    TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
                     << "source rank " << source_rank() << ", "
                     << "destination rank " << destination_rank() << ", " << buffer_count_ << " buffers.";

    if (buffer_count_ == 0) {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ") << "No buffers configured.";
    }
}
artdaq::MPITransfer::~MPITransfer() {
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
    flush_buffers();
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
}
void artdaq::MPITransfer::flush_buffers() {
    TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Collecting requests that need to be waited on";
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii) {
        if (reqs_[ii] != MPI_REQUEST_NULL) {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0) {
        TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Waiting on " << reqs.size() << " reqs.";
        MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
    }
}
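
// Copy a Fragment to the destination; forces an asynchronous send. The copy
// lets the caller keep its Fragment while the send completes in the background.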
artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::transfer_fragment_min_blocking_mode(
    Fragment const& frag, size_t send_timeout_usec) {
    return sendFragment(Fragment(frag), send_timeout_usec);
}
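
// Move a Fragment to the destination. A send timeout of 0 means wait
// indefinitely for a send slot to become available.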
artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::transfer_fragment_reliable_mode(Fragment&& frag) {
    return sendFragment(std::move(frag), 0);
}
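
// Common send path: claim a free request-pair slot (polling until one frees up
// or the timeout expires), move the Fragment into the matching payload buffer,
// then post two synchronous-mode non-blocking sends (MPI_Issend): header, then data.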
artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::sendFragment(Fragment&& frag, size_t send_timeout_usec) {
    if (frag.dataSize() > max_fragment_size_words_) {
        TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize()
                           << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
                           << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_
                           << " words. Multiple over-size Fragments will exhaust the buffer!";
    }

    auto start_time = std::chrono::steady_clock::now();

    TLOG(5) << GetTraceName() << ": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
    auto req_idx = findAvailable();
    size_t counter = 0;
    while (req_idx == RECV_TIMEOUT &&
           (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec)) {
        usleep(1000);
        req_idx = findAvailable();
        ++counter;
        if (counter % 1000 == 0) {
            TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to "
                            << destination_rank() << ". "
                            << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT) {
        TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
        return CopyStatus::kTimeout;
    }

    TLOG(5) << GetTraceName() << ": sendFragment send slot is " << req_idx;
    auto buffer_idx = req_idx / 2;
    TLOG(5) << GetTraceName() << ": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG(5) << GetTraceName() << ": sendFragment before send src=" << source_rank() << " dest=" << destination_rank()
            << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;

    std::unique_lock<std::mutex> lk(mpi_mutex_);

    TLOG(5) << GetTraceName() << ": sendFragment: Using MPI_Issend";
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE,
               destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);

    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
               &reqs_[req_idx + 1]);
    TLOG(5) << GetTraceName() << ": sendFragment COMPLETE";

    TLOG(11) << GetTraceName() << ": sendFragment COMPLETE:"
             << " buffer_idx=" << buffer_idx << " send_size=" << curfrag.size() << " src=" << source_rank()
             << " dest=" << destination_rank() << " sequenceID=" << curfrag.sequenceID()
             << " fragID=" << curfrag.fragmentID();
    return CopyStatus::kSuccess;
}
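
// Receive a Fragment Header from the transport mechanism. Posts a non-blocking
// receive for the fixed-size header and polls it with MPI_Test; the timeout
// argument is currently ignored. Returns the MPI rank the header came from.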
int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec) {
    TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
    MPI_Status status;
    int wait_result = MPI_SUCCESS;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER,
                  MPI_COMM_WORLD, &req);
    }

    int flag = 0;
    do {
        {
            std::unique_lock<std::mutex> lk(mpi_mutex_);
            wait_result = MPI_Test(&req, &flag, &status);
        }
        if (!flag) {
            usleep(1000);  // Sleep outside the lock so senders can still post requests.
        }
    } while (!flag);

    if (req != MPI_REQUEST_NULL) {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ")
            << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";

    TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank << " Wait_error=" << wait_result
            << " status_error=" << status.MPI_ERROR << " source=" << status.MPI_SOURCE << " tag=" << status.MPI_TAG
            << " Fragment_sequenceID=" << (uint64_t)header.sequence_id << " Fragment_size=" << header.word_count
            << " fragID=" << header.fragment_id;

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen = 0;
    switch (wait_result) {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": MPI_Test ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": MPI_Test ERROR: " << err_buffer;
            break;
    }

    return status.MPI_SOURCE;
}
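
// Receive the body of a Fragment to the given destination pointer, which must
// have room for wordCount RawDataType words. Returns the MPI rank of the sender.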
int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount) {
    TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
    int wait_result = MPI_SUCCESS;
    MPI_Status status;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
                  &req);
    }

    int flag = 0;
    do {
        {
            std::unique_lock<std::mutex> lk(mpi_mutex_);
            wait_result = MPI_Test(&req, &flag, &status);
        }
        if (!flag) {
            usleep(1000);  // Sleep outside the lock so senders can still post requests.
        }
    } while (!flag);

    if (req != MPI_REQUEST_NULL) {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ")
            << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }
    TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen = 0;
    switch (wait_result) {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": MPI_Test ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": MPI_Test ERROR: " << err_buffer;
            break;
    }

    return status.MPI_SOURCE;
}
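
// Cancel an outstanding request, waiting for the cancellation to complete;
// throws on any MPI_Cancel error.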
void artdaq::MPITransfer::cancelReq_(MPI_Request req) const {
    if (req == MPI_REQUEST_NULL) return;

    TLOG(8) << GetTraceName() << ": Cancelling post";

    std::unique_lock<std::mutex> lk(mpi_mutex_);
    int result = MPI_Cancel(&req);
    if (result == MPI_SUCCESS) {
        MPI_Status status;
        MPI_Wait(&req, &status);
    } else {
        switch (result) {
            case MPI_ERR_REQUEST:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
            case MPI_ERR_ARG:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned MPI_ERR_ARG.\n";
            default:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned unknown error code.\n";
        }
    }
}
int artdaq::MPITransfer::findAvailable() {
    int use_me;
    int flag = 0, flag2 = 0;
    size_t loops = 0;

    do {
        use_me = pos_;
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
        if (flag) {
            MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
        }
        pos_ = (pos_ + 2) % reqs_.size();
        ++loops;
    } while (flag2 == 0 && loops < buffer_count_);

    if (flag2 == 0) {  // Scanned every buffer without finding a completed pair.
        return TransferInterface::RECV_TIMEOUT;
    }
    TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
    return use_me;
}