#define TRACE_NAME "MPITransfer"
#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"
#define MPI_TAG_HEADER 0x8E  // 142
#define MPI_TAG_DATA 0xDA    // 218
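// A Fragment travels as two separate MPI messages, distinguished by these
// tags: the fixed-size header first, then the payload.

// All MPI calls in this class are serialized through a single class-wide
// mutex, since the MPI library may not have been initialized for concurrent
// use from multiple threads.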
std::mutex artdaq::MPITransfer::mpi_mutex_;
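// Each transfer buffer owns a pair of adjacent entries in reqs_: the even
// index tracks the header send and the following odd index the payload send,
// which is why reqs_ is sized 2 * buffer_count_.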
artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role)
    , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
    , payload_(buffer_count_)
{
    TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
                     << "source rank " << source_rank() << ", "
                     << "destination rank " << destination_rank() << ", "
                     << buffer_count_ << " buffers. ";

    if (buffer_count_ == 0)
    {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ")
            << "No buffers configured.";
    }
}
artdaq::MPITransfer::~MPITransfer()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
    flush_buffers();
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
}
void artdaq::MPITransfer::flush_buffers()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Collecting requests that need to be waited on";
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii)
    {
        if (reqs_[ii] != MPI_REQUEST_NULL)
        {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0)
    {
        TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Waiting on " << reqs.size() << " reqs.";
        MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
    }
}
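// The two public send paths differ in ownership and patience: min_blocking
// mode copies the Fragment and gives up after send_timeout_usec, while
// reliable mode moves it and passes a timeout of 0, which sendFragment()
// treats as "wait indefinitely for a free buffer".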
artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::transfer_fragment_min_blocking_mode(Fragment const& frag, size_t send_timeout_usec)
{
    return sendFragment(Fragment(frag), send_timeout_usec);
}
artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::transfer_fragment_reliable_mode(Fragment&& frag)
{
    return sendFragment(std::move(frag), 0);
}
artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::sendFragment(Fragment&& frag, size_t send_timeout_usec)
{
    if (frag.dataSize() > max_fragment_size_words_)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize()
                           << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
                           << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_
                           << " words. Multiple over-size Fragments will exhaust the buffer!";
    }
    auto start_time = std::chrono::steady_clock::now();

    TLOG(5) << GetTraceName() << ": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
    auto req_idx = findAvailable();
    size_t counter = 0;
    while (req_idx == RECV_TIMEOUT && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
    {
        usleep(1000);
        req_idx = findAvailable();
        ++counter;
        if (counter % 1000 == 0)
        {
            TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank()
                            << " waiting for available buffer to " << destination_rank() << ". "
                            << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
        return CopyStatus::kTimeout;
    }
    TLOG(5) << GetTraceName() << ": sendFragment send slot is " << req_idx;
    auto buffer_idx = req_idx / 2;
    TLOG(5) << GetTraceName() << ": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG(5) << GetTraceName() << ": sendFragment before send src=" << source_rank()
            << " dest=" << destination_rank() << " seqID=" << curfrag.sequenceID()
            << " type=" << curfrag.typeString() << " found_idx=" << req_idx;
    std::unique_lock<std::mutex> lk(mpi_mutex_);

    TLOG(5) << GetTraceName() << ": sendFragment: Using MPI_Issend";
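    // MPI_Issend is a synchronous-mode send: its request completes only after
    // the receiver has matched the message, so a completed request pair in
    // reqs_ means the corresponding payload_ slot is safe to reuse.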
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType),
               MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);

    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(),
               MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
    TLOG(5) << GetTraceName() << ": sendFragment COMPLETE";
    TLOG(11) << GetTraceName() << ": sendFragment COMPLETE: "
             << " buffer_idx=" << buffer_idx
             << " send_size=" << curfrag.size()
             << " src=" << source_rank()
             << " dest=" << destination_rank()
             << " sequenceID=" << curfrag.sequenceID()
             << " fragID=" << curfrag.fragmentID();

    return CopyStatus::kSuccess;
}
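// Receive side: the header arrives first on MPI_TAG_HEADER; the caller is
// then expected to call receiveFragmentData() for the payload, which is
// matched by MPI_TAG_DATA. The timeout argument is currently ignored.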
int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";

    int wait_result = MPI_SUCCESS;
    MPI_Status status;
    MPI_Request req = MPI_REQUEST_NULL;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE,
                  source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
    }
    int flag = 0;
    do
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
    } while (flag == 0);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ")
            << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";

    TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank
            << " Wait_error=" << wait_result
            << " status_error=" << status.MPI_ERROR
            << " source=" << status.MPI_SOURCE
            << " tag=" << status.MPI_TAG
            << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
            << " Fragment_size=" << header.word_count
            << " fragID=" << header.fragment_id;
    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen = 0;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << ": Waitany ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << ": Waitany ERROR: " << err_buffer;
            break;
    }

    return status.MPI_SOURCE;
}
int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;

    int wait_result = MPI_SUCCESS;
    MPI_Status status;
    MPI_Request req = MPI_REQUEST_NULL;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE,
                  source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
    }
    int flag = 0;
    do
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
    } while (flag == 0);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ")
            << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }
    TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen = 0;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << " MPITransfer: Waitany ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << " MPITransfer: Waitany ERROR: " << err_buffer;
            break;
    }

    return status.MPI_SOURCE;
}
void artdaq::MPITransfer::cancelReq_(MPI_Request req) const
{
    if (req == MPI_REQUEST_NULL) return;

    TLOG(8) << GetTraceName() << ": Cancelling post";

    std::unique_lock<std::mutex> lk(mpi_mutex_);
    int result = MPI_Cancel(&req);
    if (result == MPI_SUCCESS)
    {
        MPI_Status status;
        MPI_Wait(&req, &status);
    }
    else
    {
        switch (result)
        {
            case MPI_ERR_REQUEST:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                    << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
            case MPI_ERR_ARG:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                    << "MPI_Cancel returned MPI_ERR_ARG.\n";
            default:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                    << "MPI_Cancel returned unknown error code.\n";
        }
    }
}
int artdaq::MPITransfer::findAvailable()
{
    int use_me = 0;
    int flag = 0, flag2 = 0;
    size_t loops = 0;

    do
    {
        use_me = pos_;
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
        if (flag)
        {
            MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
        }
        pos_ = (pos_ + 2) % reqs_.size();
        ++loops;
    } while (flag2 == 0 && loops < buffer_count_);

    if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
    TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
    return use_me;
}
/* Class interface summary (from the MPITransfer header documentation):
 *
 * MPITransfer is a TransferInterface implementation plugin that transfers data using MPI...
 *
 *   MPITransfer(fhicl::ParameterSet pset, Role role)
 *       MPITransfer Constructor.
 *   virtual ~MPITransfer()
 *       MPITransfer Destructor.
 *   void flush_buffers() override
 *       Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
 *   CopyStatus transfer_fragment_min_blocking_mode(Fragment const& frag, size_t timeout_usec) override
 *       Copy a Fragment to the destination. Forces asynchronous send.
 *   CopyStatus transfer_fragment_reliable_mode(Fragment&& frag) override
 *       Move a Fragment to the destination.
 *   int receiveFragmentHeader(detail::RawFragmentHeader& header, size_t receiveTimeout) override
 *       Receive a Fragment Header from the transport mechanism.
 *   int receiveFragmentData(RawDataType* destination, size_t wordCount) override
 *       Receive the body of a Fragment to the given destination pointer.
 */
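/* Usage sketch (illustrative only; not code from the artdaq sources). A
 * receiver pairs receiveFragmentHeader() with receiveFragmentData(),
 * mirroring the two MPI_Issend calls on the send side. The Fragment
 * resize/memcpy steps below are an assumption about how a caller might
 * assemble a Fragment from the two messages.
 */
#if 0  // example only; not compiled with the plugin
#include <cstring>

void receive_one_fragment(artdaq::MPITransfer& transfer)
{
    artdaq::detail::RawFragmentHeader header;
    int source = transfer.receiveFragmentHeader(header, 100000);  // timeout currently ignored
    if (source == artdaq::TransferInterface::RECV_TIMEOUT) return;  // no header received

    // Size the Fragment payload from the received header, copy the header in,
    // then receive the payload words directly behind it.
    const auto hdr_words = artdaq::detail::RawFragmentHeader::num_words();
    artdaq::Fragment frag;
    frag.resize(header.word_count - hdr_words);
    memcpy(frag.headerAddress(), &header, hdr_words * sizeof(artdaq::RawDataType));
    transfer.receiveFragmentData(frag.headerAddress() + hdr_words, header.word_count - hdr_words);
}
#endif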