1 #define TRACE_NAME "MPITransfer"
2 #include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
5 #include "canvas/Utilities/Exception.h"
6 #include "cetlib_except/exception.h"
8 #include "artdaq-core/Data/Fragment.hh"
// MPI tag values distinguishing the two messages that make up one Fragment
// transfer: the fixed-size RawFragmentHeader and the variable-size payload.
20 #define MPI_TAG_HEADER 0x8E // 142
21 #define MPI_TAG_DATA 0xDA // 218
// Class-wide mutex: every MPI call in this file is made while holding it,
// serializing MPI access across all MPITransfer instances in the process.
24 std::mutex artdaq::MPITransfer::mpi_mutex_;
// Constructor: initialize the TransferInterface base from the ParameterSet
// and role, then size the request array to two MPI_Requests per buffer (a
// header-send request and a data-send request form one slot pair) and
// pre-allocate one staging Fragment per buffer.
27 : TransferInterface(pset, role)
28 , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
29 , payload_(buffer_count_)
// Trace the endpoints and buffer count chosen for this instance.
32 TLOG(TLVL_TRACE) << GetTraceName() <<
" construction: "
33 <<
"source rank " << source_rank() <<
", "
34 <<
"destination rank " << destination_rank() <<
", "
35 << buffer_count_ <<
" buffers. ";
// Zero buffers would make every send impossible; fail configuration early.
37 if (buffer_count_ == 0)
39 throw art::Exception(art::errors::Configuration,
"MPITransfer: ")
40 <<
"No buffers configured.";
// Destructor: collect every MPI request that is still in flight and block
// until MPI has completed them all, so the staging buffers in payload_ are
// not destroyed while MPI may still be reading from them.
47 TLOG(TLVL_TRACE) << GetTraceName() <<
": ~MPITransfer: BEGIN";
48 TLOG(TLVL_TRACE) << GetTraceName() <<
": ~MPITransfer: Collecting requests that need to be waited on";
49 std::vector<MPI_Request> reqs;
50 for (
size_t ii = 0; ii < reqs_.size(); ++ii)
52 if (reqs_[ii] != MPI_REQUEST_NULL)
54 reqs.push_back(reqs_[ii]);
59 TLOG(TLVL_TRACE) << GetTraceName() <<
": ~MPITransfer: Waiting on " << reqs.size() <<
" reqs.";
// NOTE(review): &reqs[0] is undefined behavior when reqs is empty — prefer
// reqs.data(), which MPI_Waitall accepts together with a count of 0.
// Confirm whether this destructor can run with no outstanding requests.
60 MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
66 TLOG(TLVL_TRACE) << GetTraceName() <<
": ~MPITransfer: DONE";
// copyFragment: delegate to sendFragment using the configured send timeout.
// NOTE(review): this moves from `frag` even though the interface documents a
// Fragment& "copy" — confirm callers do not rely on the Fragment afterwards.
69 artdaq::TransferInterface::CopyStatus
73 return sendFragment(std::move(frag), send_timeout_usec);
// moveFragment: delegate to sendFragment with a timeout of 0, which the
// sendFragment wait loop treats as "no timeout" (block until a slot frees).
76 artdaq::TransferInterface::CopyStatus
80 return sendFragment(std::move(frag), 0);
// sendFragment: stage `frag` into a free buffer slot and post two
// synchronous-mode nonblocking sends (MPI_Issend): one for the fixed-size
// RawFragmentHeader (MPI_TAG_HEADER) and one for the remaining payload words
// (MPI_TAG_DATA). Returns kTimeout if no slot frees within send_timeout_usec;
// returns kSuccess once both sends are posted (not necessarily completed).
83 artdaq::TransferInterface::CopyStatus
85 sendFragment(Fragment&& frag,
size_t send_timeout_usec)
// Warn (but still attempt the send) when the payload exceeds the configured
// per-buffer size.
87 if (frag.dataSize() > max_fragment_size_words_)
89 TLOG(TLVL_WARNING) << GetTraceName() <<
" Fragment has size (" << frag.dataSize() <<
") larger than max_fragment_size_words_ (" << max_fragment_size_words_ <<
")."
90 <<
" Total buffer space is: " << max_fragment_size_words_ * buffer_count_ <<
" words. Multiple over-size Fragments will exhaust the buffer!";
93 auto start_time = std::chrono::steady_clock::now();
// Poll for a free request pair; send_timeout_usec == 0 means wait forever
// (see the while-condition below).
95 TLOG(5) << GetTraceName() <<
": moveFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
96 auto req_idx = findAvailable();
98 while (req_idx == RECV_TIMEOUT && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
101 req_idx = findAvailable();
// Periodic progress log while stalled. NOTE(review): `counter` is declared
// and incremented on lines not visible in this excerpt — confirm.
103 if (counter % 1000 == 0)
105 TLOG(TLVL_INFO) << GetTraceName() <<
" Rank " << source_rank() <<
" waiting for available buffer to " << destination_rank() <<
". "
106 <<
"Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) <<
" ms so far.";
// No slot came free in time: report a timeout to the caller.
110 if (req_idx == TransferInterface::RECV_TIMEOUT)
112 TLOG(TLVL_WARNING) << GetTraceName() <<
": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
113 return CopyStatus::kTimeout;
116 TLOG(5) << GetTraceName() <<
": moveFragment send slot is " << req_idx;
// Requests are stored in pairs (even index = header send, odd = data send),
// so the staging-buffer index is the pair number.
117 auto buffer_idx = req_idx / 2;
118 TLOG(5) << GetTraceName() <<
": moveFragment: Swapping in fragment to send to buffer " << buffer_idx;
// Move the Fragment into the long-lived staging buffer so its memory stays
// valid until MPI completes both sends.
119 Fragment& curfrag = payload_[buffer_idx];
120 curfrag = std::move(frag);
122 TLOG(5) << GetTraceName() <<
": moveFragment before send src=" << source_rank() <<
" dest=" << destination_rank() <<
" seqID=" << curfrag.sequenceID() <<
" type=" << curfrag.typeString() <<
" found_idx=" << req_idx;
// Serialize the MPI calls through the class-wide mutex (presumably because
// the MPI library is not assumed thread-safe — confirm the MPI_THREAD_*
// level requested at init).
124 std::unique_lock<std::mutex> lk(mpi_mutex_);
127 TLOG(5) << GetTraceName() <<
": moveFragment: Using MPI_Isend";
// NOTE(review): the log line above says MPI_Isend, but MPI_Issend
// (synchronous-mode send) is what is actually posted.
129 MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() *
sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
// Payload = everything after the header, sent from the same contiguous
// Fragment storage as a second message.
131 auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
132 auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
133 MPI_Issend(offset, sizeWrds *
sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
134 TLOG(5) << GetTraceName() <<
": moveFragment COMPLETE";
136 TLOG(11) << GetTraceName() <<
": moveFragment COMPLETE: "
137 <<
" buffer_idx=" << buffer_idx
138 <<
" send_size=" << curfrag.size()
139 <<
" src=" << source_rank()
140 <<
" dest=" << destination_rank()
141 <<
" sequenceID=" << curfrag.sequenceID()
142 <<
" fragID=" << curfrag.fragmentID();
143 return CopyStatus::kSuccess;
// receiveFragmentHeader: post a nonblocking receive for a RawFragmentHeader
// from source_rank() (MPI_TAG_HEADER) and poll it with MPI_Test until it
// completes. Returns the rank the header actually came from
// (status.MPI_SOURCE). The timeout argument is logged but ignored, as the
// log message itself states.
148 TLOG(6) << GetTraceName() <<
": receiveFragmentHeader entered tmo=" << timeout_usec <<
" us (ignored)";
150 int wait_result = MPI_SUCCESS;
// Post the receive while holding the MPI serialization mutex.
154 std::unique_lock<std::mutex> lk(mpi_mutex_);
155 MPI_Irecv(&header, header.num_words() *
sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
// Each completion poll re-acquires the mutex around MPI_Test (the enclosing
// polling loop is on lines not visible in this excerpt).
161 std::unique_lock<std::mutex> lk(mpi_mutex_);
162 wait_result = MPI_Test(&req, &flag, &status);
// A completed request must have been reset to MPI_REQUEST_NULL by MPI_Test;
// anything else indicates a logic error in the polling above.
169 if (req != MPI_REQUEST_NULL)
171 TLOG(TLVL_ERROR) << GetTraceName() <<
" INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
172 throw art::Exception(art::errors::LogicError,
"MPITransfer: ") <<
"INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
175 TLOG(8) << GetTraceName() <<
": receiveFragmentHeader recvd";
178 {TLOG(8) << GetTraceName() <<
": receiveFragmentHeader: " << my_rank
179 <<
" Wait_error=" << wait_result
180 <<
" status_error=" << status.MPI_ERROR
181 <<
" source=" << status.MPI_SOURCE
182 <<
" tag=" << status.MPI_TAG
183 <<
" Fragment_sequenceID=" << (uint64_t)header.sequence_id
184 <<
" Fragment_size=" << header.word_count
185 <<
" fragID=" << header.fragment_id;
// Error decoding: translate MPI error codes into readable strings for the
// log. NOTE(review): the switch header and default label these cases belong
// to are on lines not visible in this excerpt — presumably a switch on
// wait_result.
187 char err_buffer[MPI_MAX_ERROR_STRING];
193 case MPI_ERR_IN_STATUS:
194 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
195 TLOG(TLVL_ERROR) << GetTraceName()
196 <<
": Waitany ERROR: " << err_buffer;
199 MPI_Error_string(wait_result, err_buffer, &resultlen);
200 TLOG(TLVL_ERROR) << GetTraceName()
201 <<
": Waitany ERROR: " << err_buffer;
// Report which rank the header was received from.
205 return status.MPI_SOURCE;
// receiveFragmentData: post a nonblocking receive of wordCount payload words
// into `destination` (MPI_TAG_DATA) and poll with MPI_Test until complete;
// returns status.MPI_SOURCE. Mirrors receiveFragmentHeader's structure and
// error handling.
210 TLOG(6) << GetTraceName() <<
": receiveFragmentData entered wordCount=" << wordCount;
211 int wait_result = MPI_SUCCESS;
// Post the receive while holding the MPI serialization mutex.
216 std::unique_lock<std::mutex> lk(mpi_mutex_);
217 MPI_Irecv(destination, wordCount *
sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
// Each completion poll re-acquires the mutex around MPI_Test (the enclosing
// polling loop is on lines not visible in this excerpt).
223 std::unique_lock<std::mutex> lk(mpi_mutex_);
224 wait_result = MPI_Test(&req, &flag, &status);
// A completed request must have been reset to MPI_REQUEST_NULL by MPI_Test.
230 if (req != MPI_REQUEST_NULL)
232 TLOG(TLVL_ERROR) << GetTraceName() <<
" INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
233 throw art::Exception(art::errors::LogicError,
"MPITransfer: ") <<
"INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
237 TLOG(8) << GetTraceName() <<
": receiveFragmentData recvd";
// Error decoding: translate MPI error codes into readable strings for the
// log. NOTE(review): the switch these cases belong to is on lines not
// visible in this excerpt — presumably a switch on wait_result.
240 char err_buffer[MPI_MAX_ERROR_STRING];
246 case MPI_ERR_IN_STATUS:
247 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
248 TLOG(TLVL_ERROR) << GetTraceName()
249 <<
" MPITransfer: Waitany ERROR: " << err_buffer;
252 MPI_Error_string(wait_result, err_buffer, &resultlen);
253 TLOG(TLVL_ERROR) << GetTraceName()
254 <<
" MPITransfer: Waitany ERROR: " << err_buffer;
// Report which rank the data was received from.
258 return status.MPI_SOURCE;
// cancelReq_: best-effort cancellation of an outstanding MPI request.
// MPI_Cancel only marks the request for cancellation, so MPI_Wait is then
// called to actually complete and free it. Distinct LogicError exceptions
// identify which MPI_Cancel failure occurred.
262 artdaq::MPITransfer::
263 cancelReq_(MPI_Request req)
const
// Nothing to cancel for an already-null request.
265 if (req == MPI_REQUEST_NULL)
return;
267 TLOG(8) << GetTraceName() <<
": Cancelling post";
// MPI calls are serialized through the shared class-wide mutex.
269 std::unique_lock<std::mutex> lk(mpi_mutex_);
270 int result = MPI_Cancel(&req);
271 if (result == MPI_SUCCESS)
274 MPI_Wait(&req, &status);
// NOTE(review): the switch header (presumably on `result`) and the
// MPI_ERR_ARG / default case labels are on lines not visible in this
// excerpt; the three throws below correspond to MPI_ERR_REQUEST,
// MPI_ERR_ARG, and an unknown error code respectively.
280 case MPI_ERR_REQUEST:
281 throw art::Exception(art::errors::LogicError,
"MPITransfer: ")
282 <<
"MPI_Cancel returned MPI_ERR_REQUEST.\n";
284 throw art::Exception(art::errors::LogicError,
"MPITransfer: ")
285 <<
"MPI_Cancel returned MPI_ERR_ARG.\n";
287 throw art::Exception(art::errors::LogicError,
"MPITransfer: ")
288 <<
"MPI_Cancel returned unknown error code.\n";
// findAvailable: round-robin scan for a request pair (header send + data
// send) that has completed, i.e. a staging-buffer slot safe to reuse.
// Advances pos_ by 2 (one pair) per probe and gives up after one full sweep
// of buffer_count_ pairs, returning RECV_TIMEOUT.
293 int artdaq::MPITransfer::findAvailable()
296 int flag = 0, flag2 = 0;
// Probe the header-send request, then the matching data-send request, under
// the MPI serialization mutex.
302 std::unique_lock<std::mutex> lk(mpi_mutex_);
303 MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
305 MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
307 pos_ = (pos_ + 2) % reqs_.size();
309 }
// NOTE(review): only flag2 (the data request) appears in this condition —
// presumably an `if (flag)` guard on an elided line ensures the header
// request completed first; confirm both requests are done before reuse.
while (flag2 == 0 && loops < buffer_count_);
310 if (loops == buffer_count_) {
return TransferInterface::RECV_TIMEOUT; }
311 TLOG(5) << GetTraceName() <<
" findAvailable returning use_me=" << use_me <<
" loops=" << loops;
virtual ~MPITransfer()
MPITransfer Destructor.
int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout) override
Receive a Fragment Header from the transport mechanism. Note: the receiveTimeout argument is currently ignored by this implementation.
CopyStatus moveFragment(Fragment &&frag) override
Move a Fragment to the destination.
MPITransfer(fhicl::ParameterSet pset, Role role)
MPITransfer Constructor.
MPITransfer is a TransferInterface implementation plugin that transfers data using MPI.
int receiveFragmentData(RawDataType *destination, size_t wordCount) override
Receive the body of a Fragment to the given destination pointer.
CopyStatus copyFragment(Fragment &frag, size_t timeout_usec) override
Copy a Fragment to the destination. Forces asynchronous send.