1 #define TRACE_NAME "MPITransfer"
3 #include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
5 #include "canvas/Utilities/Exception.h"
6 #include "cetlib_except/exception.h"
8 #include "artdaq-core/Data/Fragment.hh"
19 #define MPI_TAG_HEADER 0x8E // 142
20 #define MPI_TAG_DATA 0xDA // 218
// Class-wide mutex: every MPI call in this file (Issend/Irecv/Test/Cancel/Wait)
// is made while holding this lock, serializing MPI access across all
// MPITransfer instances in the process.
23 std::mutex artdaq::MPITransfer::mpi_mutex_;
// MPITransfer constructor.
// reqs_ holds TWO MPI_Request slots per buffer — one for the fragment header
// transfer and one for the data payload (sendFragment uses req_idx and
// req_idx + 1) — hence 2 * buffer_count_. payload_ keeps the in-flight
// Fragment alive for the duration of each nonblocking send.
// Throws art::Exception(Configuration) if the configuration yields zero buffers.
26 : TransferInterface(pset, role), reqs_(2 * buffer_count_, MPI_REQUEST_NULL), payload_(buffer_count_), pos_()
28 TLOG(TLVL_TRACE) << GetTraceName() <<
" construction: "
29 <<
"source rank " << source_rank() <<
", "
30 <<
"destination rank " << destination_rank() <<
", " << buffer_count_ <<
" buffers. ";
// A zero-size buffer pool would make findAvailable()/sendFragment unusable,
// so fail fast at construction time.
32 if (buffer_count_ == 0)
34 throw art::Exception(art::errors::Configuration,
"MPITransfer: ") <<
"No buffers configured.";
// MPITransfer destructor.
// NOTE(review): this extract omits the destructor's interior lines (original
// lines between the two trace messages); presumably it drains or cancels
// outstanding requests — confirm against the full source.
40 TLOG(TLVL_TRACE) << GetTraceName() <<
": ~MPITransfer: BEGIN";
46 TLOG(TLVL_TRACE) << GetTraceName() <<
": ~MPITransfer: DONE";
// flush_buffers: block until every outstanding nonblocking MPI operation
// tracked in reqs_ has completed. Collects the non-NULL requests into a
// temporary vector and waits on them all at once.
51 TLOG(TLVL_TRACE) << GetTraceName() <<
": flush_buffers: Collecting requests that need to be waited on";
52 std::vector<MPI_Request> reqs;
53 for (
size_t ii = 0; ii < reqs_.size(); ++ii)
// Only still-active requests need waiting; completed/unused slots are
// MPI_REQUEST_NULL.
55 if (reqs_[ii] != MPI_REQUEST_NULL)
57 reqs.push_back(reqs_[ii]);
62 TLOG(TLVL_TRACE) << GetTraceName() <<
": flush_buffers: Waiting on " << reqs.size() <<
" reqs.";
// NOTE(review): &reqs[0] is undefined behavior when reqs is empty — prefer
// reqs.data(), or skip the call when reqs.empty().
63 MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
// Thin wrappers over sendFragment():
// - min-blocking (copy) mode: copies the caller's Fragment so the caller keeps
//   ownership, and forwards the caller's timeout.
// - reliable (move) mode: moves the Fragment and passes timeout 0, which
//   sendFragment's wait loop treats as "wait indefinitely for a buffer".
68 Fragment
const& frag,
size_t send_timeout_usec)
70 return sendFragment(Fragment(frag), send_timeout_usec);
75 return sendFragment(std::move(frag), 0);
// sendFragment: queue a Fragment for nonblocking synchronous send to
// destination_rank(). Finds a free request-pair slot (waiting up to
// send_timeout_usec; 0 = wait forever), moves the Fragment into the
// corresponding payload_ buffer so it stays alive while in flight, then posts
// two MPI_Issend operations: header (MPI_TAG_HEADER) and data (MPI_TAG_DATA).
// Returns kTimeout if no slot frees up in time, kSuccess otherwise.
78 artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::sendFragment(Fragment&& frag,
size_t send_timeout_usec)
// Oversize fragments are sent anyway — warn, because several of them can
// exhaust the receiver's total buffer space.
80 if (frag.dataSize() > max_fragment_size_words_)
82 TLOG(TLVL_WARNING) << GetTraceName() <<
" Fragment has size (" << frag.dataSize()
83 <<
") larger than max_fragment_size_words_ (" << max_fragment_size_words_ <<
")."
84 <<
" Total buffer space is: " << max_fragment_size_words_ * buffer_count_
85 <<
" words. Multiple over-size Fragments will exhaust the buffer!";
88 auto start_time = std::chrono::steady_clock::now();
90 TLOG(5) << GetTraceName() <<
": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
91 auto req_idx = findAvailable();
// Poll for a free slot until the timeout elapses; send_timeout_usec == 0
// disables the timeout entirely.
93 while (req_idx == RECV_TIMEOUT &&
94 (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
97 req_idx = findAvailable();
// Periodic progress report while stalled waiting for a buffer.
// (`counter` is maintained on lines omitted from this extract.)
99 if (counter % 1000 == 0)
101 TLOG(TLVL_INFO) << GetTraceName() <<
" Rank " << source_rank() <<
" waiting for available buffer to "
102 << destination_rank() <<
". "
103 <<
"Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) <<
" ms so far.";
106 if (req_idx == TransferInterface::RECV_TIMEOUT)
108 TLOG(TLVL_WARNING) << GetTraceName() <<
": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
109 return CopyStatus::kTimeout;
112 TLOG(5) << GetTraceName() <<
": sendFragment send slot is " << req_idx;
// Request slots come in pairs (header, data); the payload buffer index is the
// pair index.
113 auto buffer_idx = req_idx / 2;
114 TLOG(5) << GetTraceName() <<
": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
115 Fragment& curfrag = payload_[buffer_idx];
116 curfrag = std::move(frag);
118 TLOG(5) << GetTraceName() <<
": sendFragment before send src=" << source_rank() <<
" dest=" << destination_rank()
119 <<
" seqID=" << curfrag.sequenceID() <<
" type=" << curfrag.typeString() <<
" found_idx=" << req_idx;
// Serialize MPI access for both Issend calls.
121 std::unique_lock<std::mutex> lk(mpi_mutex_);
124 TLOG(5) << GetTraceName() <<
": sendFragment: Using MPI_Isend";
// NOTE(review): despite the log text above, this is MPI_Issend (nonblocking
// SYNCHRONOUS send — completes only once the receiver has matched it).
// First post: the fragment header.
126 MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() *
sizeof(RawDataType), MPI_BYTE,
127 destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
// Second post: the payload words that follow the header.
129 auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
130 auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
131 MPI_Issend(offset, sizeWrds *
sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
132 &reqs_[req_idx + 1]);
133 TLOG(5) << GetTraceName() <<
": sendFragment COMPLETE";
135 TLOG(11) << GetTraceName() <<
": sendFragment COMPLETE: "
136 <<
" buffer_idx=" << buffer_idx <<
" send_size=" << curfrag.size() <<
" src=" << source_rank()
137 <<
" dest=" << destination_rank() <<
" sequenceID=" << curfrag.sequenceID()
138 <<
" fragID=" << curfrag.fragmentID();
139 return CopyStatus::kSuccess;
// receiveFragmentHeader: post a nonblocking receive for a RawFragmentHeader
// from source_rank() (tag MPI_TAG_HEADER), then poll it to completion with
// MPI_Test. The timeout argument is accepted but ignored (the entry log says
// so explicitly). Returns the MPI rank the header actually came from
// (status.MPI_SOURCE). Throws art::Exception(LogicError) if the request is
// unexpectedly still active after the wait loop.
144 TLOG(6) << GetTraceName() <<
": receiveFragmentHeader entered tmo=" << timeout_usec <<
" us (ignored)";
146 int wait_result = MPI_SUCCESS;
// Post the header receive under the MPI lock.
150 std::unique_lock<std::mutex> lk(mpi_mutex_);
151 MPI_Irecv(&header, header.num_words() *
sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER,
152 MPI_COMM_WORLD, &req);
// Polling loop body (loop control lines omitted from this extract):
// re-acquire the lock each iteration and test for completion.
159 std::unique_lock<std::mutex> lk(mpi_mutex_);
160 wait_result = MPI_Test(&req, &flag, &status);
// Once MPI_Test reports completion it sets req to MPI_REQUEST_NULL; anything
// else here is an internal logic error.
168 if (req != MPI_REQUEST_NULL)
170 TLOG(TLVL_ERROR) << GetTraceName() <<
" INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
171 throw art::Exception(art::errors::LogicError,
"MPITransfer: ")
172 <<
"INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
175 TLOG(8) << GetTraceName() <<
": receiveFragmentHeader recvd";
178 TLOG(8) << GetTraceName() <<
": receiveFragmentHeader: " << my_rank <<
" Wait_error=" << wait_result
179 <<
" status_error=" << status.MPI_ERROR <<
" source=" << status.MPI_SOURCE <<
" tag=" << status.MPI_TAG
180 <<
" Fragment_sequenceID=" << (uint64_t)header.sequence_id <<
" Fragment_size=" << header.word_count
181 <<
" fragID=" << header.fragment_id;
// Error decoding (switch on wait_result; some case labels omitted from this
// extract): translate MPI error codes to readable strings for the log.
183 char err_buffer[MPI_MAX_ERROR_STRING];
189 case MPI_ERR_IN_STATUS:
190 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
191 TLOG(TLVL_ERROR) << GetTraceName() <<
": Waitany ERROR: " << err_buffer;
194 MPI_Error_string(wait_result, err_buffer, &resultlen);
195 TLOG(TLVL_ERROR) << GetTraceName() <<
": Waitany ERROR: " << err_buffer;
// Report which rank the header came from.
199 return status.MPI_SOURCE;
// receiveFragmentData: post a nonblocking receive for wordCount RawDataType
// words of fragment payload (tag MPI_TAG_DATA) into the caller-supplied
// destination buffer, then poll to completion with MPI_Test. Mirrors
// receiveFragmentHeader; returns the sending rank (status.MPI_SOURCE) and
// throws art::Exception(LogicError) if the request remains active.
204 TLOG(6) << GetTraceName() <<
": receiveFragmentData entered wordCount=" << wordCount;
205 int wait_result = MPI_SUCCESS;
// Post the payload receive under the MPI lock.
210 std::unique_lock<std::mutex> lk(mpi_mutex_);
211 MPI_Irecv(destination, wordCount *
sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
// Polling loop body (loop control lines omitted from this extract).
219 std::unique_lock<std::mutex> lk(mpi_mutex_);
220 wait_result = MPI_Test(&req, &flag, &status);
// A completed MPI_Test nulls the request; anything else is an internal error.
227 if (req != MPI_REQUEST_NULL)
229 TLOG(TLVL_ERROR) << GetTraceName() <<
" INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
230 throw art::Exception(art::errors::LogicError,
"MPITransfer: ")
231 <<
"INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
235 TLOG(8) << GetTraceName() <<
": receiveFragmentData recvd";
// Error decoding (switch on wait_result; some case labels omitted from this
// extract).
237 char err_buffer[MPI_MAX_ERROR_STRING];
243 case MPI_ERR_IN_STATUS:
244 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
245 TLOG(TLVL_ERROR) << GetTraceName() <<
" MPITransfer: Waitany ERROR: " << err_buffer;
248 MPI_Error_string(wait_result, err_buffer, &resultlen);
249 TLOG(TLVL_ERROR) << GetTraceName() <<
" MPITransfer: Waitany ERROR: " << err_buffer;
253 return status.MPI_SOURCE;
// cancelReq_: cancel a pending nonblocking MPI request and wait for the
// cancellation to finish. No-op when the request is already MPI_REQUEST_NULL.
// On MPI_Cancel failure, throws art::Exception(LogicError) with the specific
// error code spelled out (switch on `result`; some case labels omitted from
// this extract).
256 void artdaq::MPITransfer::cancelReq_(MPI_Request req)
const
258 if (req == MPI_REQUEST_NULL)
return;
260 TLOG(8) << GetTraceName() <<
": Cancelling post";
262 std::unique_lock<std::mutex> lk(mpi_mutex_);
263 int result = MPI_Cancel(&req);
// MPI_Cancel only marks the request for cancellation; MPI_Wait completes it
// (whether the cancel succeeded or the operation finished normally).
264 if (result == MPI_SUCCESS)
267 MPI_Wait(&req, &status);
273 case MPI_ERR_REQUEST:
274 throw art::Exception(art::errors::LogicError,
"MPITransfer: ") <<
"MPI_Cancel returned MPI_ERR_REQUEST.\n";
276 throw art::Exception(art::errors::LogicError,
"MPITransfer: ") <<
"MPI_Cancel returned MPI_ERR_ARG.\n";
278 throw art::Exception(art::errors::LogicError,
"MPITransfer: ") <<
"MPI_Cancel returned unknown error code.\n";
// findAvailable: round-robin scan (starting at pos_) of the request-pair
// slots, looking for a pair whose header AND data requests have both
// completed (MPI_Test sets flag/flag2 nonzero). Returns the even request
// index of the free pair, or TransferInterface::RECV_TIMEOUT after one full
// pass over all buffers finds nothing free.
// (`use_me`/`loops` setup, the do-loop header, and the final `return use_me;`
// fall on lines omitted from this extract.)
283 int artdaq::MPITransfer::findAvailable()
286 int flag = 0, flag2 = 0;
// Each probe of a slot pair is done under the MPI lock.
292 std::unique_lock<std::mutex> lk(mpi_mutex_);
293 MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
296 MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
// Advance the round-robin cursor by one pair, wrapping around.
298 pos_ = (pos_ + 2) % reqs_.size();
300 }
while (flag2 == 0 && loops < buffer_count_);
// A full pass with no free pair means every send buffer is still in flight.
301 if (loops == buffer_count_)
303 return TransferInterface::RECV_TIMEOUT;
305 TLOG(5) << GetTraceName() <<
" findAvailable returning use_me=" << use_me <<
" loops=" << loops;
virtual ~MPITransfer()
MPITransfer Destructor.
void flush_buffers() override
Flush any in-flight data. This should be used by the receiver after the receive loop has ended...
int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout) override
Receive a Fragment Header from the transport mechanism.
MPITransfer(fhicl::ParameterSet pset, Role role)
MPITransfer Constructor.
CopyStatus transfer_fragment_reliable_mode(Fragment &&frag) override
Move a Fragment to the destination.
MPITransfer is a TransferInterface implementation plugin that transfers data using MPI...
CopyStatus transfer_fragment_min_blocking_mode(Fragment const &frag, size_t timeout_usec) override
Copy a Fragment to the destination. Forces asynchronous send.
int receiveFragmentData(RawDataType *destination, size_t wordCount) override
Receive the body of a Fragment to the given destination pointer.