00001 #define TRACE_NAME "MPITransfer"
00002 #include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
00003 #include <algorithm>
00004
00005 #include "canvas/Utilities/Exception.h"
00006 #include "cetlib_except/exception.h"
00007
00008 #include "artdaq-core/Data/Fragment.hh"
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
// MPI message tags used by this file. Each Fragment travels as two messages:
// the header message is sent/received with MPI_TAG_HEADER and the payload
// message with MPI_TAG_DATA (see sendFragment, receiveFragmentHeader and
// receiveFragmentData).
00020 #define MPI_TAG_HEADER 0x8E // 142
00021 #define MPI_TAG_DATA 0xDA // 218
// NOTE(review): USE_RECV is not referenced in the visible part of this file —
// confirm whether anything still depends on it.
00022 #define USE_RECV 1
00023
// Definition of the static mutex member. sendFragment, the receive functions,
// cancelReq_ and findAvailable lock it around their MPI calls.
00024 std::mutex artdaq::MPITransfer::mpi_mutex_;
00025
00026 artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
00027 : TransferInterface(pset, role)
00028 , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
00029 , payload_(buffer_count_)
00030 , pos_()
00031 {
00032 TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
00033 << "source rank " << source_rank() << ", "
00034 << "destination rank " << destination_rank() << ", "
00035 << buffer_count_ << " buffers. ";
00036
00037 if (buffer_count_ == 0)
00038 {
00039 throw art::Exception(art::errors::Configuration, "MPITransfer: ")
00040 << "No buffers configured.";
00041 }
00042 }
00043
00044 artdaq::MPITransfer::
00045 ~MPITransfer()
00046 {
00047 TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
00048 TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: Collecting requests that need to be waited on";
00049 std::vector<MPI_Request> reqs;
00050 for (size_t ii = 0; ii < reqs_.size(); ++ii)
00051 {
00052 if (reqs_[ii] != MPI_REQUEST_NULL)
00053 {
00054 reqs.push_back(reqs_[ii]);
00055 }
00056 }
00057 if (reqs.size() > 0)
00058 {
00059 TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: Waiting on " << reqs.size() << " reqs.";
00060 MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
00061 }
00062
00063
00064
00065
00066 TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
00067 }
00068
00069 artdaq::TransferInterface::CopyStatus
00070 artdaq::MPITransfer::
00071 copyFragment(Fragment& frag, size_t send_timeout_usec)
00072 {
00073 return sendFragment(std::move(frag), send_timeout_usec);
00074 }
00075
00076 artdaq::TransferInterface::CopyStatus
00077 artdaq::MPITransfer::
00078 moveFragment(Fragment&& frag)
00079 {
00080 return sendFragment(std::move(frag), 0);
00081 }
00082
00083 artdaq::TransferInterface::CopyStatus
00084 artdaq::MPITransfer::
00085 sendFragment(Fragment&& frag, size_t send_timeout_usec)
00086 {
00087 if (frag.dataSize() > max_fragment_size_words_)
00088 {
00089 TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
00090 << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!";
00091 }
00092
00093 auto start_time = std::chrono::steady_clock::now();
00094
00095 TLOG(5) << GetTraceName() << ": moveFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
00096 auto req_idx = findAvailable();
00097 auto counter = 0;
00098 while (req_idx == RECV_TIMEOUT && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
00099 {
00100 usleep(1000);
00101 req_idx = findAvailable();
00102 counter++;
00103 if (counter % 1000 == 0)
00104 {
00105 TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
00106 << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
00107 }
00108
00109 }
00110 if (req_idx == TransferInterface::RECV_TIMEOUT)
00111 {
00112 TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
00113 return CopyStatus::kTimeout;
00114 }
00115
00116 TLOG(5) << GetTraceName() << ": moveFragment send slot is " << req_idx;
00117 auto buffer_idx = req_idx / 2;
00118 TLOG(5) << GetTraceName() << ": moveFragment: Swapping in fragment to send to buffer " << buffer_idx;
00119 Fragment& curfrag = payload_[buffer_idx];
00120 curfrag = std::move(frag);
00121
00122 TLOG(5) << GetTraceName() << ": moveFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;
00123
00124 std::unique_lock<std::mutex> lk(mpi_mutex_);
00125
00126
00127 TLOG(5) << GetTraceName() << ": moveFragment: Using MPI_Isend";
00128
00129 MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
00130
00131 auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
00132 auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
00133 MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
00134 TLOG(5) << GetTraceName() << ": moveFragment COMPLETE";
00135
00136 TLOG(11) << GetTraceName() << ": moveFragment COMPLETE: "
00137 << " buffer_idx=" << buffer_idx
00138 << " send_size=" << curfrag.size()
00139 << " src=" << source_rank()
00140 << " dest=" << destination_rank()
00141 << " sequenceID=" << curfrag.sequenceID()
00142 << " fragID=" << curfrag.fragmentID();
00143 return CopyStatus::kSuccess;
00144 }
00145
00146 int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
00147 {
00148 TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
00149 MPI_Status status;
00150 int wait_result = MPI_SUCCESS;
00151
00152 MPI_Request req;
00153 {
00154 std::unique_lock<std::mutex> lk(mpi_mutex_);
00155 MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
00156 }
00157
00158
00159 int flag;
00160 do {
00161 std::unique_lock<std::mutex> lk(mpi_mutex_);
00162 wait_result = MPI_Test(&req, &flag, &status);
00163 if (!flag) {
00164 usleep(1000);
00165
00166 }
00167 } while (!flag);
00168
00169 if (req != MPI_REQUEST_NULL)
00170 {
00171 TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
00172 throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
00173 }
00174
00175 TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";
00176
00177
00178 {TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank
00179 << " Wait_error=" << wait_result
00180 << " status_error=" << status.MPI_ERROR
00181 << " source=" << status.MPI_SOURCE
00182 << " tag=" << status.MPI_TAG
00183 << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
00184 << " Fragment_size=" << header.word_count
00185 << " fragID=" << header.fragment_id;
00186 }
00187 char err_buffer[MPI_MAX_ERROR_STRING];
00188 int resultlen;
00189 switch (wait_result)
00190 {
00191 case MPI_SUCCESS:
00192 break;
00193 case MPI_ERR_IN_STATUS:
00194 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
00195 TLOG(TLVL_ERROR) << GetTraceName()
00196 << ": Waitany ERROR: " << err_buffer;
00197 break;
00198 default:
00199 MPI_Error_string(wait_result, err_buffer, &resultlen);
00200 TLOG(TLVL_ERROR) << GetTraceName()
00201 << ": Waitany ERROR: " << err_buffer;
00202 }
00203
00204
00205 return status.MPI_SOURCE;
00206 }
00207
00208 int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
00209 {
00210 TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
00211 int wait_result = MPI_SUCCESS;
00212 MPI_Status status;
00213
00214 MPI_Request req;
00215 {
00216 std::unique_lock<std::mutex> lk(mpi_mutex_);
00217 MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
00218 }
00219
00220
00221 int flag;
00222 do {
00223 std::unique_lock<std::mutex> lk(mpi_mutex_);
00224 wait_result = MPI_Test(&req, &flag, &status);
00225 if (!flag) {
00226 usleep(1000);
00227
00228 }
00229 } while (!flag);
00230 if (req != MPI_REQUEST_NULL)
00231 {
00232 TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
00233 throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
00234 }
00235
00236
00237 TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";
00238
00239
00240 char err_buffer[MPI_MAX_ERROR_STRING];
00241 int resultlen;
00242 switch (wait_result)
00243 {
00244 case MPI_SUCCESS:
00245 break;
00246 case MPI_ERR_IN_STATUS:
00247 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
00248 TLOG(TLVL_ERROR) << GetTraceName()
00249 << " MPITransfer: Waitany ERROR: " << err_buffer;
00250 break;
00251 default:
00252 MPI_Error_string(wait_result, err_buffer, &resultlen);
00253 TLOG(TLVL_ERROR) << GetTraceName()
00254 << " MPITransfer: Waitany ERROR: " << err_buffer;
00255 }
00256
00257
00258 return status.MPI_SOURCE;
00259 }
00260
00261 void
00262 artdaq::MPITransfer::
00263 cancelReq_(MPI_Request req) const
00264 {
00265 if (req == MPI_REQUEST_NULL) return;
00266
00267 TLOG(8) << GetTraceName() << ": Cancelling post";
00268
00269 std::unique_lock<std::mutex> lk(mpi_mutex_);
00270 int result = MPI_Cancel(&req);
00271 if (result == MPI_SUCCESS)
00272 {
00273 MPI_Status status;
00274 MPI_Wait(&req, &status);
00275 }
00276 else
00277 {
00278 switch (result)
00279 {
00280 case MPI_ERR_REQUEST:
00281 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00282 << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
00283 case MPI_ERR_ARG:
00284 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00285 << "MPI_Cancel returned MPI_ERR_ARG.\n";
00286 default:
00287 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00288 << "MPI_Cancel returned unknown error code.\n";
00289 }
00290 }
00291 }
00292
00293 int artdaq::MPITransfer::findAvailable()
00294 {
00295 int use_me;
00296 int flag, flag2;
00297 size_t loops = 0;
00298
00299 do
00300 {
00301 use_me = pos_;
00302 std::unique_lock<std::mutex> lk(mpi_mutex_);
00303 MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
00304 if (flag) {
00305 MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
00306 }
00307 pos_ = (pos_ + 2) % reqs_.size();
00308 ++loops;
00309 } while (!flag2 && loops < buffer_count_);
00310 if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
00311 TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
00312
00313
00314 return use_me;
00315 }
00316
// Macro invocation naming artdaq::MPITransfer.
// NOTE(review): DEFINE_ARTDAQ_TRANSFER is defined outside this file; presumably
// it emits the factory/registration entry point that lets this class be loaded
// as a transfer plugin — confirm against the macro definition.
00317 DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)