00001 #include "artdaq/TransferPlugins/MPITransfer.hh"
00002 #include <algorithm>
00003
00004 #include "canvas/Utilities/Exception.h"
00005 #include "cetlib_except/exception.h"
00006
00007 #include "artdaq-core/Data/Fragment.hh"
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
// MPI message tag used in this file for the fixed-size fragment header message
// (posted in moveFragment, matched in receiveFragmentHeader).
#define MPI_TAG_HEADER 0x8E // 142
// MPI message tag used in this file for the fragment payload message
// (posted in moveFragment, matched in receiveFragmentData).
#define MPI_TAG_DATA 0xDA // 218
// NOTE(review): not referenced anywhere in the visible part of this file -- confirm whether it is still needed.
#define USE_RECV 1

// Definition of the static mutex shared by all MPITransfer instances; the member
// functions in this file lock it around their MPI calls.
std::mutex artdaq::MPITransfer::mpi_mutex_;
00024
00025 artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, TransferInterface::Role role)
00026 : TransferInterface(pset, role)
00027 , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
00028 , payload_(buffer_count_)
00029 , pos_()
00030 {
00031 TLOG_TRACE(uniqueLabel()) << "MPITransfer construction: "
00032 << "source rank " << source_rank() << ", "
00033 << "destination rank " << destination_rank() << ", "
00034 << buffer_count_ << " buffers. " << TLOG_ENDL;
00035
00036 if (buffer_count_ == 0)
00037 {
00038 throw art::Exception(art::errors::Configuration, "MPITransfer: ")
00039 << "No buffers configured.";
00040 }
00041 }
00042
00043 artdaq::MPITransfer::
00044 ~MPITransfer()
00045 {
00046 TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: BEGIN" << TLOG_ENDL;
00047 TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: Collecting requests that need to be waited on" << TLOG_ENDL;
00048 std::vector<MPI_Request> reqs;
00049 for (size_t ii = 0; ii < reqs_.size(); ++ii)
00050 {
00051 if (reqs_[ii] != MPI_REQUEST_NULL)
00052 {
00053 reqs.push_back(reqs_[ii]);
00054 }
00055 }
00056 if (reqs.size() > 0)
00057 {
00058 TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: Waiting on " << std::to_string(reqs.size()) << " reqs." << TLOG_ENDL;
00059 MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
00060 }
00061
00062
00063
00064
00065 TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: DONE" << TLOG_ENDL;
00066 }
00067
00068 artdaq::TransferInterface::CopyStatus
00069 artdaq::MPITransfer::
00070 copyFragment(Fragment& frag, size_t send_timeout_usec)
00071 {
00072 return moveFragment(std::move(frag), send_timeout_usec);
00073 }
00074
00075 artdaq::TransferInterface::CopyStatus
00076 artdaq::MPITransfer::
00077 moveFragment(Fragment&& frag, size_t send_timeout_usec)
00078 {
00079 if (frag.dataSize() > max_fragment_size_words_)
00080 {
00081 TLOG_WARNING(uniqueLabel()) << "Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
00082 << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!" << TLOG_ENDL;
00083 }
00084
00085 auto start_time = std::chrono::steady_clock::now();
00086
00087 TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment: Finding available send slot, send_timeout_usec=" << std::to_string(send_timeout_usec) << TLOG_ENDL;
00088 auto req_idx = findAvailable();
00089 auto counter = 0;
00090 while (req_idx == RECV_TIMEOUT && TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec)
00091 {
00092 usleep(1000);
00093 req_idx = findAvailable();
00094 counter++;
00095 if (counter % 1000 == 0)
00096 {
00097 TLOG_INFO(uniqueLabel()) << "Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
00098 << "Waited " << std::to_string(TimeUtils::GetElapsedTimeMilliseconds(start_time)) << " ms so far." << TLOG_ENDL;
00099 }
00100
00101 }
00102 if (req_idx == TransferInterface::RECV_TIMEOUT)
00103 {
00104 TLOG_WARNING(uniqueLabel()) << "MPITransfer::sendFragment: No buffers available! Returning RECV_TIMEOUT!" << TLOG_ENDL;
00105 return CopyStatus::kTimeout;
00106 }
00107
00108 TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment send slot is " << req_idx << TLOG_ENDL;
00109 auto buffer_idx = req_idx / 2;
00110 TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment: Swapping in fragment to send to buffer " << buffer_idx << TLOG_ENDL;
00111 Fragment& curfrag = payload_[buffer_idx];
00112 curfrag = std::move(frag);
00113
00114 TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << std::to_string(curfrag.sequenceID()) << " type=" << curfrag.typeString() << " found_idx=" << req_idx << TLOG_ENDL;
00115
00116 std::unique_lock<std::mutex> lk(mpi_mutex_);
00117
00118
00119 TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment: Using MPI_Isend" << TLOG_ENDL;
00120
00121 MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
00122
00123 auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
00124 auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
00125 MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
00126 TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment COMPLETE" << TLOG_ENDL;
00127
00128 TLOG_ARB(11, uniqueLabel()) << "MPITransfer::moveFragment COMPLETE: "
00129 << " buffer_idx=" << buffer_idx
00130 << " send_size=" << curfrag.size()
00131 << " src=" << source_rank()
00132 << " dest=" << destination_rank()
00133 << " sequenceID=" << curfrag.sequenceID()
00134 << " fragID=" << curfrag.fragmentID() << TLOG_ENDL;
00135 return CopyStatus::kSuccess;
00136 }
00137
00138 int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
00139 {
00140 TLOG_ARB(6, uniqueLabel()) << "MPITransfer::receiveFragmentHeader entered tmo=" << std::to_string(timeout_usec) << " us (ignored)" << TLOG_ENDL;
00141 MPI_Status status;
00142 int wait_result = MPI_SUCCESS;
00143
00144 MPI_Request req;
00145 {
00146 std::unique_lock<std::mutex> lk(mpi_mutex_);
00147 MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
00148 }
00149
00150
00151 int flag;
00152 do {
00153 std::unique_lock<std::mutex> lk(mpi_mutex_);
00154 wait_result = MPI_Test(&req, &flag, &status);
00155 if (!flag) {
00156 usleep(1000);
00157
00158 }
00159 } while (!flag);
00160
00161 if (req != MPI_REQUEST_NULL)
00162 {
00163 TLOG_ERROR(uniqueLabel()) << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader." << TLOG_ENDL;
00164 throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
00165 }
00166
00167 TLOG_ARB(8, uniqueLabel()) << "MPITransfer::receiveFragmentHeader recvd" << TLOG_ENDL;
00168
00169
00170 {TLOG_ARB(8, uniqueLabel()) << "MPITransfer::receiveFragmentHeader: " << my_rank
00171 << " Wait_error=" << wait_result
00172 << " status_error=" << status.MPI_ERROR
00173 << " source=" << status.MPI_SOURCE
00174 << " tag=" << status.MPI_TAG
00175 << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
00176 << " Fragment_size=" << header.word_count
00177 << " fragID=" << header.fragment_id << TLOG_ENDL;
00178 }
00179 char err_buffer[MPI_MAX_ERROR_STRING];
00180 int resultlen;
00181 switch (wait_result)
00182 {
00183 case MPI_SUCCESS:
00184 break;
00185 case MPI_ERR_IN_STATUS:
00186 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
00187 TLOG_ERROR(uniqueLabel())
00188 << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00189 break;
00190 default:
00191 MPI_Error_string(wait_result, err_buffer, &resultlen);
00192 TLOG_ERROR(uniqueLabel())
00193 << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00194 }
00195
00196
00197 return status.MPI_SOURCE;
00198 }
00199
00200 int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
00201 {
00202 TLOG_ARB(6, uniqueLabel()) << "MPITransfer::receiveFragmentData entered wordCount=" << std::to_string(wordCount) << TLOG_ENDL;
00203 int wait_result = MPI_SUCCESS;
00204 MPI_Status status;
00205
00206 MPI_Request req;
00207 {
00208 std::unique_lock<std::mutex> lk(mpi_mutex_);
00209 MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
00210 }
00211
00212
00213 int flag;
00214 do {
00215 std::unique_lock<std::mutex> lk(mpi_mutex_);
00216 wait_result = MPI_Test(&req, &flag, &status);
00217 if (!flag) {
00218 usleep(1000);
00219
00220 }
00221 } while (!flag);
00222 if (req != MPI_REQUEST_NULL)
00223 {
00224 TLOG_ERROR(uniqueLabel()) << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData." << TLOG_ENDL;
00225 throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
00226 }
00227
00228
00229 TLOG_ARB(8, uniqueLabel()) << "MPITransfer::receiveFragmentData recvd" << TLOG_ENDL;
00230
00231
00232 char err_buffer[MPI_MAX_ERROR_STRING];
00233 int resultlen;
00234 switch (wait_result)
00235 {
00236 case MPI_SUCCESS:
00237 break;
00238 case MPI_ERR_IN_STATUS:
00239 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
00240 TLOG_ERROR(uniqueLabel())
00241 << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00242 break;
00243 default:
00244 MPI_Error_string(wait_result, err_buffer, &resultlen);
00245 TLOG_ERROR(uniqueLabel())
00246 << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00247 }
00248
00249
00250 return status.MPI_SOURCE;
00251 }
00252
00253 void
00254 artdaq::MPITransfer::
00255 cancelReq_(MPI_Request req) const
00256 {
00257 if (req == MPI_REQUEST_NULL) return;
00258
00259 TLOG_ARB(8, uniqueLabel()) << "Cancelling post" << TLOG_ENDL;
00260
00261 std::unique_lock<std::mutex> lk(mpi_mutex_);
00262 int result = MPI_Cancel(&req);
00263 if (result == MPI_SUCCESS)
00264 {
00265 MPI_Status status;
00266 MPI_Wait(&req, &status);
00267 }
00268 else
00269 {
00270 switch (result)
00271 {
00272 case MPI_ERR_REQUEST:
00273 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00274 << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
00275 case MPI_ERR_ARG:
00276 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00277 << "MPI_Cancel returned MPI_ERR_ARG.\n";
00278 default:
00279 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00280 << "MPI_Cancel returned unknown error code.\n";
00281 }
00282 }
00283 }
00284
00285 int artdaq::MPITransfer::findAvailable()
00286 {
00287 int use_me;
00288 int flag, flag2;
00289 size_t loops = 0;
00290
00291 do
00292 {
00293 use_me = pos_;
00294 std::unique_lock<std::mutex> lk(mpi_mutex_);
00295 MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
00296 if (flag) {
00297 MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
00298 }
00299 pos_ = (pos_ + 2) % reqs_.size();
00300 ++loops;
00301 } while (!flag2 && loops < buffer_count_);
00302 if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
00303 TLOG_ARB(5, uniqueLabel()) << "findAvailable returning use_me=" << use_me << " loops=" << std::to_string(loops) << TLOG_ENDL;
00304
00305
00306 return use_me;
00307 }
00308
// NOTE(review): presumably registers artdaq::MPITransfer as a loadable transfer plugin;
// the macro is defined outside this file -- confirm against its definition.
DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)