00001 #define TRACE_NAME "MPITransfer"
00002 #include "artdaq/TransferPlugins/MPITransfer.hh"
00003 #include <algorithm>
00004
00005 #include "canvas/Utilities/Exception.h"
00006 #include "cetlib_except/exception.h"
00007
00008 #include "artdaq-core/Data/Fragment.hh"
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
// MPI message tags used in this file: one for the Fragment header message and
// one for the message carrying the rest of the Fragment.
#define MPI_TAG_HEADER 0x8E // 142
#define MPI_TAG_DATA 0xDA // 218
// NOTE(review): USE_RECV is not referenced anywhere in the visible part of this file.
#define USE_RECV 1

// Out-of-class definition of the static mutex that the send, receive, cancel and
// slot-search code paths in this file lock around their MPI calls.
std::mutex artdaq::MPITransfer::mpi_mutex_;
00025
00026 artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, TransferInterface::Role role)
00027 : TransferInterface(pset, role)
00028 , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
00029 , payload_(buffer_count_)
00030 , pos_()
00031 {
00032 TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer construction: "
00033 << "source rank " << source_rank() << ", "
00034 << "destination rank " << destination_rank() << ", "
00035 << buffer_count_ << " buffers. " << TLOG_ENDL;
00036
00037 if (buffer_count_ == 0)
00038 {
00039 throw art::Exception(art::errors::Configuration, "MPITransfer: ")
00040 << "No buffers configured.";
00041 }
00042 }
00043
00044 artdaq::MPITransfer::
00045 ~MPITransfer()
00046 {
00047 TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: BEGIN" << TLOG_ENDL;
00048 TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: Collecting requests that need to be waited on" << TLOG_ENDL;
00049 std::vector<MPI_Request> reqs;
00050 for (size_t ii = 0; ii < reqs_.size(); ++ii)
00051 {
00052 if (reqs_[ii] != MPI_REQUEST_NULL)
00053 {
00054 reqs.push_back(reqs_[ii]);
00055 }
00056 }
00057 if (reqs.size() > 0)
00058 {
00059 TLOG_TRACE("MPITransfer") << uniqueLabel() << "MPITransfer::~MPITransfer: Waiting on " << std::to_string(reqs.size()) << " reqs." << TLOG_ENDL;
00060 MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
00061 }
00062
00063
00064
00065
00066 TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: DONE" << TLOG_ENDL;
00067 }
00068
00069 artdaq::TransferInterface::CopyStatus
00070 artdaq::MPITransfer::
00071 copyFragment(Fragment& frag, size_t send_timeout_usec)
00072 {
00073 return moveFragment(std::move(frag), send_timeout_usec);
00074 }
00075
00076 artdaq::TransferInterface::CopyStatus
00077 artdaq::MPITransfer::
00078 moveFragment(Fragment&& frag, size_t send_timeout_usec)
00079 {
00080 if (frag.dataSize() > max_fragment_size_words_)
00081 {
00082 TLOG_WARNING("MPITransfer") << uniqueLabel() << " Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
00083 << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!" << TLOG_ENDL;
00084 }
00085
00086 auto start_time = std::chrono::steady_clock::now();
00087
00088 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: Finding available send slot, send_timeout_usec=" << std::to_string(send_timeout_usec) << TLOG_ENDL;
00089 auto req_idx = findAvailable();
00090 auto counter = 0;
00091 while (req_idx == RECV_TIMEOUT && TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec)
00092 {
00093 usleep(1000);
00094 req_idx = findAvailable();
00095 counter++;
00096 if (counter % 1000 == 0)
00097 {
00098 TLOG_INFO("MPITransfer") << uniqueLabel() << " Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
00099 << "Waited " << std::to_string(TimeUtils::GetElapsedTimeMilliseconds(start_time)) << " ms so far." << TLOG_ENDL;
00100 }
00101
00102 }
00103 if (req_idx == TransferInterface::RECV_TIMEOUT)
00104 {
00105 TLOG_WARNING("MPITransfer") << uniqueLabel() << " MPITransfer::sendFragment: No buffers available! Returning RECV_TIMEOUT!" << TLOG_ENDL;
00106 return CopyStatus::kTimeout;
00107 }
00108
00109 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment send slot is " << req_idx << TLOG_ENDL;
00110 auto buffer_idx = req_idx / 2;
00111 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: Swapping in fragment to send to buffer " << buffer_idx << TLOG_ENDL;
00112 Fragment& curfrag = payload_[buffer_idx];
00113 curfrag = std::move(frag);
00114
00115 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << std::to_string(curfrag.sequenceID()) << " type=" << curfrag.typeString() << " found_idx=" << req_idx << TLOG_ENDL;
00116
00117 std::unique_lock<std::mutex> lk(mpi_mutex_);
00118
00119
00120 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: Using MPI_Isend" << TLOG_ENDL;
00121
00122 MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
00123
00124 auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
00125 auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
00126 MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
00127 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment COMPLETE" << TLOG_ENDL;
00128
00129 TLOG_ARB(11, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment COMPLETE: "
00130 << " buffer_idx=" << buffer_idx
00131 << " send_size=" << curfrag.size()
00132 << " src=" << source_rank()
00133 << " dest=" << destination_rank()
00134 << " sequenceID=" << curfrag.sequenceID()
00135 << " fragID=" << curfrag.fragmentID() << TLOG_ENDL;
00136 return CopyStatus::kSuccess;
00137 }
00138
00139 int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
00140 {
00141 TLOG_ARB(6, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader entered tmo=" << std::to_string(timeout_usec) << " us (ignored)" << TLOG_ENDL;
00142 MPI_Status status;
00143 int wait_result = MPI_SUCCESS;
00144
00145 MPI_Request req;
00146 {
00147 std::unique_lock<std::mutex> lk(mpi_mutex_);
00148 MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
00149 }
00150
00151
00152 int flag;
00153 do {
00154 std::unique_lock<std::mutex> lk(mpi_mutex_);
00155 wait_result = MPI_Test(&req, &flag, &status);
00156 if (!flag) {
00157 usleep(1000);
00158
00159 }
00160 } while (!flag);
00161
00162 if (req != MPI_REQUEST_NULL)
00163 {
00164 TLOG_ERROR("MPITransfer") << uniqueLabel() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader." << TLOG_ENDL;
00165 TLOG(TLVL_ERROR) << uniqueLabel() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader." << TLOG_ENDL;
00166 throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
00167 }
00168
00169 TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader recvd" << TLOG_ENDL;
00170
00171
00172 {TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader: " << my_rank
00173 << " Wait_error=" << wait_result
00174 << " status_error=" << status.MPI_ERROR
00175 << " source=" << status.MPI_SOURCE
00176 << " tag=" << status.MPI_TAG
00177 << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
00178 << " Fragment_size=" << header.word_count
00179 << " fragID=" << header.fragment_id << TLOG_ENDL;
00180 }
00181 char err_buffer[MPI_MAX_ERROR_STRING];
00182 int resultlen;
00183 switch (wait_result)
00184 {
00185 case MPI_SUCCESS:
00186 break;
00187 case MPI_ERR_IN_STATUS:
00188 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
00189 TLOG_ERROR("MPITransfer") << uniqueLabel()
00190 << " MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00191 break;
00192 default:
00193 MPI_Error_string(wait_result, err_buffer, &resultlen);
00194 TLOG_ERROR("MPITransfer") << uniqueLabel()
00195 << " MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00196 }
00197
00198
00199 return status.MPI_SOURCE;
00200 }
00201
00202 int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
00203 {
00204 TLOG_ARB(6, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentData entered wordCount=" << std::to_string(wordCount) << TLOG_ENDL;
00205 int wait_result = MPI_SUCCESS;
00206 MPI_Status status;
00207
00208 MPI_Request req;
00209 {
00210 std::unique_lock<std::mutex> lk(mpi_mutex_);
00211 MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
00212 }
00213
00214
00215 int flag;
00216 do {
00217 std::unique_lock<std::mutex> lk(mpi_mutex_);
00218 wait_result = MPI_Test(&req, &flag, &status);
00219 if (!flag) {
00220 usleep(1000);
00221
00222 }
00223 } while (!flag);
00224 if (req != MPI_REQUEST_NULL)
00225 {
00226 TLOG_ERROR("MPITransfer") << uniqueLabel() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData." << TLOG_ENDL;
00227 throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
00228 }
00229
00230
00231 TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentData recvd" << TLOG_ENDL;
00232
00233
00234 char err_buffer[MPI_MAX_ERROR_STRING];
00235 int resultlen;
00236 switch (wait_result)
00237 {
00238 case MPI_SUCCESS:
00239 break;
00240 case MPI_ERR_IN_STATUS:
00241 MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
00242 TLOG_ERROR("MPITransfer") << uniqueLabel()
00243 << " MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00244 break;
00245 default:
00246 MPI_Error_string(wait_result, err_buffer, &resultlen);
00247 TLOG_ERROR("MPITransfer") << uniqueLabel()
00248 << " MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
00249 }
00250
00251
00252 return status.MPI_SOURCE;
00253 }
00254
00255 void
00256 artdaq::MPITransfer::
00257 cancelReq_(MPI_Request req) const
00258 {
00259 if (req == MPI_REQUEST_NULL) return;
00260
00261 TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " Cancelling post" << TLOG_ENDL;
00262
00263 std::unique_lock<std::mutex> lk(mpi_mutex_);
00264 int result = MPI_Cancel(&req);
00265 if (result == MPI_SUCCESS)
00266 {
00267 MPI_Status status;
00268 MPI_Wait(&req, &status);
00269 }
00270 else
00271 {
00272 switch (result)
00273 {
00274 case MPI_ERR_REQUEST:
00275 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00276 << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
00277 case MPI_ERR_ARG:
00278 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00279 << "MPI_Cancel returned MPI_ERR_ARG.\n";
00280 default:
00281 throw art::Exception(art::errors::LogicError, "MPITransfer: ")
00282 << "MPI_Cancel returned unknown error code.\n";
00283 }
00284 }
00285 }
00286
00287 int artdaq::MPITransfer::findAvailable()
00288 {
00289 int use_me;
00290 int flag, flag2;
00291 size_t loops = 0;
00292
00293 do
00294 {
00295 use_me = pos_;
00296 std::unique_lock<std::mutex> lk(mpi_mutex_);
00297 MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
00298 if (flag) {
00299 MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
00300 }
00301 pos_ = (pos_ + 2) % reqs_.size();
00302 ++loops;
00303 } while (!flag2 && loops < buffer_count_);
00304 if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
00305 TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " findAvailable returning use_me=" << use_me << " loops=" << std::to_string(loops) << TLOG_ENDL;
00306
00307
00308 return use_me;
00309 }
00310
// Invoke DEFINE_ARTDAQ_TRANSFER for artdaq::MPITransfer. The macro is defined outside
// this file; presumably it emits the plugin factory entry point — TODO confirm.
DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)