#include "artdaq/TransferPlugins/MPITransfer.hh"
#include <algorithm>

#include "canvas/Utilities/Exception.h"
#include "cetlib/exception.h"

#include "artdaq-core/Data/Fragment.hh"

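// MPITransfer moves Fragments between MPI ranks using non-blocking
// point-to-point MPI operations. A static mutex (mpi_mutex_) serializes
// MPI calls made concurrently by transfer instances in this process.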
std::mutex artdaq::MPITransfer::mpi_mutex_;

artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, TransferInterface::Role role)
	: TransferInterface(pset, role)
	, src_status_(status_t::SENDING)
	, recvd_count_(0)
	, expected_count_(-1)
	, payload_(buffer_count_)
	, synchronous_sends_(pset.get<bool>("synchronous_sends", true))
	, reqs_(buffer_count_, MPI_REQUEST_NULL)
	, pos_()
{
	{
		std::ostringstream debugstream;
		debugstream << "MPITransfer construction: "
			<< "source rank " << source_rank() << ", "
			<< "destination rank " << destination_rank() << ", "
			<< buffer_count_ << " buffers. ";
		TRACE(TLVL_TRACE, debugstream.str().c_str());
	}

	if (buffer_count_ == 0)
	{
		throw art::Exception(art::errors::Configuration, "MPITransfer: ")
			<< "No buffers configured.\n";
	}

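	// Pre-size each payload buffer to the maximum fragment size so a
	// posted receive always has a full-size landing zone; on the receive
	// side, a non-blocking receive is posted into every buffer up front.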
	for (size_t i = 0; i < buffer_count_; ++i)
	{
		payload_[i].resize(max_fragment_size_words_);
		if (role == TransferInterface::Role::kReceive) post_(i);
	}
}

artdaq::MPITransfer::
~MPITransfer()
{
	TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: BEGIN");
	if (role() == TransferInterface::Role::kReceive)
	{
		TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: Cancelling all reqs");
		for (size_t i = 0; i < buffer_count_; ++i)
		{
			cancelReq_(i, false);
		}
	}
	TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: Collecting requests that need to be waited on");
	std::vector<MPI_Request> reqs;
	for (size_t ii = 0; ii < reqs_.size(); ++ii)
	{
		if (reqs_[ii] != MPI_REQUEST_NULL)
		{
			reqs.push_back(reqs_[ii]);
		}
	}
	if (reqs.size() > 0)
	{
		TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: Waiting on %zu reqs.", reqs.size());
		MPI_Waitall(static_cast<int>(reqs.size()), &reqs[0], MPI_STATUSES_IGNORE);
	}
	TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: DONE");
}

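// Wait (optionally with a timeout) for any one of the posted receives to
// complete, hand the received Fragment to the caller, and re-post the
// freed buffer. Returns the MPI rank of the sender, or RECV_TIMEOUT.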
int
artdaq::MPITransfer::
receiveFragment(Fragment& output, size_t timeout_usec)
{
	TRACE(6, "MPITransfer::receiveFragment entered tmo=%lu us", timeout_usec);

	int wait_result;
	int which;
	MPI_Status status;

	if (timeout_usec > 0)
	{
#if USE_TESTSOME
		if (ready_indices_.size() == 0) {
			ready_indices_.resize(buffer_count_, -1);
			ready_statuses_.resize(buffer_count_);

			int readyCount = 0;
			wait_result = MPI_Testsome(buffer_count_, &(reqs_[0]), &readyCount, &(ready_indices_[0]), &(ready_statuses_[0]));
			if (readyCount > 0) {
				saved_wait_result_ = wait_result;
				ready_indices_.resize(readyCount);
				ready_statuses_.resize(readyCount);
			}
			else {
				size_t sleep_loops = 10;
				size_t sleep_time = timeout_usec / sleep_loops;
				if (sleep_time > 250) {
					sleep_time = 250;
					sleep_loops = timeout_usec / sleep_time;
				}
				for (size_t idx = 0; idx < sleep_loops; ++idx) {
					usleep(sleep_time);
					wait_result = MPI_Testsome(buffer_count_, &reqs_[0], &readyCount,
					                           &ready_indices_[0], &ready_statuses_[0]);
					if (readyCount > 0) { break; }
				}
				if (readyCount > 0) {
					saved_wait_result_ = wait_result;
					ready_indices_.resize(readyCount);
					ready_statuses_.resize(readyCount);
				}
				else {
					ready_indices_.clear();
					ready_statuses_.clear();
				}
			}
		}
		if (ready_indices_.size() > 0) {
			wait_result = saved_wait_result_;
			which = ready_indices_.front();
			status = ready_statuses_.front();
			ready_indices_.erase(ready_indices_.begin());
			ready_statuses_.erase(ready_statuses_.begin());
		}
		else {
			return RECV_TIMEOUT;
		}
#else
		int flag = 0;
		{
			std::unique_lock<std::mutex> lk(mpi_mutex_);
			wait_result = MPI_Testany(buffer_count_, &reqs_[0], &which, &flag, &status);
		}
		if (!flag)
		{
			size_t sleep_loops = 10;
			size_t sleep_time = timeout_usec / sleep_loops;
			if (sleep_time > 250)
			{
				sleep_time = 250;
				sleep_loops = timeout_usec / sleep_time;
			}
			for (size_t idx = 0; idx < sleep_loops; ++idx)
			{
				usleep(sleep_time);
				{
					std::unique_lock<std::mutex> lk(mpi_mutex_);
					wait_result = MPI_Testany(buffer_count_, &reqs_[0], &which, &flag, &status);
				}
				if (flag || which >= 0) { break; }
			}
			if (!flag)
			{
				return RECV_TIMEOUT;
			}
		}
#endif
	}
	else
	{
		{
			std::unique_lock<std::mutex> lk(mpi_mutex_);
			wait_result = MPI_Waitany(buffer_count_, &reqs_[0], &which, &status);
		}
	}

	TRACE(8, "recvFragment recvd");

	if (which == MPI_UNDEFINED)
	{
		throw art::Exception(art::errors::LogicError, "MPITransfer: ")
			<< "MPI_UNDEFINED returned as an index value from Waitany.\n";
	}
	if (reqs_[which] != MPI_REQUEST_NULL)
	{
		throw art::Exception(art::errors::LogicError, "MPITransfer: ")
			<< "INTERNAL ERROR: req is not MPI_REQUEST_NULL in recvFragment.\n";
	}
	Fragment::sequence_id_t sequence_id = payload_[which].sequenceID();

	{
		std::ostringstream debugstream;
		debugstream << "recv: " << my_rank
			<< " idx=" << which
			<< " Waitany_error=" << wait_result
			<< " status_error=" << status.MPI_ERROR
			<< " source=" << status.MPI_SOURCE
			<< " tag=" << status.MPI_TAG
			<< " Fragment_sequenceID=" << sequence_id
			<< " Fragment_size=" << payload_[which].size()
			<< " preAutoResize_Fragment_dataSize=" << payload_[which].dataSize()
			<< " fragID=" << payload_[which].fragmentID()
			<< '\n';
		TRACE(4, debugstream.str().c_str());
	}
	char err_buffer[MPI_MAX_ERROR_STRING];
	int resultlen;
	switch (wait_result)
	{
	case MPI_SUCCESS:
		break;
	case MPI_ERR_IN_STATUS:
		MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
		TLOG_ERROR(uniqueLabel())
			<< "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
		break;
	default:
		MPI_Error_string(wait_result, err_buffer, &resultlen);
		TLOG_ERROR(uniqueLabel())
			<< "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
	}

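	// Shrink the Fragment to its actual received size, swap it out to the
	// caller, and give this slot a fresh full-size buffer for the next post.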
	TRACE(7, "recvFragment before autoResize/swap");
	payload_[which].autoResize();
	output.swap(payload_[which]);
	TRACE(7, "recvFragment after autoResize/swap seqID=%lu. "
	      "Reset our buffer. max=%zu adr=%p"
	      , output.sequenceID(), max_fragment_size_words_, (void*)output.headerAddress());
	Fragment tmp(max_fragment_size_words_);
	TRACE(7, "recvFragment before payload_[which].swap(tmp) adr=%p", (void*)tmp.headerAddress());
	payload_[which].swap(tmp);
	TRACE(7, "recvFragment after payload_[which].swap(tmp)");

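	// An EndOfData fragment carries the total number of fragments the
	// source will send; once that many data fragments have been counted,
	// the source is marked done.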
	if (output.type() == Fragment::EndOfDataFragmentType)
	{
		src_status_ = status_t::PENDING;
		expected_count_ = *output.dataBegin();

		{
			std::ostringstream debugstream;
			debugstream << "Received EOD from source " << status.MPI_SOURCE
				<< " expecting total of "
				<< *output.dataBegin() << " fragments" << '\n';
			TRACE(4, debugstream.str().c_str());
		}
	}
	else
	{
		recvd_count_++;
	}
	switch (src_status_)
	{
	case status_t::PENDING:
		{
			std::ostringstream debugstream;
			debugstream << "Checking received count "
				<< recvd_count_
				<< " against expected total "
				<< expected_count_
				<< '\n';
			TRACE(4, debugstream.str().c_str());
		}
		if (recvd_count_ == expected_count_)
		{
			src_status_ = status_t::DONE;
		}
		break;
	case status_t::DONE:
		throw art::Exception(art::errors::LogicError, "MPITransfer: ")
			<< "Received extra fragments from source "
			<< status.MPI_SOURCE
			<< ".\n";
	case status_t::SENDING:
		break;
	default:
		throw art::Exception(art::errors::LogicError, "MPITransfer: ")
			<< "INTERNAL ERROR: Unrecognized status_t value "
			<< static_cast<int>(src_status_)
			<< ".\n";
	}

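	// Re-post this buffer for the next receive unless the source has
	// finished sending and no further source is expected.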
	if (src_status_ == status_t::DONE)
	{
		if (nextSource_() != MPI_ANY_SOURCE)
		{
			post_(which);
		}
	}
	else
	{
		post_(which);
	}

	return status.MPI_SOURCE;
}

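// Keep receiving from the configured source until it reports done;
// after that, any source would be acceptable.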
int
artdaq::MPITransfer::
nextSource_()
{
	if (src_status_ != status_t::DONE)
	{
		return source_rank();
	}
	return MPI_ANY_SOURCE;
}

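// Cancel the posted receive on buffer 'buf'. With blocking_wait the
// cancellation is completed via MPI_Wait; otherwise poll with MPI_Test
// for up to about one second before giving up.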
void
artdaq::MPITransfer::
cancelReq_(size_t buf, bool blocking_wait)
{
	if (reqs_[buf] == MPI_REQUEST_NULL) return;

	{
		std::ostringstream debugstream;
		debugstream << "Cancelling post for buffer "
			<< buf
			<< '\n';
		TRACE(4, debugstream.str().c_str());
	}

	std::unique_lock<std::mutex> lk(mpi_mutex_);
	int result = MPI_Cancel(&reqs_[buf]);
	if (result == MPI_SUCCESS)
	{
		MPI_Status status;
		if (blocking_wait)
		{
			MPI_Wait(&reqs_[buf], &status);
		}
		else
		{
			int doneFlag;
			MPI_Test(&reqs_[buf], &doneFlag, &status);
			if (!doneFlag)
			{
				size_t sleep_loops = 10;
				size_t sleep_time = 100000;
				for (size_t idx = 0; idx < sleep_loops; ++idx)
				{
					usleep(sleep_time);
					MPI_Test(&reqs_[buf], &doneFlag, &status);
					if (doneFlag) { break; }
				}
				if (!doneFlag)
				{
					TLOG_ERROR(uniqueLabel())
						<< "MPITransfer::cancelReq_: Timeout waiting to cancel the request for MPI buffer "
						<< buf << TLOG_ENDL;
				}
			}
		}
	}
	else
	{
		switch (result)
		{
		case MPI_ERR_REQUEST:
			throw art::Exception(art::errors::LogicError, "MPITransfer: ")
				<< "MPI_Cancel returned MPI_ERR_REQUEST.\n";
		case MPI_ERR_ARG:
			throw art::Exception(art::errors::LogicError, "MPITransfer: ")
				<< "MPI_Cancel returned MPI_ERR_ARG.\n";
		default:
			throw art::Exception(art::errors::LogicError, "MPITransfer: ")
				<< "MPI_Cancel returned unknown error code.\n";
		}
	}
}

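// Post a non-blocking receive for buffer 'buf' from the configured source.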
void
artdaq::MPITransfer::
post_(size_t buf)
{
	{
		std::ostringstream debugstream;
		debugstream << "Posting buffer " << buf
			<< " size=" << payload_[buf].size()
			<< " header address=0x" << std::hex << payload_[buf].headerAddress() << std::dec
			<< '\n';
		TRACE(4, debugstream.str().c_str());
	}

	std::unique_lock<std::mutex> lk(mpi_mutex_);
	MPI_Irecv(&*payload_[buf].headerBegin(),
	          (payload_[buf].size() * sizeof(Fragment::value_type)),
	          MPI_BYTE,
	          source_rank(),
	          MPI_ANY_TAG,
	          MPI_COMM_WORLD,
	          &reqs_[buf]);
}

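// Round-robin scan for a buffer whose previous request has completed.
// Returns RECV_TIMEOUT if no buffer frees up within one pass over all
// of the buffers.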
int artdaq::MPITransfer::findAvailable()
{
	int use_me;
	int flag;
	size_t loops = 0;
	TRACE(5, "findAvailable initial pos_=%d", pos_);
	do
	{
		use_me = pos_;
		std::unique_lock<std::mutex> lk(mpi_mutex_);
		MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
		pos_ = (pos_ + 1) % buffer_count_;
		++loops;
	} while (!flag && loops < buffer_count_);
	if (!flag) { return TransferInterface::RECV_TIMEOUT; }
	TRACE(5, "findAvailable returning use_me=%d loops=%zu", use_me, loops);
	return use_me;
}

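// Both public send entry points delegate to sendFragment(); copyFragment
// forces an asynchronous send (force_async = true).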
artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
moveFragment(Fragment&& frag, size_t send_timeout_usec)
{
	return sendFragment(std::move(frag), send_timeout_usec, false);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
copyFragment(Fragment& frag, size_t send_timeout_usec)
{
	return sendFragment(std::move(frag), send_timeout_usec, true);
}

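// Find a free buffer, move the Fragment into it, and send it with either
// a synchronous (MPI_Ssend) or an asynchronous (MPI_Isend) send.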
artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
sendFragment(Fragment&& frag, size_t send_timeout_usec, bool force_async)
{
	TRACE(5, "MPITransfer::sendFragment timeout unused: %zu", send_timeout_usec);
	if (frag.dataSize() > max_fragment_size_words_)
	{
		throw cet::exception("Unimplemented")
			<< "Currently unable to deal with overlarge fragment payload ("
			<< frag.dataSize()
			<< " words > "
			<< max_fragment_size_words_
			<< ").";
	}

	TRACE(5, "MPITransfer::sendFragment: Checking whether to force async mode...");
	if (frag.type() == Fragment::EndOfDataFragmentType)
	{
		TRACE(5, "MPITransfer::sendFragment: EndOfDataFragment detected. Forcing async mode");
		force_async = true;
	}
	TRACE(5, "MPITransfer::sendFragment: Finding available buffer");
	int buffer_idx = findAvailable();
	if (buffer_idx == TransferInterface::RECV_TIMEOUT)
	{
		TRACE(TLVL_WARNING, "MPITransfer::sendFragment: No buffers available! Returning RECV_TIMEOUT!");
		return CopyStatus::kTimeout;
	}
	TRACE(5, "MPITransfer::sendFragment: Swapping in fragment to send to buffer %d", buffer_idx);
	Fragment& curfrag = payload_[buffer_idx];
	curfrag = std::move(frag);
	TRACE(5, "sendFragTo before send src=%d dest=%d seqID=%lu type=%d found_idx=%d"
	      , source_rank(), destination_rank(), curfrag.sequenceID(), curfrag.type(), buffer_idx);
	std::unique_lock<std::mutex> lk(mpi_mutex_);
	if (!synchronous_sends_ || force_async)
	{
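		// Asynchronous path: MPI_Isend returns immediately; completion is
		// detected later when findAvailable() tests this buffer's request.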
		TRACE(5, "MPITransfer::sendFragment: Using MPI_Isend");
		MPI_Isend(&*curfrag.headerBegin(),
		          curfrag.size() * sizeof(Fragment::value_type),
		          MPI_BYTE,
		          destination_rank(),
		          1,
		          MPI_COMM_WORLD,
		          &reqs_[buffer_idx]);
	}
	else
	{
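		// Synchronous path: MPI_Ssend does not return until the receiver
		// has started taking the data, so no request needs to be tracked.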
		TRACE(5, "MPITransfer::sendFragment: Using MPI_Ssend");
		MPI_Ssend(&*curfrag.headerBegin(),
		          curfrag.size() * sizeof(Fragment::value_type),
		          MPI_BYTE,
		          destination_rank(),
		          1,
		          MPI_COMM_WORLD);
	}
	TRACE(5, "sendFragTo COMPLETE");

	{
		std::ostringstream debugstream;
		debugstream << "send COMPLETE: "
			<< " buffer_idx=" << buffer_idx
			<< " send_size=" << curfrag.size()
			<< " src=" << source_rank()
			<< " dest=" << destination_rank()
			<< " sequenceID=" << curfrag.sequenceID()
			<< " fragID=" << curfrag.fragmentID()
			<< '\n';
		TRACE(11, debugstream.str().c_str());
	}
	return CopyStatus::kSuccess;
}

DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)