artdaq_mpich_plugin v1_00_06a: MPITransfer.cc
#define TRACE_NAME "MPITransfer"
#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
#include <algorithm>

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"

/*
  Protocol: want to do a send for each request object, then wait for
  pending requests to complete, followed by a reset to allow another set
  of sends to be completed.

  This needs to be separated into a thing for sending and a thing for receiving.
  There probably needs to be a common class that both use.
*/

#define MPI_TAG_HEADER 0x8E  // 142
#define MPI_TAG_DATA 0xDA    // 218
#define USE_RECV 1

std::mutex artdaq::MPITransfer::mpi_mutex_;

artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role)
    , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
    , payload_(buffer_count_)
    , pos_()
{
    TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
                     << "source rank " << source_rank() << ", "
                     << "destination rank " << destination_rank() << ", "
                     << buffer_count_ << " buffers.";

    if (buffer_count_ == 0)
    {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ")
            << "No buffers configured.";
    }
}

artdaq::MPITransfer::~MPITransfer()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
    flush_buffers();
    /*
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Entering Barrier";
    MPI_Barrier(MPI_COMM_WORLD);
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Done with Barrier";
    */
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
}

void artdaq::MPITransfer::flush_buffers()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Collecting requests that need to be waited on";
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii)
    {
        if (reqs_[ii] != MPI_REQUEST_NULL)
        {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0)
    {
        TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Waiting on " << reqs.size() << " reqs.";
        MPI_Waitall(static_cast<int>(reqs.size()), &reqs[0], MPI_STATUSES_IGNORE);
    }
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::transfer_fragment_min_blocking_mode(Fragment const& frag, size_t send_timeout_usec)
{
    return sendFragment(Fragment(frag), send_timeout_usec);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::transfer_fragment_reliable_mode(Fragment&& frag)
{
    return sendFragment(std::move(frag), 0);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::sendFragment(Fragment&& frag, size_t send_timeout_usec)
{
    if (frag.dataSize() > max_fragment_size_words_)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize()
                           << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
                           << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_
                           << " words. Multiple over-size Fragments will exhaust the buffer!";
    }
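    // Note on the request layout used below: reqs_ was sized to
    // 2 * buffer_count_ in the constructor, so each buffer owns a pair of
    // MPI_Request slots: the even-indexed slot tracks the header send and
    // the following odd-indexed slot tracks the payload send. That is why
    // buffer_idx is computed as req_idx / 2.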

    auto start_time = std::chrono::steady_clock::now();

    TLOG(5) << GetTraceName() << ": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
    auto req_idx = findAvailable();
    auto counter = 0;
    while (req_idx == RECV_TIMEOUT && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
    {
        usleep(1000);
        req_idx = findAvailable();
        counter++;
        if (counter % 1000 == 0)
        {
            TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
                            << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
        return CopyStatus::kTimeout;
    }

    TLOG(5) << GetTraceName() << ": sendFragment send slot is " << req_idx;
    auto buffer_idx = req_idx / 2;
    TLOG(5) << GetTraceName() << ": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG(5) << GetTraceName() << ": sendFragment before send src=" << source_rank() << " dest=" << destination_rank()
            << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;

    std::unique_lock<std::mutex> lk(mpi_mutex_);

    // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
    TLOG(5) << GetTraceName() << ": sendFragment: Using MPI_Issend";
    // Waits for the receiver to acknowledge the header
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE,
               destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);

    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE,
               destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
    TLOG(5) << GetTraceName() << ": sendFragment COMPLETE";

    TLOG(11) << GetTraceName() << ": sendFragment COMPLETE:"
             << " buffer_idx=" << buffer_idx
             << " send_size=" << curfrag.size()
             << " src=" << source_rank()
             << " dest=" << destination_rank()
             << " sequenceID=" << curfrag.sequenceID()
             << " fragID=" << curfrag.fragmentID();
    return CopyStatus::kSuccess;
}

int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
    MPI_Status status;
    int wait_result = MPI_SUCCESS;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE,
                  source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << ": receiveFragmentHeader: Start of receiveFragment";

    int flag;
    do
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag)
        {
            usleep(1000);
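            // The wait loop re-acquires mpi_mutex_ on every iteration: the
            // unique_lock above is scoped to one pass through the loop body,
            // so other threads can issue MPI calls between tests. The 1 ms
            // sleep itself, however, happens while the lock is still held.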
            //TLOG(6) << uniqueLabel() << " MPITransfer::receiveFragmentHeader wait loop, flag=" << flag;
        }
    } while (!flag);

    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result;
    TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";

    TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank
            << " Wait_error=" << wait_result
            << " status_error=" << status.MPI_ERROR
            << " source=" << status.MPI_SOURCE
            << " tag=" << status.MPI_TAG
            << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
            << " Fragment_size=" << header.word_count
            << " fragID=" << header.fragment_id;

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": Waitany ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": Waitany ERROR: " << err_buffer;
    }

    //TLOG(TLVL_INFO) << GetTraceName() << " End of receiveFragment";
    return status.MPI_SOURCE;
}

int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
    int wait_result = MPI_SUCCESS;
    MPI_Status status;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE,
                  source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << " Start of receiveFragment";

    int flag;
    do
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag)
        {
            usleep(1000);
            //TLOG(6) << GetTraceName() << ": receiveFragmentData wait loop, flag=" << flag;
        }
    } while (!flag);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }

    //TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result;
    TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << " MPITransfer: Waitany ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << " MPITransfer: Waitany ERROR: " << err_buffer;
    }

    //TLOG(TLVL_INFO) << GetTraceName() << " End of MPITransfer::receiveFragmentData";
    return status.MPI_SOURCE;
}
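// cancelReq_ (below) cancels a single outstanding request. MPI_Cancel only
// marks the request for cancellation; the follow-up MPI_Wait is what actually
// completes and frees it. Failure codes from MPI_Cancel are rethrown as
// art::Exception.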
" End of MPITransfer::receiveFragmentData" ; 00260 return status.MPI_SOURCE; 00261 } 00262 00263 void 00264 artdaq::MPITransfer:: 00265 cancelReq_(MPI_Request req) const 00266 { 00267 if (req == MPI_REQUEST_NULL) return; 00268 00269 TLOG(8) << GetTraceName() << ": Cancelling post"; 00270 00271 std::unique_lock<std::mutex> lk(mpi_mutex_); 00272 int result = MPI_Cancel(&req); 00273 if (result == MPI_SUCCESS) 00274 { 00275 MPI_Status status; 00276 MPI_Wait(&req, &status); 00277 } 00278 else 00279 { 00280 switch (result) 00281 { 00282 case MPI_ERR_REQUEST: 00283 throw art::Exception(art::errors::LogicError, "MPITransfer: ") 00284 << "MPI_Cancel returned MPI_ERR_REQUEST.\n"; 00285 case MPI_ERR_ARG: 00286 throw art::Exception(art::errors::LogicError, "MPITransfer: ") 00287 << "MPI_Cancel returned MPI_ERR_ARG.\n"; 00288 default: 00289 throw art::Exception(art::errors::LogicError, "MPITransfer: ") 00290 << "MPI_Cancel returned unknown error code.\n"; 00291 } 00292 } 00293 } 00294 00295 int artdaq::MPITransfer::findAvailable() 00296 { 00297 int use_me; 00298 int flag = 0, flag2 = 0; 00299 size_t loops = 0; 00300 //TLOG_ARB(5) << uniqueLabel() << " findAvailable initial pos_=" << pos_ ; 00301 do 00302 { 00303 use_me = pos_; 00304 std::unique_lock<std::mutex> lk(mpi_mutex_); 00305 MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE); 00306 if (flag) { 00307 MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE); 00308 } 00309 pos_ = (pos_ + 2) % reqs_.size(); 00310 ++loops; 00311 } while (flag2 == 0 && loops < buffer_count_); 00312 if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; } 00313 TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops; 00314 // pos_ is pointing at the next slot to check 00315 // use_me is pointing at the slot to use 00316 return use_me; 00317 } 00318 00319 DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)