artdaq_mpich_plugin  v1_00_08
MPI_transfer.cc
#define TRACE_NAME "MPITransfer"
#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
#include <algorithm>

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"

/*
 Protocol: want to do a send for each request object, then wait for
 pending requests to complete, followed by a reset to allow another set
 of sends to be completed.

 This needs to be separated into a thing for sending and a thing for receiving.
 There probably needs to be a common class that both use.
*/
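/*
 Concretely, each Fragment is transferred as two MPI messages: the
 RawFragmentHeader under MPI_TAG_HEADER, followed by the payload under
 MPI_TAG_DATA. The receiver posts the matching receives in
 receiveFragmentHeader() and receiveFragmentData(), and each of the
 buffer_count_ send buffers owns an adjacent pair of slots in reqs_ that
 track those two sends.
*/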

#define MPI_TAG_HEADER 0x8E  // 142
#define MPI_TAG_DATA 0xDA    // 218
#define USE_RECV 1
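// All MPI calls on the send and receive paths below are made while holding this
// single static mutex, so the sender and receiver threads never enter the MPI
// library concurrently (presumably because the MPI runtime is not assumed to
// provide MPI_THREAD_MULTIPLE).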
std::mutex artdaq::MPITransfer::mpi_mutex_;

artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role)
    , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
    , payload_(buffer_count_)
    , pos_()
{
    TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
                     << "source rank " << source_rank() << ", "
                     << "destination rank " << destination_rank() << ", "
                     << buffer_count_ << " buffers. ";

    if (buffer_count_ == 0)
    {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ")
            << "No buffers configured.";
    }
}

artdaq::MPITransfer::~MPITransfer()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
    flush_buffers();
    /*
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Entering Barrier");
    MPI_Barrier(MPI_COMM_WORLD);
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Done with Barrier");*/
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
}

void artdaq::MPITransfer::flush_buffers()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Collecting requests that need to be waited on";
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii) {
        if (reqs_[ii] != MPI_REQUEST_NULL) {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0) {
        TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Waiting on " << reqs.size() << " reqs.";
        MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
    }
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
transfer_fragment_min_blocking_mode(Fragment const& frag, size_t send_timeout_usec)
{
    return sendFragment(Fragment(frag), send_timeout_usec);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
transfer_fragment_reliable_mode(Fragment&& frag)
{
    return sendFragment(std::move(frag), 0);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
sendFragment(Fragment&& frag, size_t send_timeout_usec)
{
    if (frag.dataSize() > max_fragment_size_words_)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
                           << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!";
    }

    auto start_time = std::chrono::steady_clock::now();

    TLOG(5) << GetTraceName() << ": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
    auto req_idx = findAvailable();
    auto counter = 0;
    while (req_idx == RECV_TIMEOUT && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
    {
        usleep(1000);
        req_idx = findAvailable();
        counter++;
        if (counter % 1000 == 0)
        {
            TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
                            << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
        return CopyStatus::kTimeout;
    }

    TLOG(5) << GetTraceName() << ": sendFragment send slot is " << req_idx;
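    // reqs_ holds two requests per send buffer: slot 2*i tracks the header send
    // and slot 2*i+1 the data send, so the buffer index is req_idx / 2.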
    auto buffer_idx = req_idx / 2;
    TLOG(5) << GetTraceName() << ": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG(5) << GetTraceName() << ": sendFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;

    std::unique_lock<std::mutex> lk(mpi_mutex_);

    // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
    TLOG(5) << GetTraceName() << ": sendFragment: Using MPI_Issend";
    // Waits for the receiver to acknowledge the header
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
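    // MPI_Issend is a nonblocking synchronous-mode send: the request does not
    // complete until the receiver has started a matching receive, so a completed
    // request pair in findAvailable() implies the receiver has picked up that
    // buffer and it is safe to reuse.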

    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
    TLOG(5) << GetTraceName() << ": sendFragment COMPLETE";

    TLOG(11) << GetTraceName() << ": sendFragment COMPLETE: "
             << " buffer_idx=" << buffer_idx
             << " send_size=" << curfrag.size()
             << " src=" << source_rank()
             << " dest=" << destination_rank()
             << " sequenceID=" << curfrag.sequenceID()
             << " fragID=" << curfrag.fragmentID();
    return CopyStatus::kSuccess;
}

int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
    MPI_Status status;
    int wait_result = MPI_SUCCESS;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << ": receiveFragmentHeader: Start of receiveFragment" ;

    int flag;
    do {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag) {
            usleep(1000);
            //TLOG_ARB(6) << uniqueLabel() << " MPITransfer::receiveFragmentHeader wait loop, flag=" << flag ;
        }
    } while (!flag);
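    // Polling with MPI_Test under the shared mutex (plus a 1 ms sleep) lets the
    // send side interleave its own MPI calls. On completion MPI_Test frees the
    // request and sets it to MPI_REQUEST_NULL, which the check below relies on.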

    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
    TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";

    {
        TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank
                << " Wait_error=" << wait_result
                << " status_error=" << status.MPI_ERROR
                << " source=" << status.MPI_SOURCE
                << " tag=" << status.MPI_TAG
                << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
                << " Fragment_size=" << header.word_count
                << " fragID=" << header.fragment_id;
    }
    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << ": Waitany ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << ": Waitany ERROR: " << err_buffer;
    }

    //TLOG_INFO) << GetTraceName() << " End of receiveFragment" ;
    return status.MPI_SOURCE;
}

int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
    int wait_result = MPI_SUCCESS;
    MPI_Status status;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << " Start of receiveFragment" ;

    int flag;
    do {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag) {
            usleep(1000);
            //TLOG(6) << GetTraceName() << ": receiveFragmentData wait loop, flag=" << flag ;
        }
    } while (!flag);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }

    //TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
    TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << " MPITransfer: Waitany ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName()
                             << " MPITransfer: Waitany ERROR: " << err_buffer;
    }

    //TLOG_INFO) << GetTraceName() << " End of MPITransfer::receiveFragmentData" ;
    return status.MPI_SOURCE;
}

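// Cancel a still-pending request and then wait for it so MPI can release the
// request object. (cancelReq_ has no call sites in this file.)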
void
artdaq::MPITransfer::
cancelReq_(MPI_Request req) const
{
    if (req == MPI_REQUEST_NULL) return;

    TLOG(8) << GetTraceName() << ": Cancelling post";

    std::unique_lock<std::mutex> lk(mpi_mutex_);
    int result = MPI_Cancel(&req);
    if (result == MPI_SUCCESS)
    {
        MPI_Status status;
        MPI_Wait(&req, &status);
    }
    else
    {
        switch (result)
        {
            case MPI_ERR_REQUEST:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                    << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
            case MPI_ERR_ARG:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                    << "MPI_Cancel returned MPI_ERR_ARG.\n";
            default:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                    << "MPI_Cancel returned unknown error code.\n";
        }
    }
}

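// Scan the request pairs round-robin starting at pos_ and return the index of
// the first pair whose header and data sends have both completed (that buffer
// is free for reuse), or TransferInterface::RECV_TIMEOUT if no pair frees up
// within one full pass over the buffers.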
int artdaq::MPITransfer::findAvailable()
{
    int use_me;
    int flag = 0, flag2 = 0;
    size_t loops = 0;
    //TLOG_ARB(5) << uniqueLabel() << " findAvailable initial pos_=" << pos_ ;
    do
    {
        use_me = pos_;
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
        if (flag) {
            MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
        }
        pos_ = (pos_ + 2) % reqs_.size();
        ++loops;
    } while (flag2 == 0 && loops < buffer_count_);
    if (flag2 == 0) { return TransferInterface::RECV_TIMEOUT; }
    TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
    // pos_ is pointing at the next slot to check
    // use_me is pointing at the slot to use
    return use_me;
}

DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)