artdaq_mpich_plugin  v1_00_02
MPI_transfer.cc
#define TRACE_NAME "MPITransfer"
#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
#include <algorithm>

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"

/*
  Protocol: want to do a send for each request object, then wait for
  pending requests to complete, followed by a reset to allow another set
  of sends to be completed.

  This needs to be separated into a thing for sending and a thing for receiving.
  There probably needs to be a common class that both use.
*/

#define MPI_TAG_HEADER 0x8E  // 142
#define MPI_TAG_DATA 0xDA    // 218
#define USE_RECV 1
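// Fragment headers travel with MPI_TAG_HEADER and payloads with MPI_TAG_DATA.
// Each transfer buffer owns a pair of adjacent slots in reqs_ (header request at
// 2*i, data request at 2*i+1), which is why reqs_ is sized 2 * buffer_count_ below.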

std::mutex artdaq::MPITransfer::mpi_mutex_;

artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role)
    , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
    , payload_(buffer_count_)
    , pos_()
{
    TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
                     << "source rank " << source_rank() << ", "
                     << "destination rank " << destination_rank() << ", "
                     << buffer_count_ << " buffers. ";

    if (buffer_count_ == 0)
    {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ")
            << "No buffers configured.";
    }
}

artdaq::MPITransfer::
~MPITransfer()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: Collecting requests that need to be waited on";
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii)
    {
        if (reqs_[ii] != MPI_REQUEST_NULL)
        {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0)
    {
        TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: Waiting on " << reqs.size() << " reqs.";
        MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
    }
    /*
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Entering Barrier");
    MPI_Barrier(MPI_COMM_WORLD);
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Done with Barrier");*/
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
copyFragment(Fragment& frag, size_t send_timeout_usec)
{
    return sendFragment(std::move(frag), send_timeout_usec);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
moveFragment(Fragment&& frag)
{
    return sendFragment(std::move(frag), 0);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
sendFragment(Fragment&& frag, size_t send_timeout_usec)
{
    if (frag.dataSize() > max_fragment_size_words_)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
                           << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!";
    }

    auto start_time = std::chrono::steady_clock::now();

    TLOG(5) << GetTraceName() << ": moveFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
    auto req_idx = findAvailable();
    auto counter = 0;
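    // Poll findAvailable() every ~1 ms until a request-slot pair frees up or the
    // timeout expires; a send_timeout_usec of 0 means wait indefinitely.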
    while (req_idx == RECV_TIMEOUT && (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
    {
        usleep(1000);
        req_idx = findAvailable();
        counter++;
        if (counter % 1000 == 0)
        {
            TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
                            << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
        return CopyStatus::kTimeout;
    }

    TLOG(5) << GetTraceName() << ": moveFragment send slot is " << req_idx;
    auto buffer_idx = req_idx / 2;
    TLOG(5) << GetTraceName() << ": moveFragment: Swapping in fragment to send to buffer " << buffer_idx;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG(5) << GetTraceName() << ": moveFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;

    std::unique_lock<std::mutex> lk(mpi_mutex_);

    // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
    TLOG(5) << GetTraceName() << ": moveFragment: Using MPI_Issend";
    // Waits for the receiver to acknowledge the header
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);

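    // Send the payload (everything after the header) with a second synchronous send
    // on the adjacent request slot, tagged MPI_TAG_DATA.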
    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
    TLOG(5) << GetTraceName() << ": moveFragment COMPLETE";

    TLOG(11) << GetTraceName() << ": moveFragment COMPLETE: "
             << " buffer_idx=" << buffer_idx
             << " send_size=" << curfrag.size()
             << " src=" << source_rank()
             << " dest=" << destination_rank()
             << " sequenceID=" << curfrag.sequenceID()
             << " fragID=" << curfrag.fragmentID();
    return CopyStatus::kSuccess;
}

int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
    MPI_Status status;
    int wait_result = MPI_SUCCESS;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << ": receiveFragmentHeader: Start of receiveFragment" ;

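    // Poll the header receive with MPI_Test, holding mpi_mutex_ for each probe so MPI
    // calls from all transfer instances are serialized; sleep ~1 ms between probes.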
    int flag;
    do {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag) {
            usleep(1000);
            //TLOG_ARB(6) << uniqueLabel() << " MPITransfer::receiveFragmentHeader wait loop, flag=" << flag ;
        }
    } while (!flag);

    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
    TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";

    {
        TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank
                << " Wait_error=" << wait_result
                << " status_error=" << status.MPI_ERROR
                << " source=" << status.MPI_SOURCE
                << " tag=" << status.MPI_TAG
                << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
                << " Fragment_size=" << header.word_count
                << " fragID=" << header.fragment_id;
    }
    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
    case MPI_SUCCESS:
        break;
    case MPI_ERR_IN_STATUS:
        MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
        TLOG(TLVL_ERROR) << GetTraceName()
                         << ": Waitany ERROR: " << err_buffer;
        break;
    default:
        MPI_Error_string(wait_result, err_buffer, &resultlen);
        TLOG(TLVL_ERROR) << GetTraceName()
                         << ": Waitany ERROR: " << err_buffer;
    }

    //TLOG_INFO) << GetTraceName() << " End of receiveFragment" ;
    return status.MPI_SOURCE;
}

int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
    int wait_result = MPI_SUCCESS;
    MPI_Status status;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
    }
    //TLOG(TLVL_DEBUG) << GetTraceName() << " Start of receiveFragment" ;

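    // Same polling pattern as receiveFragmentHeader: test the data receive under
    // mpi_mutex_ and sleep ~1 ms between probes until it completes.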
    int flag;
    do {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag) {
            usleep(1000);
            //TLOG(6) << GetTraceName() << ": receiveFragmentData wait loop, flag=" << flag ;
        }
    } while (!flag);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }

    //TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
    TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
    case MPI_SUCCESS:
        break;
    case MPI_ERR_IN_STATUS:
        MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
        TLOG(TLVL_ERROR) << GetTraceName()
                         << " MPITransfer: Waitany ERROR: " << err_buffer;
        break;
    default:
        MPI_Error_string(wait_result, err_buffer, &resultlen);
        TLOG(TLVL_ERROR) << GetTraceName()
                         << " MPITransfer: Waitany ERROR: " << err_buffer;
    }

    //TLOG_INFO) << GetTraceName() << " End of MPITransfer::receiveFragmentData" ;
    return status.MPI_SOURCE;
}

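// Cancel an outstanding MPI request and wait for the cancellation to complete,
// translating MPI error codes into art::Exception.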
void
artdaq::MPITransfer::
cancelReq_(MPI_Request req) const
{
    if (req == MPI_REQUEST_NULL) return;

    TLOG(8) << GetTraceName() << ": Cancelling post";

    std::unique_lock<std::mutex> lk(mpi_mutex_);
    int result = MPI_Cancel(&req);
    if (result == MPI_SUCCESS)
    {
        MPI_Status status;
        MPI_Wait(&req, &status);
    }
    else
    {
        switch (result)
        {
        case MPI_ERR_REQUEST:
            throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
        case MPI_ERR_ARG:
            throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                << "MPI_Cancel returned MPI_ERR_ARG.\n";
        default:
            throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                << "MPI_Cancel returned unknown error code.\n";
        }
    }
}

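// Scan the request-slot pairs round-robin, starting at pos_, for a buffer whose header
// and data sends have both completed; returns RECV_TIMEOUT if no pair is free after
// checking every buffer once.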
int artdaq::MPITransfer::findAvailable()
{
    int use_me;
    int flag, flag2 = 0;
    size_t loops = 0;
    //TLOG_ARB(5) << uniqueLabel() << " findAvailable initial pos_=" << pos_ ;
    do
    {
        use_me = pos_;
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
        if (flag) {
            MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
        }
        else {
            flag2 = 0;  // Header request still pending; this buffer is not available
        }
        pos_ = (pos_ + 2) % reqs_.size();
        ++loops;
    } while (!flag2 && loops < buffer_count_);
    if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
    TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
    // pos_ is pointing at the next slot to check
    // use_me is pointing at the slot to use
    return use_me;
}

DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)