artdaq  v2_03_02
MPI_transfer.cc
1 #include "artdaq/TransferPlugins/MPITransfer.hh"
2 #include <algorithm>
3 
4 #include "canvas/Utilities/Exception.h"
5 #include "cetlib/exception.h"
6 
7 #include "artdaq-core/Data/Fragment.hh"
8 
9 
10 /*
11  Protocol: want to do a send for each request object, then wait for
12  pending requests to complete, followed by a reset to allow another set
13  of sends to be completed.
14 
15  This needs to be separated into a thing for sending and a thing for receiving.
16  There probably needs to be a common class that both use.
17 */
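
To make the protocol comment concrete, here is a minimal sketch (not part of MPI_transfer.cc, and independent of artdaq types) of the send/wait/reset cycle it describes, using plain MPI nonblocking calls; the buffer layout, destination, and tag are illustrative only:

    #include <mpi.h>

    #include <cstddef>
    #include <vector>

    // Sketch only: one nonblocking send per request slot, then wait for the
    // pending requests to complete; each completed request is reset to
    // MPI_REQUEST_NULL, after which another batch of sends can be issued.
    void send_batch(std::vector<std::vector<char>>& buffers, int dest)
    {
      std::vector<MPI_Request> reqs(buffers.size(), MPI_REQUEST_NULL);
      for (std::size_t i = 0; i < buffers.size(); ++i)
      {
        MPI_Isend(buffers[i].data(), static_cast<int>(buffers[i].size()), MPI_BYTE,
                  dest, 1, MPI_COMM_WORLD, &reqs[i]);
      }
      // Wait for all pending requests to complete before reusing the slots.
      MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
    }
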
18 
19 std::mutex artdaq::MPITransfer::mpi_mutex_;
20 
21 artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
22  : TransferInterface(pset, role)
23  , src_status_(status_t::SENDING)
24  , recvd_count_(0)
25  , expected_count_(-1)
26  , payload_(buffer_count_)
27  , synchronous_sends_(pset.get<bool>("synchronous_sends", true))
28  , reqs_(buffer_count_, MPI_REQUEST_NULL)
29  , pos_()
30 {
31  {
32  std::ostringstream debugstream;
33  debugstream << "MPITransfer construction: "
34  << "source rank " << source_rank() << ", "
35  << "destination rank " << destination_rank() << ", "
36  << buffer_count_ << " buffers. ";
37  TRACE(TLVL_TRACE, debugstream.str().c_str());
38  }
39 
40  if (buffer_count_ == 0)
41  {
42  throw art::Exception(art::errors::Configuration, "MPITransfer: ")
43  << "No buffers configured.\n";
44  }
45  // Post all the buffers.
46  for (size_t i = 0; i < buffer_count_; ++i)
47  {
48  // make sure all buffers are the correct size
49  payload_[i].resize(max_fragment_size_words_);
50  // Note that nextSource_() is not used here: it is not necessary to
51  // check whether a source is DONE, and we avoid violating the
52  // precondition of nextSource_().
53  if (role == TransferInterface::Role::kReceive) post_(i);
54  }
55 }
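
Only the synchronous_sends key is read directly in this constructor; buffer_count_ and max_fragment_size_words_ are handled by the TransferInterface base class, whose key names are not visible in this file. A hypothetical configuration sketch, assuming fhiclcpp's make_ParameterSet helper, with the assumed key names marked as such:

    #include "fhiclcpp/ParameterSet.h"
    #include "fhiclcpp/make_ParameterSet.h"

    // Hypothetical FHiCL fragment for constructing an MPITransfer. Only
    // "synchronous_sends" is confirmed by this file; the other keys are
    // assumptions based on the base-class member names.
    fhicl::ParameterSet example_mpi_transfer_pset()
    {
      fhicl::ParameterSet pset;
      fhicl::make_ParameterSet(
        "synchronous_sends: true "            // read via pset.get<bool> above
        "buffer_count: 10 "                   // assumed key (TransferInterface)
        "max_fragment_size_words: 2097152",   // assumed key (TransferInterface)
        pset);
      return pset;
    }
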
56 
57 artdaq::MPITransfer::
58 ~MPITransfer()
59 {
60  TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: BEGIN");
61  if (role() == TransferInterface::Role::kReceive)
62  {
63  // clean up the remaining buffers
64  TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: Cancelling all reqs");
65  for (size_t i = 0; i < buffer_count_; ++i)
66  {
67  cancelReq_(i, false);
68  }
69  }
70  TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: Collecting requests that need to be waited on");
71  std::vector<MPI_Request> reqs;
72  for (size_t ii = 0; ii < reqs_.size(); ++ii)
73  {
74  if (reqs_[ii] != MPI_REQUEST_NULL)
75  {
76  reqs.push_back(reqs_[ii]);
77  }
78  }
79  if (reqs.size() > 0)
80  {
81  TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: Waiting on %zu reqs.", reqs.size());
82  MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
83  }/*
84  TRACE(TLVL_VERBOSE, "MPITransfer::~MPITransfer: Entering Barrier");
85  MPI_Barrier(MPI_COMM_WORLD);
86  TRACE(TLVL_VERBOSE, "MPITransfer::~MPITransfer: Done with Barrier");*/
87  TRACE(TLVL_TRACE, "MPITransfer::~MPITransfer: DONE");
88 }
89 
90 int
91 artdaq::MPITransfer::
92 receiveFragment(Fragment& output, size_t timeout_usec)
93 {
94  TRACE(6, "MPITransfer::receiveFragment entered tmo=%lu us", timeout_usec);
95  //TLOG_DEBUG(uniqueLabel()) << "Start of receiveFragment" << TLOG_ENDL;
96  int wait_result;
97  int which;
98  MPI_Status status;
99 
100  if (timeout_usec > 0)
101  {
102 #if USE_TESTSOME
103  if (ready_indices_.size() == 0) {
104  ready_indices_.resize(buffer_count_, -1);
105  ready_statuses_.resize(buffer_count_);
106 
107  int readyCount = 0;
108  wait_result = MPI_Testsome(buffer_count_, &(reqs_[0]), &readyCount, &(ready_indices_[0]), &(ready_statuses_[0]));
109  if (readyCount > 0) {
110  saved_wait_result_ = wait_result;
111  ready_indices_.resize(readyCount);
112  ready_statuses_.resize(readyCount);
113  }
114  else {
115  size_t sleep_loops = 10;
116  size_t sleep_time = timeout_usec / sleep_loops;
117  if (sleep_time > 250) {
118  sleep_time = 250;
119  sleep_loops = timeout_usec / sleep_time;
120  }
121  for (size_t idx = 0; idx < sleep_loops; ++idx) {
122  usleep(sleep_time);
123  wait_result = MPI_Testsome(buffer_count_, &reqs_[0], &readyCount,
124  &ready_indices_[0], &ready_statuses_[0]);
125  if (readyCount > 0) { break; }
126  }
127  if (readyCount > 0) {
128  saved_wait_result_ = wait_result;
129  ready_indices_.resize(readyCount);
130  ready_statuses_.resize(readyCount);
131  }
132  else {
133  ready_indices_.clear();
134  ready_statuses_.clear();
135  }
136  }
137  }
138  if (ready_indices_.size() > 0) {
139  wait_result = saved_wait_result_;
140  which = ready_indices_.front();
141  status = ready_statuses_.front();
142  ready_indices_.erase(ready_indices_.begin());
143  ready_statuses_.erase(ready_statuses_.begin());
144  }
145  else {
146  return RECV_TIMEOUT;
147  }
148 #else
149  int flag = 0;
150  //TLOG_DEBUG(uniqueLabel()) << "Before first Testany buffer_count_=" << buffer_count_ << ", reqs_.size()=" << reqs_.size() << ", &reqs_[0]=" << &reqs_[0] << ", which=" << which << ", flag=" << flag << TLOG_ENDL;
151  {
152  std::unique_lock<std::mutex> lk(mpi_mutex_);
153  wait_result = MPI_Testany(buffer_count_, &reqs_[0], &which, &flag, &status);
154  }
155  if (!flag)
156  {
157  size_t sleep_loops = 10;
158  size_t sleep_time = timeout_usec / sleep_loops;
159  if (sleep_time > 250)
160  {
161  sleep_time = 250;
162  sleep_loops = timeout_usec / sleep_time;
163  }
164  for (size_t idx = 0; idx < sleep_loops; ++idx)
165  {
166  usleep(sleep_time);
167  //TLOG_DEBUG(uniqueLabel()) << "Before second Testany buffer_count_=" << buffer_count_ << ", reqs_.size()=" << reqs_.size() << ", &reqs_[0]=" << &reqs_[0] << ", which=" << which << ", flag=" << flag << TLOG_ENDL;
168  {
169  std::unique_lock<std::mutex> lk(mpi_mutex_);
170  wait_result = MPI_Testany(buffer_count_, &reqs_[0], &which, &flag, &status);
171  }
172  if (flag || which >= 0) { break; }
173  }
174  if (!flag)
175  {
176  return RECV_TIMEOUT;
177  }
178  }
179 #endif
180  }
181  else
182  {
183  {
184  std::unique_lock<std::mutex> lk(mpi_mutex_);
185  wait_result = MPI_Waitany(buffer_count_, &reqs_[0], &which, &status);
186  }
187  }
188  //TLOG_DEBUG(uniqueLabel()) << "After testing/waiting res=" << wait_result << TLOG_ENDL;
189  TRACE(8, "recvFragment recvd");
190 
191  if (which == MPI_UNDEFINED)
192  {
193  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
194  << "MPI_UNDEFINED returned as an index value from Waitany.\n";
195  }
196  if (reqs_[which] != MPI_REQUEST_NULL)
197  {
198  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
199  << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in recvFragment.\n";
200  }
201  Fragment::sequence_id_t sequence_id = payload_[which].sequenceID();
202 
203  {
204  std::ostringstream debugstream;
205  debugstream << "recv: " << my_rank
206  << " idx=" << which
207  << " Waitany_error=" << wait_result
208  << " status_error=" << status.MPI_ERROR
209  << " source=" << status.MPI_SOURCE
210  << " tag=" << status.MPI_TAG
211  << " Fragment_sequenceID=" << sequence_id
212  << " Fragment_size=" << payload_[which].size()
213  << " preAutoResize_Fragment_dataSize=" << payload_[which].dataSize()
214  << " fragID=" << payload_[which].fragmentID()
215  << '\n';
216  //TLOG_INFO(uniqueLabel()) << debugstream.str() << TLOG_ENDL;
217  TRACE(4, debugstream.str().c_str());
218  }
219  char err_buffer[MPI_MAX_ERROR_STRING];
220  int resultlen;
221  switch (wait_result)
222  {
223  case MPI_SUCCESS:
224  break;
225  case MPI_ERR_IN_STATUS:
226  MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
227  TLOG_ERROR(uniqueLabel())
228  << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
229  break;
230  default:
231  MPI_Error_string(wait_result, err_buffer, &resultlen);
232  TLOG_ERROR(uniqueLabel())
233  << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
234  }
235  // The Fragment at index 'which' is now available.
236  // Resize (down) to size to remove trailing garbage.
237  TRACE(7, "recvFragment before autoResize/swap");
238  //TLOG_INFO(uniqueLabel()) << "receiveFragment before resizing for output" << TLOG_ENDL;
239  payload_[which].autoResize();
240  output.swap(payload_[which]);
241  TRACE(7, "recvFragment after autoResize/swap seqID=%lu. "
242  "Reset our buffer. max=%zu adr=%p"
243  , output.sequenceID(), max_fragment_size_words_, (void*)output.headerAddress());
244  // Reset our buffer.
245  Fragment tmp(max_fragment_size_words_);
246  TRACE(7, "recvFragment before payload_[which].swap(tmp) adr=%p", (void*)tmp.headerAddress());
247  payload_[which].swap(tmp);
248  TRACE(7, "recvFragment after payload_[which].swap(tmp)");
249  // Fragment accounting.
250  if (output.type() == Fragment::EndOfDataFragmentType)
251  {
252  src_status_ = status_t::PENDING;
253  expected_count_ = *output.dataBegin();
254 
255  {
256  std::ostringstream debugstream;
257  debugstream << "Received EOD from source " << status.MPI_SOURCE
258  << " expecting total of "
259  << *output.dataBegin() << " fragments" << '\n';
260  //TLOG_INFO(uniqueLabel()) << debugstream.str() << TLOG_ENDL;
261  TRACE(4, debugstream.str().c_str());
262  }
263  }
264  else
265  {
266  recvd_count_++;
267  }
268  switch (src_status_)
269  {
270  case status_t::PENDING:
271  {
272  std::ostringstream debugstream;
273  debugstream << "Checking received count "
274  << recvd_count_
275  << " against expected total "
276  << expected_count_
277  << '\n';
278  //TLOG_INFO(uniqueLabel()) << debugstream.str() << TLOG_ENDL;
279  TRACE(4, debugstream.str().c_str());
280  }
281  if (recvd_count_ == expected_count_)
282  {
283  src_status_ = status_t::DONE;
284  }
285  break;
286  case status_t::DONE:
287  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
288  << "Received extra fragments from source "
289  << status.MPI_SOURCE
290  << ".\n";
291  case status_t::SENDING:
292  break;
293  default:
294  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
295  << "INTERNAL ERROR: Unrecognized status_t value "
296  << static_cast<int>(src_status_)
297  << ".\n";
298  }
299  // Repost to receive more data.
300  if (src_status_ == status_t::DONE)
301  { // Just happened.
302  if (nextSource_() != MPI_ANY_SOURCE)
303  { // No active sources left.
304  post_(which); // This buffer doesn't need cancelling.
305  }
306  }
307  else
308  {
309  post_(which);
310  }
311  //TLOG_INFO(uniqueLabel()) << "End of receiveFragment" << TLOG_ENDL;
312  return status.MPI_SOURCE;
313 }
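
The timeout branch above amounts to polling MPI_Testany with short sleeps until either a request completes or the time budget is spent. A standalone sketch of that pattern, assuming plain MPI and the same 250 microsecond poll cap used above; the helper name and the -1 timeout return value are illustrative:

    #include <mpi.h>
    #include <unistd.h>

    #include <cstddef>
    #include <vector>

    // Sketch only: returns the index of a completed request, or -1 on timeout
    // (the caller would map that to RECV_TIMEOUT).
    int wait_any_with_timeout(std::vector<MPI_Request>& reqs, std::size_t timeout_usec)
    {
      const std::size_t poll_usec = 250;
      int which = MPI_UNDEFINED;
      int flag = 0;
      MPI_Status status;
      for (std::size_t elapsed = 0; elapsed <= timeout_usec; elapsed += poll_usec)
      {
        MPI_Testany(static_cast<int>(reqs.size()), reqs.data(), &which, &flag, &status);
        // flag is also set when no active requests remain, with which == MPI_UNDEFINED.
        if (flag && which != MPI_UNDEFINED) { return which; }
        usleep(poll_usec);
      }
      return -1;
    }
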
314 
315 int
316 artdaq::MPITransfer::
317 nextSource_()
318 {
319  // Precondition: last_source_posted_ must be set. This is ensured
320  // provided nextSource_() is never called from the constructor.
321  if (src_status_ != status_t::DONE)
322  {
323  return source_rank();
324  }
325  return MPI_ANY_SOURCE;
326 }
327 
328 void
329 artdaq::MPITransfer::
330 cancelReq_(size_t buf, bool blocking_wait)
331 {
332  if (reqs_[buf] == MPI_REQUEST_NULL) return;
333 
334  {
335  std::ostringstream debugstream;
336  debugstream << "Cancelling post for buffer "
337  << buf
338  << '\n';
339  TRACE(4, debugstream.str().c_str());
340  //TLOG_INFO(uniqueLabel()) << debugstream.str() << TLOG_ENDL;
341  }
342 
343  std::unique_lock<std::mutex> lk(mpi_mutex_);
344  int result = MPI_Cancel(&reqs_[buf]);
345  if (result == MPI_SUCCESS)
346  {
347  MPI_Status status;
348  if (blocking_wait)
349  {
350  MPI_Wait(&reqs_[buf], &status);
351  }
352  else
353  {
354  int doneFlag;
355  MPI_Test(&reqs_[buf], &doneFlag, &status);
356  if (!doneFlag)
357  {
358  size_t sleep_loops = 10;
359  size_t sleep_time = 100000;
360  for (size_t idx = 0; idx < sleep_loops; ++idx)
361  {
362  usleep(sleep_time);
363  MPI_Test(&reqs_[buf], &doneFlag, &status);
364  if (doneFlag) { break; }
365  }
366  if (!doneFlag)
367  {
368  TLOG_ERROR(uniqueLabel())
369  << "MPITransfer::cancelReq_: Timeout waiting to cancel the request for MPI buffer "
370  << buf << TLOG_ENDL;
371  }
372  }
373  }
374  }
375  else
376  {
377  switch (result)
378  {
379  case MPI_ERR_REQUEST:
380  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
381  << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
382  case MPI_ERR_ARG:
383  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
384  << "MPI_Cancel returned MPI_ERR_ARG.\n";
385  default:
386  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
387  << "MPI_Cancel returned unknown error code.\n";
388  }
389  }
390 }
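
cancelReq_ follows the MPI rule that a cancelled request must still be completed before its resources are released. A minimal sketch of that contract, independent of artdaq, using a blocking wait and MPI_Test_cancelled to check whether the cancel actually took effect; the function name is illustrative:

    #include <mpi.h>

    // Sketch only: cancel a posted request, complete it, and report whether the
    // cancellation succeeded.
    void cancel_and_complete(MPI_Request& req)
    {
      if (req == MPI_REQUEST_NULL) { return; }
      MPI_Cancel(&req);
      MPI_Status status;
      MPI_Wait(&req, &status);  // completes the request and resets it to MPI_REQUEST_NULL
      int was_cancelled = 0;
      MPI_Test_cancelled(&status, &was_cancelled);
      // was_cancelled == 0 means a real message matched before the cancel, so the
      // receive buffer contents are valid and should not be discarded.
    }
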
391 
392 void
393 artdaq::MPITransfer::
394 post_(size_t buf)
395 {
396  {
397  std::ostringstream debugstream;
398  debugstream << "Posting buffer " << buf
399  << " size=" << payload_[buf].size()
400  << " header address=0x" << std::hex << payload_[buf].headerAddress() << std::dec
401  << '\n';
402  TRACE(4, debugstream.str().c_str());
403  //TLOG_INFO(uniqueLabel()) << debugstream.str() << TLOG_ENDL;
404  }
405 
406  std::unique_lock<std::mutex> lk(mpi_mutex_);
407  MPI_Irecv(&*payload_[buf].headerBegin(),
408  (payload_[buf].size() * sizeof(Fragment::value_type)),
409  MPI_BYTE,
410  source_rank(),
411  MPI_ANY_TAG,
412  MPI_COMM_WORLD,
413  &reqs_[buf]);
414 }
415 
416 int artdaq::MPITransfer::findAvailable()
417 {
418  int use_me;
419  int flag;
420  size_t loops = 0;
421  TRACE(5, "findAvailable initial pos_=%d", pos_);
422  do
423  {
424  use_me = pos_;
425  std::unique_lock<std::mutex> lk(mpi_mutex_);
426  MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
427  pos_ = (pos_ + 1) % buffer_count_;
428  ++loops;
429  } while (!flag && loops < buffer_count_);
430  if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
431  TRACE(5, "findAvailable returning use_me=%d loops=%zu", use_me, loops);
432  // pos_ is pointing at the next slot to check
433  // use_me is pointing at the slot to use
434  return use_me;
435 }
436 
437 artdaq::TransferInterface::CopyStatus
438 artdaq::MPITransfer::
439 moveFragment(Fragment&& frag, size_t send_timeout_usec)
440 {
441  return sendFragment(std::move(frag), send_timeout_usec, false);
442 }
443 
444 artdaq::TransferInterface::CopyStatus
445 artdaq::MPITransfer::
446 copyFragment(Fragment& frag, size_t send_timeout_usec)
447 {
448  return sendFragment(std::move(frag), send_timeout_usec, true);
449 }
450 
451 
452 artdaq::TransferInterface::CopyStatus
453 artdaq::MPITransfer::
454 sendFragment(Fragment&& frag, size_t send_timeout_usec, bool force_async)
455 {
456  TRACE(5, "copyFragmentTo timeout unused: %zu", send_timeout_usec);
457  if (frag.dataSize() > max_fragment_size_words_)
458  {
459  throw cet::exception("Unimplemented")
460  << "Currently unable to deal with overlarge fragment payload ("
461  << frag.dataSize()
462  << " words > "
463  << max_fragment_size_words_
464  << ").";
465  }
466 
467  TRACE(5, "MPITransfer::sendFragment: Checking whether to force async mode...");
468  if (frag.type() == Fragment::EndOfDataFragmentType)
469  {
470  TRACE(5, "MPITransfer::sendFragment: EndOfDataFragment detected. Forcing async mode");
471  force_async = true;
472  }
473  TRACE(5, "MPITransfer::sendFragment: Finding available buffer");
474  int buffer_idx = findAvailable();
475  if (buffer_idx == TransferInterface::RECV_TIMEOUT)
476  {
477  TRACE(TLVL_WARNING, "MPITransfer::sendFragment: No buffers available! Returning RECV_TIMEOUT!");
478  return CopyStatus::kTimeout;
479  }
480  TRACE(5, "MPITransfer::sendFragment: Swapping in fragment to send to buffer %d", buffer_idx);
481  Fragment& curfrag = payload_[buffer_idx];
482  curfrag = std::move(frag);
483  TRACE(5, "sendFragTo before send src=%d dest=%d seqID=%lu type=%d found_idx=%d"
484  , source_rank(), destination_rank(), curfrag.sequenceID(), curfrag.type(), buffer_idx);
485  std::unique_lock<std::mutex> lk(mpi_mutex_);
486  if (!synchronous_sends_ || force_async)
487  {
488  // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
489  TRACE(5, "MPITransfer::sendFragment: Using MPI_Isend");
490  MPI_Isend(&*curfrag.headerBegin(),
491  curfrag.size() * sizeof(Fragment::value_type),
492  MPI_BYTE,
493  destination_rank(),
494  1,
495  MPI_COMM_WORLD,
496  &reqs_[buffer_idx]);
497  }
498  else
499  {
500  // 14-Sep-2015, KAB: switched from MPI_Send to MPI_Ssend based on
501  // http://www.mcs.anl.gov/research/projects/mpi/sendmode.html.
502  // This change was made after we noticed that MPI buffering
503  // downstream of RootMPIOutput was causing EventBuilder memory
504  // usage to grow when using MPI_Send with MPICH 3.1.4 and 3.1.2a.
505  TRACE(5, "MPITransfer::sendFragment: Using MPI_Ssend");
506  MPI_Ssend(&*curfrag.headerBegin(),
507  curfrag.size() * sizeof(Fragment::value_type),
508  MPI_BYTE,
509  destination_rank(),
510  1,
511  MPI_COMM_WORLD);
512  }
513  TRACE(5, "sendFragTo COMPLETE");
514 
515  {
516  std::ostringstream debugstream;
517  debugstream << "send COMPLETE: "
518  << " buffer_idx=" << buffer_idx
519  << " send_size=" << curfrag.size()
520  << " src=" << source_rank()
521  << " dest=" << destination_rank()
522  << " sequenceID=" << curfrag.sequenceID()
523  << " fragID=" << curfrag.fragmentID()
524  << '\n';
525  TRACE(11, debugstream.str().c_str());
526  }
527  return CopyStatus::kSuccess;
528 }
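
The branch above chooses between a nonblocking MPI_Isend and a synchronous MPI_Ssend, which does not return until the receiver has begun taking the message and therefore cannot accumulate data in MPI's internal buffers (the memory-growth issue noted in the 14-Sep-2015 comment). A reduced sketch of that choice, with illustrative arguments:

    #include <mpi.h>

    // Sketch only: select between synchronous and nonblocking send modes. The
    // nonblocking variant must be completed later via MPI_Test / MPI_Wait on *req.
    void send_block(void* buf, int count, int dest, bool synchronous, MPI_Request* req)
    {
      if (synchronous)
      {
        // Blocks until the matching receive has been posted and started.
        MPI_Ssend(buf, count, MPI_BYTE, dest, 1, MPI_COMM_WORLD);
        *req = MPI_REQUEST_NULL;  // nothing left to complete
      }
      else
      {
        // Returns immediately; completion is checked later.
        MPI_Isend(buf, count, MPI_BYTE, dest, 1, MPI_COMM_WORLD, req);
      }
    }
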
529 
530 DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)