artdaq v3_00_03
MPI_transfer.cc
#define TRACE_NAME "MPITransfer"
#include "artdaq/TransferPlugins/MPITransfer.hh"
#include <algorithm>

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"


/*
 Protocol: perform a send for each request object, then wait for the
 pending requests to complete, followed by a reset to allow another set
 of sends to proceed.

 This should be separated into a sender class and a receiver class,
 probably sharing a common base class.
*/

#define MPI_TAG_HEADER 0x8E // 142
#define MPI_TAG_DATA 0xDA // 218
#define USE_RECV 1

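/*
 Illustrative sketch (not part of the original source): each Fragment crosses
 the wire as two matched messages, a fixed-size header tagged MPI_TAG_HEADER
 followed by the payload tagged MPI_TAG_DATA. Stripped of the request
 bookkeeping used below, one transfer between a sender rank D-bound and a
 receiver reading from rank S (S, D, and the local variable names here are
 hypothetical) reduces to:

    // sender side: frag is a filled artdaq::Fragment, D the destination rank
    MPI_Request header_req, data_req;
    MPI_Issend(frag.headerAddress(),
               artdaq::detail::RawFragmentHeader::num_words() * sizeof(artdaq::RawDataType),
               MPI_BYTE, D, MPI_TAG_HEADER, MPI_COMM_WORLD, &header_req);
    MPI_Issend(frag.headerAddress() + artdaq::detail::RawFragmentHeader::num_words(),
               (frag.size() - artdaq::detail::RawFragmentHeader::num_words()) * sizeof(artdaq::RawDataType),
               MPI_BYTE, D, MPI_TAG_DATA, MPI_COMM_WORLD, &data_req);

    // receiver side: header first, since it carries the payload word count
    artdaq::detail::RawFragmentHeader hdr;
    MPI_Recv(&hdr, hdr.num_words() * sizeof(artdaq::RawDataType),
             MPI_BYTE, S, MPI_TAG_HEADER, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    std::vector<artdaq::RawDataType> body(hdr.word_count - hdr.num_words());
    MPI_Recv(body.data(), body.size() * sizeof(artdaq::RawDataType),
             MPI_BYTE, S, MPI_TAG_DATA, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
*/
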
std::mutex artdaq::MPITransfer::mpi_mutex_;

artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role)
    , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
    , payload_(buffer_count_)
    , pos_()
{
    TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer construction: "
        << "source rank " << source_rank() << ", "
        << "destination rank " << destination_rank() << ", "
        << buffer_count_ << " buffers. " << TLOG_ENDL;

    if (buffer_count_ == 0)
    {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ")
            << "No buffers configured.";
    }
}
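
/*
 A minimal configuration sketch for this plugin (assuming the usual
 TransferInterface parameter names; the table name and values are
 illustrative, not taken from this file):

    transfer_to_eb1: {
      transferPluginType: MPI        // selects the MPITransfer plugin
      source_rank: 0
      destination_rank: 1
      buffer_count: 10               // must be nonzero, or the constructor throws
      max_fragment_size_words: 2097152
    }
*/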

artdaq::MPITransfer::~MPITransfer()
{
    TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: BEGIN" << TLOG_ENDL;
    TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: Collecting requests that need to be waited on" << TLOG_ENDL;
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii)
    {
        if (reqs_[ii] != MPI_REQUEST_NULL)
        {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0)
    {
        TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: Waiting on " << std::to_string(reqs.size()) << " reqs." << TLOG_ENDL;
        MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
    }
    /*
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Entering Barrier");
    MPI_Barrier(MPI_COMM_WORLD);
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Done with Barrier");*/
    TLOG_TRACE("MPITransfer") << uniqueLabel() << " MPITransfer::~MPITransfer: DONE" << TLOG_ENDL;
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
copyFragment(Fragment& frag, size_t send_timeout_usec)
{
    return moveFragment(std::move(frag), send_timeout_usec);
}

artdaq::TransferInterface::CopyStatus
artdaq::MPITransfer::
moveFragment(Fragment&& frag, size_t send_timeout_usec)
{
    if (frag.dataSize() > max_fragment_size_words_)
    {
        TLOG_WARNING("MPITransfer") << uniqueLabel() << " Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
            << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!" << TLOG_ENDL;
    }

    auto start_time = std::chrono::steady_clock::now();

    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: Finding available send slot, send_timeout_usec=" << std::to_string(send_timeout_usec) << TLOG_ENDL;
    auto req_idx = findAvailable();
    auto counter = 0;
    while (req_idx == RECV_TIMEOUT && TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec)
    {
        usleep(1000);
        req_idx = findAvailable();
        counter++;
        if (counter % 1000 == 0)
        {
            TLOG_INFO("MPITransfer") << uniqueLabel() << " Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
                << "Waited " << std::to_string(TimeUtils::GetElapsedTimeMilliseconds(start_time)) << " ms so far." << TLOG_ENDL;
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT)
    {
        TLOG_WARNING("MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: No buffers available! Returning RECV_TIMEOUT!" << TLOG_ENDL;
        return CopyStatus::kTimeout;
    }

    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment send slot is " << req_idx << TLOG_ENDL;
    auto buffer_idx = req_idx / 2;
    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: Swapping in fragment to send to buffer " << buffer_idx << TLOG_ENDL;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << std::to_string(curfrag.sequenceID()) << " type=" << curfrag.typeString() << " found_idx=" << req_idx << TLOG_ENDL;

    std::unique_lock<std::mutex> lk(mpi_mutex_);

    // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment: Using MPI_Issend" << TLOG_ENDL;
    // The synchronous send does not complete until the receiver has begun receiving the header
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);

    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment COMPLETE" << TLOG_ENDL;

    TLOG_ARB(11, "MPITransfer") << uniqueLabel() << " MPITransfer::moveFragment COMPLETE: "
        << " buffer_idx=" << buffer_idx
        << " send_size=" << curfrag.size()
        << " src=" << source_rank()
        << " dest=" << destination_rank()
        << " sequenceID=" << curfrag.sequenceID()
        << " fragID=" << curfrag.fragmentID() << TLOG_ENDL;
    return CopyStatus::kSuccess;
}
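
/*
 Sketch of caller-side handling of the CopyStatus return (illustrative; the
 "transfer" handle and the timeout value are assumptions): kTimeout means
 every one of the buffer_count_ request pairs still had a send in flight for
 the full send_timeout_usec, so the caller decides whether to retry or drop.

    auto sts = transfer->moveFragment(std::move(frag), 100000); // 100 ms budget
    if (sts == artdaq::TransferInterface::CopyStatus::kTimeout)
    {
        // The receiver is not keeping up; retry the send or discard the Fragment.
    }
*/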

int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
{
    TLOG_ARB(6, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader entered tmo=" << std::to_string(timeout_usec) << " us (ignored)" << TLOG_ENDL;
    MPI_Status status;
    int wait_result = MPI_SUCCESS;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
    }
    //TLOG_DEBUG("MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader: Start of receiveFragment" << TLOG_ENDL;

    int flag;
    do {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag) {
            usleep(1000);
            //TLOG_ARB(6) << uniqueLabel() << " MPITransfer::receiveFragmentHeader wait loop, flag=" << flag << TLOG_ENDL;
        }
    } while (!flag);

    if (req != MPI_REQUEST_NULL)
    {
        TLOG_ERROR("MPITransfer") << uniqueLabel() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader." << TLOG_ENDL;
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    //TLOG_DEBUG("MPITransfer") << uniqueLabel() << " After testing/waiting res=" << wait_result << TLOG_ENDL;
    TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader recvd" << TLOG_ENDL;

    {
        TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentHeader: " << my_rank
            << " Wait_error=" << wait_result
            << " status_error=" << status.MPI_ERROR
            << " source=" << status.MPI_SOURCE
            << " tag=" << status.MPI_TAG
            << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
            << " Fragment_size=" << header.word_count
            << " fragID=" << header.fragment_id << TLOG_ENDL;
    }
    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
    case MPI_SUCCESS:
        break;
    case MPI_ERR_IN_STATUS:
        MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
        TLOG_ERROR("MPITransfer") << uniqueLabel()
            << " MPITransfer: MPI_Test ERROR: " << err_buffer << "\n" << TLOG_ENDL;
        break;
    default:
        MPI_Error_string(wait_result, err_buffer, &resultlen);
        TLOG_ERROR("MPITransfer") << uniqueLabel()
            << " MPITransfer: MPI_Test ERROR: " << err_buffer << "\n" << TLOG_ENDL;
    }

    //TLOG_INFO("MPITransfer") << uniqueLabel() << " End of receiveFragment" << TLOG_ENDL;
    return status.MPI_SOURCE;
}

int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
{
    TLOG_ARB(6, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentData entered wordCount=" << std::to_string(wordCount) << TLOG_ENDL;
    int wait_result = MPI_SUCCESS;
    MPI_Status status;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
    }
    //TLOG_DEBUG("MPITransfer") << uniqueLabel() << " Start of receiveFragment" << TLOG_ENDL;

    int flag;
    do {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag) {
            usleep(1000);
            //TLOG_ARB(6, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentData wait loop, flag=" << flag << TLOG_ENDL;
        }
    } while (!flag);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG_ERROR("MPITransfer") << uniqueLabel() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData." << TLOG_ENDL;
        throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }

    //TLOG_DEBUG("MPITransfer") << uniqueLabel() << " After testing/waiting res=" << wait_result << TLOG_ENDL;
    TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " MPITransfer::receiveFragmentData recvd" << TLOG_ENDL;

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
    case MPI_SUCCESS:
        break;
    case MPI_ERR_IN_STATUS:
        MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
        TLOG_ERROR("MPITransfer") << uniqueLabel()
            << " MPITransfer: MPI_Test ERROR: " << err_buffer << "\n" << TLOG_ENDL;
        break;
    default:
        MPI_Error_string(wait_result, err_buffer, &resultlen);
        TLOG_ERROR("MPITransfer") << uniqueLabel()
            << " MPITransfer: MPI_Test ERROR: " << err_buffer << "\n" << TLOG_ENDL;
    }

    //TLOG_INFO("MPITransfer") << uniqueLabel() << " End of MPITransfer::receiveFragmentData" << TLOG_ENDL;
    return status.MPI_SOURCE;
}
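
/*
 Sketch of the expected receive sequence (the "transfer" handle and the
 Fragment assembly shown are illustrative): the header arrives first and
 carries word_count, which tells the caller how much payload to expect, and
 the rank returned by receiveFragmentHeader identifies the sender whose data
 message will be matched next.

    artdaq::detail::RawFragmentHeader hdr;
    int src = transfer->receiveFragmentHeader(hdr, 0);
    if (src != artdaq::TransferInterface::RECV_TIMEOUT)
    {
        artdaq::Fragment frag(hdr.word_count - hdr.num_words());
        memcpy(frag.headerAddress(), &hdr, hdr.num_words() * sizeof(artdaq::RawDataType));
        transfer->receiveFragmentData(frag.headerAddress() + hdr.num_words(),
                                      hdr.word_count - hdr.num_words());
    }
*/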

void
artdaq::MPITransfer::
cancelReq_(MPI_Request req) const
{
    if (req == MPI_REQUEST_NULL) return;

    TLOG_ARB(8, "MPITransfer") << uniqueLabel() << " Cancelling post" << TLOG_ENDL;

    std::unique_lock<std::mutex> lk(mpi_mutex_);
    int result = MPI_Cancel(&req);
    if (result == MPI_SUCCESS)
    {
        MPI_Status status;
        MPI_Wait(&req, &status);
    }
    else
    {
        switch (result)
        {
        case MPI_ERR_REQUEST:
            throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
        case MPI_ERR_ARG:
            throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                << "MPI_Cancel returned MPI_ERR_ARG.\n";
        default:
            throw art::Exception(art::errors::LogicError, "MPITransfer: ")
                << "MPI_Cancel returned unknown error code.\n";
        }
    }
}

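/*
 Request bookkeeping used below: buffer i owns the request pair reqs_[2*i]
 (header send) and reqs_[2*i + 1] (data send), so reqs_ holds
 2 * buffer_count_ entries and pos_ advances in steps of two. For example,
 with buffer_count_ = 4, findAvailable() scans header slots 0, 2, 4, 6 and
 returns the first even index whose pair has completed; moveFragment() then
 derives the buffer as req_idx / 2.
*/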
int artdaq::MPITransfer::findAvailable()
{
    int use_me;
    int flag = 0, flag2 = 0; // flag2 must start false: it is only updated when the header-slot test succeeds
    size_t loops = 0;
    //TLOG_ARB(5) << uniqueLabel() << " findAvailable initial pos_=" << pos_ << TLOG_ENDL;
    do
    {
        use_me = pos_;
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
        if (flag) {
            MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
        }
        pos_ = (pos_ + 2) % reqs_.size();
        ++loops;
    } while (!flag2 && loops < buffer_count_);
    // A slot was found only if both requests in the pair completed; testing
    // flag2 (rather than loops) also accepts a slot found on the final pass.
    if (!flag2) { return TransferInterface::RECV_TIMEOUT; }
    TLOG_ARB(5, "MPITransfer") << uniqueLabel() << " findAvailable returning use_me=" << use_me << " loops=" << std::to_string(loops) << TLOG_ENDL;
    // pos_ is pointing at the next slot to check
    // use_me is pointing at the slot to use
    return use_me;
}

DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)