artdaq v3_00_01
MPI_transfer.cc
1 #include "artdaq/TransferPlugins/MPITransfer.hh"
2 #include <algorithm>
3 
4 #include "canvas/Utilities/Exception.h"
5 #include "cetlib_except/exception.h"
6 
7 #include "artdaq-core/Data/Fragment.hh"
8 
9 
10 /*
11  Protocol: want to do a send for each request object, then wait for
12  pending requests to complete, followed by a reset to allow another set
13  of sends to be completed.
14 
15  This needs to be separated into a thing for sending and a thing for receiving.
16  There probably needs to be a common class that both use.
17 */
18 
19 #define MPI_TAG_HEADER 0x8E // 142
20 #define MPI_TAG_DATA 0xDA // 218
21 #define USE_RECV 1
22 
23 std::mutex artdaq::MPITransfer::mpi_mutex_;
24 
24 artdaq::MPITransfer::
25 MPITransfer(fhicl::ParameterSet pset, Role role)
26  : TransferInterface(pset, role)
27  , reqs_(2 * buffer_count_, MPI_REQUEST_NULL)
28  , payload_(buffer_count_)
29  , pos_()
30 {
31  TLOG_TRACE(uniqueLabel()) << "MPITransfer construction: "
32  << "source rank " << source_rank() << ", "
33  << "destination rank " << destination_rank() << ", "
34  << buffer_count_ << " buffers. " << TLOG_ENDL;
35 
36  if (buffer_count_ == 0)
37  {
38  throw art::Exception(art::errors::Configuration, "MPITransfer: ")
39  << "No buffers configured.";
40  }
41 }
42 
43 artdaq::MPITransfer::
44 ~MPITransfer()
45 {
46  TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: BEGIN" << TLOG_ENDL;
47  TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: Collecting requests that need to be waited on" << TLOG_ENDL;
48  std::vector<MPI_Request> reqs;
49  for (size_t ii = 0; ii < reqs_.size(); ++ii)
50  {
51  if (reqs_[ii] != MPI_REQUEST_NULL)
52  {
53  reqs.push_back(reqs_[ii]);
54  }
55  }
56  if (reqs.size() > 0)
57  {
58  TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: Waiting on " << std::to_string(reqs.size()) << " reqs." << TLOG_ENDL;
59  MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
60  }
61  /*
62  TLOG_ARB(TLVL_VERBOSE, "MPITransfer::~MPITransfer: Entering Barrier");
63  MPI_Barrier(MPI_COMM_WORLD);
64  TLOG_ARB(TLVL_VERBOSE, "MPITransfer::~MPITransfer: Done with Barrier");*/
65  TLOG_TRACE(uniqueLabel()) << "MPITransfer::~MPITransfer: DONE" << TLOG_ENDL;
66 }
67 
68 artdaq::TransferInterface::CopyStatus
69 artdaq::MPITransfer::
70 copyFragment(Fragment& frag, size_t send_timeout_usec)
71 {
72  return moveFragment(std::move(frag), send_timeout_usec);
73 }
74 
74 
75 artdaq::TransferInterface::CopyStatus
76 artdaq::MPITransfer::
77 moveFragment(Fragment&& frag, size_t send_timeout_usec)
78 {
79  if (frag.dataSize() > max_fragment_size_words_)
80  {
81  TLOG_WARNING(uniqueLabel()) << "Fragment has size (" << frag.dataSize() << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
82  << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_ << " words. Multiple over-size Fragments will exhaust the buffer!" << TLOG_ENDL;
83  }
84 
85  auto start_time = std::chrono::steady_clock::now();
86 
87  TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment: Finding available send slot, send_timeout_usec=" << std::to_string(send_timeout_usec) << TLOG_ENDL;
88  auto req_idx = findAvailable();
89  auto counter = 0;
90  while (req_idx == RECV_TIMEOUT && TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec)
91  {
92  usleep(1000);
93  req_idx = findAvailable();
94  counter++;
95  if (counter % 1000 == 0)
96  {
97  TLOG_INFO(uniqueLabel()) << "Rank " << source_rank() << " waiting for available buffer to " << destination_rank() << ". "
98  << "Waited " << std::to_string(TimeUtils::GetElapsedTimeMilliseconds(start_time)) << " ms so far." << TLOG_ENDL;
99  }
100 
101  }
102  if (req_idx == TransferInterface::RECV_TIMEOUT)
103  {
104  TLOG_WARNING(uniqueLabel()) << "MPITransfer::sendFragment: No buffers available! Returning RECV_TIMEOUT!" << TLOG_ENDL;
105  return CopyStatus::kTimeout;
106  }
107 
108  TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment send slot is " << req_idx << TLOG_ENDL;
109  auto buffer_idx = req_idx / 2;
110  TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment: Swapping in fragment to send to buffer " << buffer_idx << TLOG_ENDL;
111  Fragment& curfrag = payload_[buffer_idx];
112  curfrag = std::move(frag);
113 
114  TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment before send src=" << source_rank() << " dest=" << destination_rank() << " seqID=" << std::to_string(curfrag.sequenceID()) << " type=" << curfrag.typeString() << " found_idx=" << req_idx << TLOG_ENDL;
115 
116  std::unique_lock<std::mutex> lk(mpi_mutex_);
117 
118  // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
119  TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment: Using MPI_Isend" << TLOG_ENDL;
120  //Waits for the receiver to acknowledge header
121  MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
122 
123  auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
124  auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
125  MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &reqs_[req_idx + 1]);
126  TLOG_ARB(5, uniqueLabel()) << "MPITransfer::moveFragment COMPLETE" << TLOG_ENDL;
127 
128  TLOG_ARB(11, uniqueLabel()) << "MPITransfer::moveFragment COMPLETE: "
129  << " buffer_idx=" << buffer_idx
130  << " send_size=" << curfrag.size()
131  << " src=" << source_rank()
132  << " dest=" << destination_rank()
133  << " sequenceID=" << curfrag.sequenceID()
134  << " fragID=" << curfrag.fragmentID() << TLOG_ENDL;
135  return CopyStatus::kSuccess;
136 }
137 
138 int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
139 {
140  TLOG_ARB(6, uniqueLabel()) << "MPITransfer::receiveFragmentHeader entered tmo=" << std::to_string(timeout_usec) << " us (ignored)" << TLOG_ENDL;
141  MPI_Status status;
142  int wait_result = MPI_SUCCESS;
143 
144  MPI_Request req;
145  {
146  std::unique_lock<std::mutex> lk(mpi_mutex_);
147  MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &req);
148  }
149  //TLOG_DEBUG(uniqueLabel()) << "MPITransfer::receiveFragmentHeader: Start of receiveFragment" << TLOG_ENDL;
150 
151  int flag;
152  do {
153  std::unique_lock<std::mutex> lk(mpi_mutex_);
154  wait_result = MPI_Test(&req, &flag, &status);
155  if (!flag) {
156  usleep(1000);
157  //TLOG_ARB(6, uniqueLabel()) << "MPITransfer::receiveFragmentHeader wait loop, flag=" << flag << TLOG_ENDL;
158  }
159  } while (!flag);
160 
161  if (req != MPI_REQUEST_NULL)
162  {
163  TLOG_ERROR(uniqueLabel()) << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader." << TLOG_ENDL;
164  throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
165  }
166  //TLOG_DEBUG(uniqueLabel()) << "After testing/waiting res=" << wait_result << TLOG_ENDL;
167  TLOG_ARB(8, uniqueLabel()) << "MPITransfer::receiveFragmentHeader recvd" << TLOG_ENDL;
168 
169 
170  {TLOG_ARB(8, uniqueLabel()) << "MPITransfer::receiveFragmentHeader: " << my_rank
171  << " Wait_error=" << wait_result
172  << " status_error=" << status.MPI_ERROR
173  << " source=" << status.MPI_SOURCE
174  << " tag=" << status.MPI_TAG
175  << " Fragment_sequenceID=" << (uint64_t)header.sequence_id
176  << " Fragment_size=" << header.word_count
177  << " fragID=" << header.fragment_id << TLOG_ENDL;
178  }
179  char err_buffer[MPI_MAX_ERROR_STRING];
180  int resultlen;
181  switch (wait_result)
182  {
183  case MPI_SUCCESS:
184  break;
185  case MPI_ERR_IN_STATUS:
186  MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
187  TLOG_ERROR(uniqueLabel())
188  << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
189  break;
190  default:
191  MPI_Error_string(wait_result, err_buffer, &resultlen);
192  TLOG_ERROR(uniqueLabel())
193  << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
194  }
195 
196  //TLOG_INFO(uniqueLabel()) << "End of receiveFragment" << TLOG_ENDL;
197  return status.MPI_SOURCE;
198 }
199 
200 int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
201 {
202  TLOG_ARB(6, uniqueLabel()) << "MPITransfer::receiveFragmentData entered wordCount=" << std::to_string(wordCount) << TLOG_ENDL;
203  int wait_result = MPI_SUCCESS;
204  MPI_Status status;
205 
206  MPI_Request req;
207  {
208  std::unique_lock<std::mutex> lk(mpi_mutex_);
209  MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD, &req);
210  }
211  //TLOG_DEBUG(uniqueLabel()) << "Start of receiveFragment" << TLOG_ENDL;
212 
213  int flag;
214  do {
215  std::unique_lock<std::mutex> lk(mpi_mutex_);
216  wait_result = MPI_Test(&req, &flag, &status);
217  if (!flag) {
218  usleep(1000);
219  //TLOG_ARB(6, uniqueLabel()) << "MPITransfer::receiveFragmentData wait loop, flag=" << flag << TLOG_ENDL;
220  }
221  } while (!flag);
222  if (req != MPI_REQUEST_NULL)
223  {
224  TLOG_ERROR(uniqueLabel()) << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData." << TLOG_ENDL;
225  throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
226  }
227 
228  //TLOG_DEBUG(uniqueLabel()) << "After testing/waiting res=" << wait_result << TLOG_ENDL;
229  TLOG_ARB(8, uniqueLabel()) << "MPITransfer::receiveFragmentData recvd" << TLOG_ENDL;
230 
231 
232  char err_buffer[MPI_MAX_ERROR_STRING];
233  int resultlen;
234  switch (wait_result)
235  {
236  case MPI_SUCCESS:
237  break;
238  case MPI_ERR_IN_STATUS:
239  MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
240  TLOG_ERROR(uniqueLabel())
241  << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
242  break;
243  default:
244  MPI_Error_string(wait_result, err_buffer, &resultlen);
245  TLOG_ERROR(uniqueLabel())
246  << "MPITransfer: Waitany ERROR: " << err_buffer << "\n" << TLOG_ENDL;
247  }
248 
249  //TLOG_INFO(uniqueLabel()) << "End of MPITransfer::receiveFragmentData" << TLOG_ENDL;
250  return status.MPI_SOURCE;
251 }
252 
253 void
254 artdaq::MPITransfer::
255 cancelReq_(MPI_Request req) const
256 {
257  if (req == MPI_REQUEST_NULL) return;
258 
259  TLOG_ARB(8, uniqueLabel()) << "Cancelling post" << TLOG_ENDL;
260 
261  std::unique_lock<std::mutex> lk(mpi_mutex_);
262  int result = MPI_Cancel(&req);
263  if (result == MPI_SUCCESS)
264  {
265  MPI_Status status;
266  MPI_Wait(&req, &status);
267  }
268  else
269  {
270  switch (result)
271  {
272  case MPI_ERR_REQUEST:
273  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
274  << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
275  case MPI_ERR_ARG:
276  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
277  << "MPI_Cancel returned MPI_ERR_ARG.\n";
278  default:
279  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
280  << "MPI_Cancel returned unknown error code.\n";
281  }
282  }
283 }
284 
285 int artdaq::MPITransfer::findAvailable()
286 {
287  int use_me;
288  int flag, flag2 = 0; // flag2 is read in the loop condition below; initialize it so the first pass is well-defined
289  size_t loops = 0;
290  //TLOG_ARB(5,uniqueLabel()) << "findAvailable initial pos_=" << pos_ << TLOG_ENDL;
291  do
292  {
293  use_me = pos_;
294  std::unique_lock<std::mutex> lk(mpi_mutex_);
295  MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
296  if (flag) {
297  MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
298  }
299  pos_ = (pos_ + 2) % reqs_.size();
300  ++loops;
301  } while (!flag2 && loops < buffer_count_);
302  if (loops == buffer_count_) { return TransferInterface::RECV_TIMEOUT; }
303  TLOG_ARB(5, uniqueLabel()) << "findAvailable returning use_me=" << use_me << " loops=" << std::to_string(loops) << TLOG_ENDL;
304  // pos_ is pointing at the next slot to check
305  // use_me is pointing at the slot to use
306  return use_me;
307 }
308 
309 DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)
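
The protocol note at the top of the file and the send path in moveFragment() rest on a standard MPI idiom: post a synchronous-mode, nonblocking send with MPI_Issend, then poll the request with MPI_Test until the receiver has matched it, at which point the buffer can be reused. The following standalone sketch (not artdaq code; the two-rank layout, tag value, and buffer size are illustrative choices) shows that idiom in isolation, which is what findAvailable() is checking for when it looks for a free send slot.

// Standalone sketch only; not part of artdaq. Illustrates the MPI_Issend + MPI_Test
// polling pattern used by moveFragment()/findAvailable() above. The tag 0x8E mirrors
// MPI_TAG_HEADER; ranks 0/1 and the 1024-byte buffer are arbitrary.
#include <mpi.h>
#include <unistd.h>
#include <vector>

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int rank = 0;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	std::vector<char> buf(1024, 0);
	if (rank == 0)
	{
		// Synchronous-mode, nonblocking send: the request completes only after the
		// receiver has started a matching receive, so a completed request means the
		// buffer is safe to reuse (the property findAvailable() depends on).
		MPI_Request req;
		MPI_Issend(buf.data(), static_cast<int>(buf.size()), MPI_BYTE, 1, 0x8E, MPI_COMM_WORLD, &req);

		int flag = 0;
		while (!flag)
		{
			MPI_Test(&req, &flag, MPI_STATUS_IGNORE); // poll instead of blocking in MPI_Wait
			if (!flag) usleep(1000);
		}
	}
	else if (rank == 1)
	{
		MPI_Recv(buf.data(), static_cast<int>(buf.size()), MPI_BYTE, 0, 0x8E, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	}

	MPI_Finalize();
	return 0;
}

Run it with two ranks (for example, mpirun -np 2). With only one rank the MPI_Issend never matches and the poll loop spins, which is the same starvation that moveFragment() bounds with its send_timeout_usec loop.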
Referenced members and types (brief descriptions from the MPITransfer and TransferInterface documentation):

MPITransfer
  MPITransfer is a TransferInterface implementation plugin that transfers data using MPI. Definition: MPITransfer.hh:24
TransferInterface
  This interface defines the functions used to transfer data between artdaq applications.
MPITransfer(fhicl::ParameterSet pset, Role role)
  MPITransfer Constructor. Definition: MPI_transfer.cc:25
virtual ~MPITransfer()
  MPITransfer Destructor. Definition: MPI_transfer.cc:44
CopyStatus copyFragment(Fragment &frag, size_t timeout_usec=std::numeric_limits<size_t>::max()) override
  Copy a Fragment to the destination. Forces asynchronous send. Definition: MPI_transfer.cc:70
CopyStatus moveFragment(Fragment &&frag, size_t timeout_usec=std::numeric_limits<size_t>::max()) override
  Move a Fragment to the destination. Definition: MPI_transfer.cc:77
int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout) override
  Receive a Fragment Header from the transport mechanism.
int receiveFragmentData(RawDataType *destination, size_t wordCount) override
  Receive the body of a Fragment to the given destination pointer.
CopyStatus
  Returned from the send functions, this enumeration describes the possible return codes. If an exception occurs, it will be thrown and should be handled normally.
Role
  Used to determine if a TransferInterface is a Sender or Receiver.
static const int RECV_TIMEOUT
  Value to be returned upon receive timeout. Because receivers otherwise return rank, this is also the limit on the number of ranks that artdaq currently supports.
size_t buffer_count_
  The number of Fragment transfers the TransferInterface can handle simultaneously.
virtual int source_rank() const
  Get the source rank for this TransferInterface instance.
virtual int destination_rank() const
  Get the destination rank for this TransferInterface instance.
std::string uniqueLabel() const
  Get the unique label of this TransferInterface instance.
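
For orientation, here is a hedged sketch of how a caller might drive the two-step receive interface summarized above: receiveFragmentHeader() delivers the fixed-size header (the MPI_TAG_HEADER message), and its word_count then sizes the buffer handed to receiveFragmentData() for the payload (the MPI_TAG_DATA message). The details not shown in this file are assumptions: the Role::kSend / Role::kReceive enumerator names, the Fragment payload-size constructor, and the parameter set contents; check TransferInterface.hh and Fragment.hh for the exact signatures.

// Sketch only: a plausible caller of the MPITransfer API shown in this file.
// Assumed (not confirmed by this file): Role::kSend / Role::kReceive names,
// the Fragment(payload_word_count) constructor, and the pset contents.
#include <cstring>
#include <utility>

#include "fhiclcpp/ParameterSet.h"
#include "artdaq/TransferPlugins/MPITransfer.hh"
#include "artdaq-core/Data/Fragment.hh"

void send_one(fhicl::ParameterSet const& pset, artdaq::Fragment&& frag)
{
	artdaq::MPITransfer sender(pset, artdaq::TransferInterface::Role::kSend);

	// moveFragment swaps the Fragment into an internal buffer and posts the header
	// and data MPI_Issends; kTimeout means no send slot freed up within the timeout.
	auto sts = sender.moveFragment(std::move(frag), 1000000 /* us */);
	if (sts != artdaq::TransferInterface::CopyStatus::kSuccess)
	{
		// handle timeout / error
	}
}

void receive_one(fhicl::ParameterSet const& pset)
{
	artdaq::MPITransfer receiver(pset, artdaq::TransferInterface::Role::kReceive);

	// Step 1: receive the header message (MPI_TAG_HEADER). The return value is the
	// source rank, or RECV_TIMEOUT if nothing arrived.
	artdaq::detail::RawFragmentHeader header;
	int src = receiver.receiveFragmentHeader(header, 1000000 /* us */);
	if (src == artdaq::TransferInterface::RECV_TIMEOUT) return;

	// Step 2: size a Fragment from word_count, copy the header in, and receive the
	// payload message (MPI_TAG_DATA) directly into the Fragment's data area.
	auto payload_words = header.word_count - artdaq::detail::RawFragmentHeader::num_words();
	artdaq::Fragment frag(payload_words); // assumed payload-size constructor
	std::memcpy(frag.headerAddress(), &header, sizeof(header));
	receiver.receiveFragmentData(frag.headerAddress() + artdaq::detail::RawFragmentHeader::num_words(),
	                             payload_words);
	// frag now holds the complete Fragment received from rank 'src'.
}

In a running artdaq system these calls are made by the framework's receiver code rather than by user code; the sketch is only meant to make the header/data pairing explicit.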