artdaq_mpich_plugin  v1_00_13
MPI_transfer.cc
#define TRACE_NAME "MPITransfer"
#include <algorithm>
#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"

#include "canvas/Utilities/Exception.h"
#include "cetlib_except/exception.h"

#include "artdaq-core/Data/Fragment.hh"

/*
 Protocol: do a send for each request object, then wait for pending requests
 to complete, followed by a reset to allow another set of sends to be
 completed.

 This needs to be separated into a thing for sending and a thing for receiving.
 There probably needs to be a common class that both use.
*/

#define MPI_TAG_HEADER 0x8E  // 142
#define MPI_TAG_DATA 0xDA    // 218
#define USE_RECV 1
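
// Wire protocol: each Fragment is sent as two MPI messages, the fixed-size
// RawFragmentHeader tagged MPI_TAG_HEADER followed by the remaining payload
// words tagged MPI_TAG_DATA. mpi_mutex_ (defined below) is static, so a single
// lock is shared by every MPITransfer instance in the process; it is held
// around the MPI send, receive, and test calls so that concurrent transfers do
// not enter the MPI library at the same time.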

std::mutex artdaq::MPITransfer::mpi_mutex_;

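// reqs_ holds two MPI_Request slots per send buffer: the header send request at
// even index 2*i and the matching data send request at 2*i+1. payload_[i] owns
// the outgoing Fragment for buffer i while its two nonblocking sends are in flight.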
artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
    : TransferInterface(pset, role), reqs_(2 * buffer_count_, MPI_REQUEST_NULL), payload_(buffer_count_), pos_()
{
    TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
                     << "source rank " << source_rank() << ", "
                     << "destination rank " << destination_rank() << ", " << buffer_count_ << " buffers. ";

    if (buffer_count_ == 0)
    {
        throw art::Exception(art::errors::Configuration, "MPITransfer: ") << "No buffers configured.";
    }
}

artdaq::MPITransfer::~MPITransfer()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
    flush_buffers();
    /*
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Entering Barrier");
    MPI_Barrier(MPI_COMM_WORLD);
    TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Done with Barrier");*/
    TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
}

void artdaq::MPITransfer::flush_buffers()
{
    TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Collecting requests that need to be waited on";
    std::vector<MPI_Request> reqs;
    for (size_t ii = 0; ii < reqs_.size(); ++ii)
    {
        if (reqs_[ii] != MPI_REQUEST_NULL)
        {
            reqs.push_back(reqs_[ii]);
        }
    }
    if (reqs.size() > 0)
    {
        TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Waiting on " << reqs.size() << " reqs.";
        MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
    }
}

artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::transfer_fragment_min_blocking_mode(
    Fragment const& frag, size_t send_timeout_usec)
{
    return sendFragment(Fragment(frag), send_timeout_usec);
}

artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::transfer_fragment_reliable_mode(Fragment&& frag)
{
    return sendFragment(std::move(frag), 0);
}

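// Common send path used by both transfer modes: poll findAvailable() (sleeping
// 1 ms between tries) until a buffer whose previous sends have completed is
// free, move the Fragment into that buffer, then post two nonblocking
// synchronous sends (MPI_Issend), one for the header and one for the payload.
// Returns CopyStatus::kTimeout if no buffer frees up within send_timeout_usec;
// a timeout of 0 means wait indefinitely.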
artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::sendFragment(Fragment&& frag, size_t send_timeout_usec)
{
    if (frag.dataSize() > max_fragment_size_words_)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize()
                           << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
                           << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_
                           << " words. Multiple over-size Fragments will exhaust the buffer!";
    }

    auto start_time = std::chrono::steady_clock::now();

    TLOG(5) << GetTraceName() << ": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
    auto req_idx = findAvailable();
    auto counter = 0;
    while (req_idx == RECV_TIMEOUT &&
           (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec))
    {
        usleep(1000);
        req_idx = findAvailable();
        counter++;
        if (counter % 1000 == 0)
        {
            TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to "
                            << destination_rank() << ". "
                            << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
        }
    }
    if (req_idx == TransferInterface::RECV_TIMEOUT)
    {
        TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
        return CopyStatus::kTimeout;
    }

    TLOG(5) << GetTraceName() << ": sendFragment send slot is " << req_idx;
    auto buffer_idx = req_idx / 2;
    TLOG(5) << GetTraceName() << ": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
    Fragment& curfrag = payload_[buffer_idx];
    curfrag = std::move(frag);

    TLOG(5) << GetTraceName() << ": sendFragment before send src=" << source_rank() << " dest=" << destination_rank()
            << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;

    std::unique_lock<std::mutex> lk(mpi_mutex_);

    // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
    TLOG(5) << GetTraceName() << ": sendFragment: Using MPI_Issend";
    // Waits for the receiver to acknowledge header
    MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE,
               destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);

    auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
    auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
    MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
               &reqs_[req_idx + 1]);
    TLOG(5) << GetTraceName() << ": sendFragment COMPLETE";

    TLOG(11) << GetTraceName() << ": sendFragment COMPLETE: "
             << " buffer_idx=" << buffer_idx << " send_size=" << curfrag.size() << " src=" << source_rank()
             << " dest=" << destination_rank() << " sequenceID=" << curfrag.sequenceID()
             << " fragID=" << curfrag.fragmentID();
    return CopyStatus::kSuccess;
}

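// Posts a nonblocking receive for the fixed-size fragment header and polls it
// with MPI_Test every 1 ms until it completes. The timeout argument is currently
// ignored. Returns the rank the header arrived from (status.MPI_SOURCE).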
int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
    MPI_Status status;
    int wait_result = MPI_SUCCESS;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER,
                  MPI_COMM_WORLD, &req);
    }
    // TLOG(TLVL_DEBUG) << GetTraceName() << ": receiveFragmentHeader: Start of receiveFragment" ;

    int flag;
    do
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag)
        {
            usleep(1000);
            // TLOG_ARB(6) << uniqueLabel() << " MPITransfer::receiveFragmentHeader wait loop, flag=" << flag ;
        }
    } while (!flag);

    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ")
            << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
    }
    // TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
    TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";

    {
        TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank << " Wait_error=" << wait_result
                << " status_error=" << status.MPI_ERROR << " source=" << status.MPI_SOURCE << " tag=" << status.MPI_TAG
                << " Fragment_sequenceID=" << (uint64_t)header.sequence_id << " Fragment_size=" << header.word_count
                << " fragID=" << header.fragment_id;
    }
    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": MPI_Test ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << ": MPI_Test ERROR: " << err_buffer;
    }

    // TLOG_INFO) << GetTraceName() << " End of receiveFragment" ;
    return status.MPI_SOURCE;
}

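// Receives the payload that follows a previously received header: posts a
// nonblocking receive of wordCount RawDataType words into destination and polls
// with MPI_Test until it completes. Returns the rank the data arrived from.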
int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount)
{
    TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
    int wait_result = MPI_SUCCESS;
    MPI_Status status;

    MPI_Request req;
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
                  &req);
    }
    // TLOG(TLVL_DEBUG) << GetTraceName() << " Start of receiveFragment" ;

    int flag;
    do
    {
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        wait_result = MPI_Test(&req, &flag, &status);
        if (!flag)
        {
            usleep(1000);
            // TLOG(6) << GetTraceName() << ": receiveFragmentData wait loop, flag=" << flag ;
        }
    } while (!flag);
    if (req != MPI_REQUEST_NULL)
    {
        TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
        throw art::Exception(art::errors::LogicError, "MPITransfer: ")
            << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
    }

    // TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
    TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";

    char err_buffer[MPI_MAX_ERROR_STRING];
    int resultlen;
    switch (wait_result)
    {
        case MPI_SUCCESS:
            break;
        case MPI_ERR_IN_STATUS:
            MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << " MPITransfer: MPI_Test ERROR: " << err_buffer;
            break;
        default:
            MPI_Error_string(wait_result, err_buffer, &resultlen);
            TLOG(TLVL_ERROR) << GetTraceName() << " MPITransfer: MPI_Test ERROR: " << err_buffer;
    }

    // TLOG_INFO) << GetTraceName() << " End of MPITransfer::receiveFragmentData" ;
    return status.MPI_SOURCE;
}

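// Attempts to cancel an outstanding request. On success the cancelled request is
// waited on to let MPI release it; on failure an art::Exception is thrown with
// the MPI_Cancel error code.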
void artdaq::MPITransfer::cancelReq_(MPI_Request req) const
{
    if (req == MPI_REQUEST_NULL) return;

    TLOG(8) << GetTraceName() << ": Cancelling post";

    std::unique_lock<std::mutex> lk(mpi_mutex_);
    int result = MPI_Cancel(&req);
    if (result == MPI_SUCCESS)
    {
        MPI_Status status;
        MPI_Wait(&req, &status);
    }
    else
    {
        switch (result)
        {
            case MPI_ERR_REQUEST:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
            case MPI_ERR_ARG:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned MPI_ERR_ARG.\n";
            default:
                throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned unknown error code.\n";
        }
    }
}

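// Scans the send-request pairs round-robin, starting at pos_, looking for a
// buffer whose header and data requests have both completed (MPI_Test on
// MPI_REQUEST_NULL reports completion immediately, so never-used slots count as
// available). Returns the even request index of the free pair, or RECV_TIMEOUT
// if a full pass over the buffers finds none.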
int artdaq::MPITransfer::findAvailable()
{
    int use_me;
    int flag = 0, flag2 = 0;
    size_t loops = 0;
    // TLOG_ARB(5) << uniqueLabel() << " findAvailable initial pos_=" << pos_ ;
    do
    {
        use_me = pos_;
        std::unique_lock<std::mutex> lk(mpi_mutex_);
        MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
        if (flag)
        {
            MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
        }
        pos_ = (pos_ + 2) % reqs_.size();
        ++loops;
    } while (flag2 == 0 && loops < buffer_count_);
    if (flag2 == 0)
    {
        return TransferInterface::RECV_TIMEOUT;
    }
    TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
    // pos_ is pointing at the next slot to check
    // use_me is pointing at the slot to use
    return use_me;
}

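// Registers MPITransfer with the artdaq plugin system so it can be selected as
// a TransferInterface implementation at run time.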
DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)