artdaq_mpich_plugin  v1_00_08a
MPI_transfer.cc
1 #define TRACE_NAME "MPITransfer"
2 #include <algorithm>
3 #include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"
4 
5 #include "canvas/Utilities/Exception.h"
6 #include "cetlib_except/exception.h"
7 
8 #include "artdaq-core/Data/Fragment.hh"
9 
10 /*
11  Protocol: issue a send for each request object, then wait for the
12  pending requests to complete, followed by a reset to allow another set
13  of sends to be issued.
14 
15  This needs to be separated into a sender class and a receiver class,
16  probably sharing a common base class.
17 */
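// In this implementation each Fragment travels as two MPI messages: the header (tag
// MPI_TAG_HEADER) followed by the payload (tag MPI_TAG_DATA). reqs_ holds the two
// outstanding send requests per buffer, and findAvailable() scans those request pairs so
// that a payload_ buffer is reused only after its previous send has completed.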
18 
19 #define MPI_TAG_HEADER 0x8E // 142
20 #define MPI_TAG_DATA 0xDA // 218
21 #define USE_RECV 1
22 
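// Shared by all MPITransfer instances so that concurrent MPI calls from different transfers are serialized.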
23 std::mutex artdaq::MPITransfer::mpi_mutex_;
24 
25 artdaq::MPITransfer::MPITransfer(fhicl::ParameterSet pset, Role role)
26  : TransferInterface(pset, role), reqs_(2 * buffer_count_, MPI_REQUEST_NULL), payload_(buffer_count_), pos_() {
27  TLOG(TLVL_TRACE) << GetTraceName() << " construction: "
28  << "source rank " << source_rank() << ", "
29  << "destination rank " << destination_rank() << ", " << buffer_count_ << " buffers. ";
30 
31  if (buffer_count_ == 0) {
32  throw art::Exception(art::errors::Configuration, "MPITransfer: ") << "No buffers configured.";
33  }
34 }
35 
36 artdaq::MPITransfer::~MPITransfer() {
37  TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: BEGIN";
38  flush_buffers();
39  /*
40  TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Entering Barrier");
41  MPI_Barrier(MPI_COMM_WORLD);
42  TLOG_ARB(TLVL_VERBOSE, "MPITransfer") << uniqueLabel() << " ::~MPITransfer: Done with Barrier");*/
43  TLOG(TLVL_TRACE) << GetTraceName() << ": ~MPITransfer: DONE";
44 }
45 
46 void artdaq::MPITransfer::flush_buffers() {
47  TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Collecting requests that need to be waited on";
48  std::vector<MPI_Request> reqs;
49  for (size_t ii = 0; ii < reqs_.size(); ++ii) {
50  if (reqs_[ii] != MPI_REQUEST_NULL) {
51  reqs.push_back(reqs_[ii]);
52  }
53  }
54  if (reqs.size() > 0) {
55  TLOG(TLVL_TRACE) << GetTraceName() << ": flush_buffers: Waiting on " << reqs.size() << " reqs.";
56  MPI_Waitall(reqs.size(), &reqs[0], MPI_STATUSES_IGNORE);
57  }
58 }
59 
60 artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::transfer_fragment_min_blocking_mode(
61  Fragment const& frag, size_t send_timeout_usec) {
62  return sendFragment(Fragment(frag), send_timeout_usec);
63 }
64 
65 artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::transfer_fragment_reliable_mode(Fragment&& frag) {
66  return sendFragment(std::move(frag), 0);
67 }
68 
69 artdaq::TransferInterface::CopyStatus artdaq::MPITransfer::sendFragment(Fragment&& frag, size_t send_timeout_usec) {
70  if (frag.dataSize() > max_fragment_size_words_) {
71  TLOG(TLVL_WARNING) << GetTraceName() << " Fragment has size (" << frag.dataSize()
72  << ") larger than max_fragment_size_words_ (" << max_fragment_size_words_ << ")."
73  << " Total buffer space is: " << max_fragment_size_words_ * buffer_count_
74  << " words. Multiple over-size Fragments will exhaust the buffer!";
75  }
76 
77  auto start_time = std::chrono::steady_clock::now();
78 
79  TLOG(5) << GetTraceName() << ": sendFragment: Finding available send slot, send_timeout_usec=" << send_timeout_usec;
80  auto req_idx = findAvailable();
81  auto counter = 0;
82  while (req_idx == RECV_TIMEOUT &&
83  (send_timeout_usec == 0 || TimeUtils::GetElapsedTimeMicroseconds(start_time) < send_timeout_usec)) {
84  usleep(1000);
85  req_idx = findAvailable();
86  counter++;
87  if (counter % 1000 == 0) {
88  TLOG(TLVL_INFO) << GetTraceName() << " Rank " << source_rank() << " waiting for available buffer to "
89  << destination_rank() << ". "
90  << "Waited " << TimeUtils::GetElapsedTimeMilliseconds(start_time) << " ms so far.";
91  }
92  }
93  if (req_idx == TransferInterface::RECV_TIMEOUT) {
94  TLOG(TLVL_WARNING) << GetTraceName() << ": sendFragment: No buffers available! Returning RECV_TIMEOUT!";
95  return CopyStatus::kTimeout;
96  }
97 
98  TLOG(5) << GetTraceName() << ": sendFragment send slot is " << req_idx;
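 // reqs_ stores two requests per buffer (header send, data send), so the buffer index is half the request index.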
99  auto buffer_idx = req_idx / 2;
100  TLOG(5) << GetTraceName() << ": sendFragment: Swapping in fragment to send to buffer " << buffer_idx;
101  Fragment& curfrag = payload_[buffer_idx];
102  curfrag = std::move(frag);
103 
104  TLOG(5) << GetTraceName() << ": sendFragment before send src=" << source_rank() << " dest=" << destination_rank()
105  << " seqID=" << curfrag.sequenceID() << " type=" << curfrag.typeString() << " found_idx=" << req_idx;
106 
107  std::unique_lock<std::mutex> lk(mpi_mutex_);
108 
109  // 14-Sep-2015, KAB: we should consider MPI_Issend here (see below)...
110  TLOG(5) << GetTraceName() << ": sendFragment: Using MPI_Issend";
111  // Waits for the receiver to acknowledge the header
112  MPI_Issend(curfrag.headerAddress(), detail::RawFragmentHeader::num_words() * sizeof(RawDataType), MPI_BYTE,
113  destination_rank(), MPI_TAG_HEADER, MPI_COMM_WORLD, &reqs_[req_idx]);
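 // MPI_Issend is a non-blocking synchronous send: the request completes only after the matching
 // receive has started, which throttles the sender to the receiver's pace.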
114 
115  auto sizeWrds = curfrag.size() - detail::RawFragmentHeader::num_words();
116  auto offset = curfrag.headerAddress() + detail::RawFragmentHeader::num_words();
117  MPI_Issend(offset, sizeWrds * sizeof(RawDataType), MPI_BYTE, destination_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
118  &reqs_[req_idx + 1]);
119  TLOG(5) << GetTraceName() << ": sendFragment COMPLETE";
120 
121  TLOG(11) << GetTraceName() << ": sendFragment COMPLETE: "
122  << " buffer_idx=" << buffer_idx << " send_size=" << curfrag.size() << " src=" << source_rank()
123  << " dest=" << destination_rank() << " sequenceID=" << curfrag.sequenceID()
124  << " fragID=" << curfrag.fragmentID();
125  return CopyStatus::kSuccess;
126 }
127 
128 int artdaq::MPITransfer::receiveFragmentHeader(detail::RawFragmentHeader& header, size_t timeout_usec) {
129  TLOG(6) << GetTraceName() << ": receiveFragmentHeader entered tmo=" << timeout_usec << " us (ignored)";
130  MPI_Status status;
131  int wait_result = MPI_SUCCESS;
132 
133  MPI_Request req;
134  {
135  std::unique_lock<std::mutex> lk(mpi_mutex_);
136  MPI_Irecv(&header, header.num_words() * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_HEADER,
137  MPI_COMM_WORLD, &req);
138  }
139  // TLOG(TLVL_DEBUG) << GetTraceName() << ": receiveFragmentHeader: Start of receiveFragment" ;
140 
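 // Poll until the header receive completes, sleeping 1 ms between MPI_Test calls.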
141  int flag;
142  do {
143  std::unique_lock<std::mutex> lk(mpi_mutex_);
144  wait_result = MPI_Test(&req, &flag, &status);
145  if (!flag) {
146  usleep(1000);
147  // TLOG_ARB(6) << uniqueLabel() << " MPITransfer::receiveFragmentHeader wait loop, flag=" << flag ;
148  }
149  } while (!flag);
150 
151  if (req != MPI_REQUEST_NULL) {
152  TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
153  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
154  << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentHeader.";
155  }
156  // TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
157  TLOG(8) << GetTraceName() << ": receiveFragmentHeader recvd";
158 
159  {
160  TLOG(8) << GetTraceName() << ": receiveFragmentHeader: " << my_rank << " Wait_error=" << wait_result
161  << " status_error=" << status.MPI_ERROR << " source=" << status.MPI_SOURCE << " tag=" << status.MPI_TAG
162  << " Fragment_sequenceID=" << (uint64_t)header.sequence_id << " Fragment_size=" << header.word_count
163  << " fragID=" << header.fragment_id;
164  }
165  char err_buffer[MPI_MAX_ERROR_STRING];
166  int resultlen;
167  switch (wait_result) {
168  case MPI_SUCCESS:
169  break;
170  case MPI_ERR_IN_STATUS:
171  MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
172  TLOG(TLVL_ERROR) << GetTraceName() << ": Waitany ERROR: " << err_buffer;
173  break;
174  default:
175  MPI_Error_string(wait_result, err_buffer, &resultlen);
176  TLOG(TLVL_ERROR) << GetTraceName() << ": Waitany ERROR: " << err_buffer;
177  }
178 
179  // TLOG_INFO) << GetTraceName() << " End of receiveFragment" ;
180  return status.MPI_SOURCE;
181 }
182 
183 int artdaq::MPITransfer::receiveFragmentData(RawDataType* destination, size_t wordCount) {
184  TLOG(6) << GetTraceName() << ": receiveFragmentData entered wordCount=" << wordCount;
185  int wait_result = MPI_SUCCESS;
186  MPI_Status status;
187 
188  MPI_Request req;
189  {
190  std::unique_lock<std::mutex> lk(mpi_mutex_);
191  MPI_Irecv(destination, wordCount * sizeof(RawDataType), MPI_BYTE, source_rank(), MPI_TAG_DATA, MPI_COMM_WORLD,
192  &req);
193  }
194  // TLOG(TLVL_DEBUG) << GetTraceName() << " Start of receiveFragment" ;
195 
196  int flag;
197  do {
198  std::unique_lock<std::mutex> lk(mpi_mutex_);
199  wait_result = MPI_Test(&req, &flag, &status);
200  if (!flag) {
201  usleep(1000);
202  // TLOG(6) << GetTraceName() << ": receiveFragmentData wait loop, flag=" << flag ;
203  }
204  } while (!flag);
205  if (req != MPI_REQUEST_NULL) {
206  TLOG(TLVL_ERROR) << GetTraceName() << " INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
207  throw art::Exception(art::errors::LogicError, "MPITransfer: ")
208  << "INTERNAL ERROR: req is not MPI_REQUEST_NULL in receiveFragmentData.";
209  }
210 
211  // TLOG(TLVL_DEBUG) << GetTraceName() << " After testing/waiting res=" << wait_result ;
212  TLOG(8) << GetTraceName() << ": receiveFragmentData recvd";
213 
214  char err_buffer[MPI_MAX_ERROR_STRING];
215  int resultlen;
216  switch (wait_result) {
217  case MPI_SUCCESS:
218  break;
219  case MPI_ERR_IN_STATUS:
220  MPI_Error_string(status.MPI_ERROR, err_buffer, &resultlen);
221  TLOG(TLVL_ERROR) << GetTraceName() << " MPITransfer: Waitany ERROR: " << err_buffer;
222  break;
223  default:
224  MPI_Error_string(wait_result, err_buffer, &resultlen);
225  TLOG(TLVL_ERROR) << GetTraceName() << " MPITransfer: Waitany ERROR: " << err_buffer;
226  }
227 
228  // TLOG_INFO) << GetTraceName() << " End of MPITransfer::receiveFragmentData" ;
229  return status.MPI_SOURCE;
230 }
231 
232 void artdaq::MPITransfer::cancelReq_(MPI_Request req) const {
233  if (req == MPI_REQUEST_NULL) return;
234 
235  TLOG(8) << GetTraceName() << ": Cancelling post";
236 
237  std::unique_lock<std::mutex> lk(mpi_mutex_);
238  int result = MPI_Cancel(&req);
239  if (result == MPI_SUCCESS) {
240  MPI_Status status;
241  MPI_Wait(&req, &status);
242  } else {
243  switch (result) {
244  case MPI_ERR_REQUEST:
245  throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned MPI_ERR_REQUEST.\n";
246  case MPI_ERR_ARG:
247  throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned MPI_ERR_ARG.\n";
248  default:
249  throw art::Exception(art::errors::LogicError, "MPITransfer: ") << "MPI_Cancel returned unknown error code.\n";
250  }
251  }
252 }
253 
254 int artdaq::MPITransfer::findAvailable() {
255  int use_me;
256  int flag = 0, flag2 = 0;
257  size_t loops = 0;
258  // TLOG_ARB(5) << uniqueLabel() << " findAvailable initial pos_=" << pos_ ;
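 // Round-robin scan starting at pos_: test the header and data send requests for each buffer
 // slot in turn, giving up after one full pass over all buffer_count_ slots.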
259  do {
260  use_me = pos_;
261  std::unique_lock<std::mutex> lk(mpi_mutex_);
262  MPI_Test(&reqs_[use_me], &flag, MPI_STATUS_IGNORE);
263  if (flag) {
264  MPI_Test(&reqs_[use_me + 1], &flag2, MPI_STATUS_IGNORE);
265  }
266  pos_ = (pos_ + 2) % reqs_.size();
267  ++loops;
268  } while (flag2 == 0 && loops < buffer_count_);
269  if (flag2 == 0) {  // no completed request pair found after scanning every buffer
270  return TransferInterface::RECV_TIMEOUT;
271  }
272  TLOG(5) << GetTraceName() << " findAvailable returning use_me=" << use_me << " loops=" << loops;
273  // pos_ is pointing at the next slot to check
274  // use_me is pointing at the slot to use
275  return use_me;
276 }
277 
278 DEFINE_ARTDAQ_TRANSFER(artdaq::MPITransfer)
Referenced member documentation:

MPITransfer is a TransferInterface implementation plugin that transfers data using MPI... (MPITransfer.hh:23)
MPITransfer(fhicl::ParameterSet pset, Role role) - MPITransfer Constructor. (MPI_transfer.cc:25)
virtual ~MPITransfer() - MPITransfer Destructor. (MPI_transfer.cc:36)
void flush_buffers() override - Flush any in-flight data. This should be used by the receiver after the receive loop has ended... (MPI_transfer.cc:46)
CopyStatus transfer_fragment_min_blocking_mode(Fragment const &frag, size_t timeout_usec) override - Copy a Fragment to the destination. Forces asynchronous send. (MPI_transfer.cc:60)
CopyStatus transfer_fragment_reliable_mode(Fragment &&frag) override - Move a Fragment to the destination. (MPI_transfer.cc:65)
int receiveFragmentHeader(detail::RawFragmentHeader &header, size_t receiveTimeout) override - Receive a Fragment Header from the transport mechanism.
int receiveFragmentData(RawDataType *destination, size_t wordCount) override - Receive the body of a Fragment to the given destination pointer.
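For orientation, here is a minimal receiver-side sketch of the two-step receive interface documented above; it is an illustration under assumptions, not code from this plugin. The receiveOne helper, the timeout value, and the scratch-buffer handling are hypothetical; only receiveFragmentHeader, receiveFragmentData, and the include paths are taken from this file.

#include <vector>
#include "artdaq-core/Data/Fragment.hh"
#include "artdaq-mpich-plugin/TransferPlugins/MPITransfer.hh"

// Hypothetical helper: pull one fragment (header, then body) through a configured receiver.
void receiveOne(artdaq::TransferInterface& transfer) {
  artdaq::detail::RawFragmentHeader header;
  int source = transfer.receiveFragmentHeader(header, 100000);  // timeout in us; ignored by this plugin
  if (source == artdaq::TransferInterface::RECV_TIMEOUT) return;

  // word_count is the total fragment size in words; receive everything after the header.
  std::vector<artdaq::RawDataType> body(header.word_count - header.num_words());
  transfer.receiveFragmentData(body.data(), body.size());
  // ... reassemble header + body into an artdaq::Fragment and hand it off ...
}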