00001 #include <chrono>
00002
00003 #define TRACE_NAME "DataReceiverManager"
00004 #include "artdaq/DAQdata/Globals.hh"
00005 #include "artdaq/DAQrate/DataReceiverManager.hh"
00006 #include "artdaq/TransferPlugins/MakeTransferPlugin.hh"
00007 #include "cetlib_except/exception.h"
00008 #include <iomanip>
00009
00010 artdaq::DataReceiverManager::DataReceiverManager(const fhicl::ParameterSet& pset, std::shared_ptr<SharedMemoryEventManager> shm)
00011 : stop_requested_(false)
00012 , stop_requested_time_(0)
00013 , source_threads_()
00014 , source_plugins_()
00015 , enabled_sources_()
00016 , recv_frag_count_()
00017 , recv_frag_size_()
00018 , recv_seq_count_()
00019 , receive_timeout_(pset.get<size_t>("receive_timeout_usec", 100000))
00020 , stop_timeout_ms_(pset.get<size_t>("stop_timeout_ms",3000))
00021 , shm_manager_(shm)
00022 , non_reliable_mode_enabled_(pset.get<bool>("non_reliable_mode", false))
00023 , non_reliable_mode_retry_count_(pset.get<size_t>("non_reliable_mode_retry_count", -1))
00024 {
00025 TLOG_DEBUG("DataReceiverManager") << "Constructor" << TLOG_ENDL;
00026 auto enabled_srcs = pset.get<std::vector<int>>("enabled_sources", std::vector<int>());
00027 auto enabled_srcs_empty = enabled_srcs.size() == 0;
00028
00029 if (non_reliable_mode_enabled_)
00030 {
00031 TLOG_WARNING("DataReceiverManager") << "DataReceiverManager is configured to drop data after " << std::to_string(non_reliable_mode_retry_count_)
00032 << " failed attempts to put data into the SharedMemoryEventManager! If this is unexpected, please check your configuration!" << TLOG_ENDL;
00033 }
00034
00035 if (enabled_srcs_empty)
00036 {
00037 TLOG_INFO("DataReceiverManager") << "enabled_sources not specified, assuming all sources enabled." << TLOG_ENDL;
00038 }
00039 else
00040 {
00041 for (auto& s : enabled_srcs)
00042 {
00043 enabled_sources_.insert(s);
00044 }
00045 }
00046
00047 auto srcs = pset.get<fhicl::ParameterSet>("sources", fhicl::ParameterSet());
00048 for (auto& s : srcs.get_pset_names())
00049 {
00050 try
00051 {
00052 auto transfer = std::unique_ptr<TransferInterface>(MakeTransferPlugin(srcs, s,
00053 TransferInterface::Role::kReceive));
00054 auto source_rank = transfer->source_rank();
00055 if (enabled_srcs_empty) enabled_sources_.insert(source_rank);
00056 source_plugins_[source_rank] = std::move(transfer);
00057 }
00058 catch (cet::exception ex)
00059 {
00060 TLOG_WARNING("DataReceiverManager") << "cet::exception caught while setting up source " << s << ": " << ex.what() << TLOG_ENDL;
00061 }
00062 catch (std::exception ex)
00063 {
00064 TLOG_WARNING("DataReceiverManager") << "std::exception caught while setting up source " << s << ": " << ex.what() << TLOG_ENDL;
00065 }
00066 catch (...)
00067 {
00068 TLOG_WARNING("DataReceiverManager") << "Non-cet exception caught while setting up source " << s << "." << TLOG_ENDL;
00069 }
00070 }
00071 if (srcs.get_pset_names().size() == 0)
00072 {
00073 TLOG_ERROR("DataReceiverManager") << "No sources configured!" << TLOG_ENDL;
00074 }
00075 }
00076
00077 artdaq::DataReceiverManager::~DataReceiverManager()
00078 {
00079 TLOG_TRACE("DataReceiverManager") << "~DataReceiverManager: BEGIN: Setting stop_requested to true, frags=" << std::to_string(count()) << ", bytes=" << std::to_string(byteCount()) << TLOG_ENDL;
00080 stop_requested_time_ = TimeUtils::gettimeofday_us();
00081 stop_requested_ = true;
00082
00083 TLOG_TRACE("DataReceiverManager") << "~DataReceiverManager: Joining all threads" << TLOG_ENDL;
00084 for (auto& s : source_threads_)
00085 {
00086 auto& thread = s.second;
00087 if (thread.joinable()) thread.join();
00088 }
00089 shm_manager_.reset();
00090 TLOG_TRACE("DataReceiverManager") << "Destructor END" << TLOG_ENDL;
00091 }
00092
00093
00094 void artdaq::DataReceiverManager::start_threads()
00095 {
00096 for (auto& source : source_plugins_)
00097 {
00098 auto& rank = source.first;
00099 if (enabled_sources_.count(rank))
00100 {
00101 running_sources_.insert(rank);
00102 boost::thread::attributes attrs;
00103 attrs.set_stack_size(4096 * 500);
00104 source_threads_[rank] = boost::thread(attrs, boost::bind(&DataReceiverManager::runReceiver_, this, rank));
00105 }
00106 }
00107 }
00108
00109 void artdaq::DataReceiverManager::runReceiver_(int source_rank)
00110 {
00111 std::chrono::steady_clock::time_point start_time, after_header, before_body;
00112 int ret;
00113 double delta_t, hdr_delta_t, store_delta_t, data_delta_t;
00114 detail::RawFragmentHeader header;
00115 size_t endOfDataCount = -1;
00116 auto sleep_time = receive_timeout_ / 100 > 100000 ? 100000 : receive_timeout_ / 100;
00117 auto max_retries = non_reliable_mode_retry_count_ * ceil(receive_timeout_ / sleep_time);
00118
00119 while (!(stop_requested_ && TimeUtils::gettimeofday_us() - stop_requested_time_ > stop_timeout_ms_ * 1000) && enabled_sources_.count(source_rank))
00120 {
00121 TRACE(16, "DataReceiverManager::runReceiver_: Begin loop");
00122 if (stop_requested_) { receive_timeout_ = stop_timeout_ms_; }
00123
00124 start_time = std::chrono::steady_clock::now();
00125
00126 TRACE(16, "DataReceiverManager::runReceiver_: Calling receiveFragmentHeader");
00127 ret = source_plugins_[source_rank]->receiveFragmentHeader(header, receive_timeout_);
00128 TRACE(16, "DataReceiverManager::runReceiver_: Done with receiveFragmentHeader, ret=%d (should be %d)", ret, source_rank);
00129 if (ret != source_rank)
00130 {
00131 continue;
00132 }
00133
00134 after_header = std::chrono::steady_clock::now();
00135
00136 if (Fragment::isUserFragmentType(header.type) || header.type == Fragment::DataFragmentType || header.type == Fragment::EmptyFragmentType || header.type == Fragment::ContainerFragmentType) {
00137 TLOG_TRACE("DataReceiverManager") << "Received Fragment Header from rank " << source_rank << "." << TLOG_ENDL;
00138 RawDataType* loc = nullptr;
00139 size_t retries = 0;
00140 while (loc == nullptr )
00141 {
00142 loc = shm_manager_->WriteFragmentHeader(header);
00143 if (loc == nullptr) usleep(sleep_time);
00144 if (stop_requested_) return;
00145 retries++;
00146 if (non_reliable_mode_enabled_ && retries > max_retries)
00147 {
00148 loc = shm_manager_->WriteFragmentHeader(header, true);
00149 }
00150 }
00151 if (loc == nullptr)
00152 {
00153
00154 TLOG_ERROR("DataReceiverManager") << "runReceiver_: Could not get data location for event " << std::to_string(header.sequence_id) << TLOG_ENDL;
00155 continue;
00156 }
00157 before_body = std::chrono::steady_clock::now();
00158
00159 TRACE(16, "DataReceiverManager::runReceiver_: Calling receiveFragmentData");
00160 auto ret2 = source_plugins_[source_rank]->receiveFragmentData(loc, header.word_count - header.num_words());
00161 TRACE(16, "DataReceiverManager::runReceiver_: Done with receiveFragmentData, ret2=%d (should be %d)", ret2, source_rank);
00162
00163 if (ret != ret2) {
00164 TLOG_ERROR("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")" << TLOG_ENDL;
00165 throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader! (Expected: " << ret << ", Got: " << ret2 << ")";
00166 }
00167
00168 shm_manager_->DoneWritingFragment(header);
00169 TLOG_TRACE("DataReceiverManager") << "Done receiving fragment with sequence ID " << std::to_string(header.sequence_id) << " from rank " << source_rank << TLOG_ENDL;
00170
00171 recv_frag_count_.incSlot(source_rank);
00172 recv_frag_size_.incSlot(source_rank, header.word_count * sizeof(RawDataType));
00173 recv_seq_count_.setSlot(source_rank, header.sequence_id);
00174 if (endOfDataCount != static_cast<size_t>(-1))
00175 {
00176 TLOG_DEBUG("DataReceiverManager") << "Received fragment " << std::to_string(header.sequence_id) << " from rank " << source_rank
00177 << " (" << std::to_string(recv_frag_count_.slotCount(source_rank)) << "/" << std::to_string(endOfDataCount) << ")" << TLOG_ENDL;
00178 }
00179
00180 if (metricMan)
00181 {
00182 TRACE(6, "DataReceiverManager::runReceiver_: Sending receive stats");
00183 delta_t = TimeUtils::GetElapsedTime(start_time);
00184 hdr_delta_t = TimeUtils::GetElapsedTime(start_time, after_header);
00185 store_delta_t = TimeUtils::GetElapsedTime(after_header, before_body);
00186 data_delta_t = TimeUtils::GetElapsedTime(before_body);
00187 metricMan->sendMetric("Total Receive Time From Rank " + std::to_string(source_rank), delta_t, "s", 1, MetricMode::Accumulate);
00188 metricMan->sendMetric("Total Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.word_count * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
00189 metricMan->sendMetric("Total Receive Rate From Rank " + std::to_string(source_rank), header.word_count * sizeof(RawDataType) / delta_t, "B/s", 1, MetricMode::Average);
00190
00191 metricMan->sendMetric("Header Receive Time From Rank " + std::to_string(source_rank), hdr_delta_t, "s", 1, MetricMode::Accumulate);
00192 metricMan->sendMetric("Header Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>(header.num_words() * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
00193 metricMan->sendMetric("Header Receive Rate From Rank " + std::to_string(source_rank), header.num_words() * sizeof(RawDataType) / hdr_delta_t, "B/s", 1, MetricMode::Average);
00194
00195 metricMan->sendMetric("Shared Memory Wait Time From Rank " + std::to_string(source_rank), store_delta_t, "s", 1, MetricMode::Accumulate);
00196
00197 metricMan->sendMetric("Data Receive Time From Rank " + std::to_string(source_rank), data_delta_t, "s", 1, MetricMode::Accumulate);
00198 metricMan->sendMetric("Data Receive Size From Rank " + std::to_string(source_rank), static_cast<unsigned long>((header.word_count - header.num_words()) * sizeof(RawDataType)), "B", 1, MetricMode::Accumulate);
00199 metricMan->sendMetric("Data Receive Rate From Rank " + std::to_string(source_rank), (header.word_count - header.num_words()) * sizeof(RawDataType) / data_delta_t, "B/s", 1, MetricMode::Average);
00200 metricMan->sendMetric("Data Receive Count From Rank " + std::to_string(source_rank), recv_frag_count_.slotCount(source_rank), "fragments", 3, MetricMode::Accumulate);
00201 TRACE(6, "DataReceiverManager::runReceiver_: Done sending receive stats");
00202 }
00203 }
00204 else if (header.type == Fragment::EndOfDataFragmentType || header.type == Fragment::InitFragmentType || header.type == Fragment::EndOfRunFragmentType || header.type == Fragment::EndOfSubrunFragmentType || header.type == Fragment::ShutdownFragmentType)
00205 {
00206 TLOG_DEBUG("DataReceiverManager") << "Received System Fragment from rank " << source_rank << " of type " << detail::RawFragmentHeader::SystemTypeToString(header.type) << "." << TLOG_ENDL;
00207
00208 FragmentPtr frag(new Fragment(header.word_count - header.num_words()));
00209 memcpy(frag->headerAddress(), &header, header.num_words() * sizeof(RawDataType));
00210 auto ret3 = source_plugins_[source_rank]->receiveFragmentData(frag->headerAddress() + header.num_words(), header.word_count - header.num_words());
00211 if (ret3 != source_rank)
00212 {
00213 TLOG_ERROR("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")" << TLOG_ENDL;
00214 throw cet::exception("DataReceiverManager") << "Unexpected return code from receiveFragmentData after receiveFragmentHeader while receiving System Fragment! (Expected: " << source_rank << ", Got: " << ret3 << ")";
00215 }
00216
00217 switch (header.type)
00218 {
00219 case Fragment::EndOfDataFragmentType:
00220 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00221 endOfDataCount = *(frag->dataBegin());
00222 TLOG_DEBUG("DataReceiverManager") << "EndOfData Fragment indicates that " << std::to_string(endOfDataCount) << " fragments are expected from rank " << source_rank
00223 << " (recvd " << std::to_string(recv_frag_count_.slotCount(source_rank)) << ")." << TLOG_ENDL;
00224 break;
00225 case Fragment::InitFragmentType:
00226 TLOG_DEBUG("DataReceiverManager") << "Received Init Fragment from rank " << source_rank << "." << TLOG_ENDL;
00227 shm_manager_->setRequestMode(detail::RequestMessageMode::Normal);
00228 shm_manager_->SetInitFragment(std::move(frag));
00229 break;
00230 case Fragment::EndOfRunFragmentType:
00231 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00232 break;
00233 case Fragment::EndOfSubrunFragmentType:
00234 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00235 break;
00236 case Fragment::ShutdownFragmentType:
00237 shm_manager_->setRequestMode(detail::RequestMessageMode::EndOfRun);
00238 break;
00239 }
00240 }
00241
00242 if (endOfDataCount <= recv_frag_count_.slotCount(source_rank))
00243 {
00244 running_sources_.erase(source_rank);
00245 return;
00246 }
00247 }
00248 }