00001 #define TRACE_NAME "RequestReceiver"
00002
00003 #include "artdaq/DAQrate/RequestReceiver.hh"
00004 #include "artdaq/DAQdata/Globals.hh"
00005 #include "artdaq/DAQrate/detail/RequestMessage.hh"
00006
00007 #include <boost/exception/all.hpp>
00008 #include <boost/throw_exception.hpp>
00009
00010 #include <limits>
00011 #include <iterator>
00012
00013 #include "canvas/Utilities/Exception.h"
00014 #include "cetlib_except/exception.h"
00015 #include "fhiclcpp/ParameterSet.h"
00016
00017 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh"
00018 #include "artdaq-core/Data/Fragment.hh"
00019 #include "artdaq-core/Data/ContainerFragmentLoader.hh"
00020 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00021 #include "artdaq-core/Utilities/TimeUtils.hh"
00022
00023 #include <fstream>
00024 #include <iomanip>
00025 #include <iterator>
00026 #include <iostream>
00027 #include <iomanip>
00028 #include <algorithm>
00029 #include <sys/poll.h>
00030 #include "artdaq/DAQdata/TCPConnect.hh"
00031
00032 artdaq::RequestReceiver::RequestReceiver()
00033 : request_port_(3001)
00034 , request_addr_("227.128.12.26")
00035 , running_(false)
00036 , requests_()
00037 , request_timing_()
00038 , request_stop_requested_(false)
00039 , request_received_(false)
00040 , end_of_run_timeout_ms_(1000)
00041 , should_stop_(false)
00042 , highest_seen_request_(0)
00043 , out_of_order_requests_()
00044 , request_increment_(1)
00045 {}
00046
00047 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps)
00048 : request_port_(ps.get<int>("request_port", 3001))
00049 , request_addr_(ps.get<std::string>("request_address", "227.128.12.26"))
00050 , multicast_out_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0"))
00051 , running_(false)
00052 , requests_()
00053 , request_timing_()
00054 , request_stop_requested_(false)
00055 , request_received_(false)
00056 , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000))
00057 , should_stop_(false)
00058 , highest_seen_request_(0)
00059 , out_of_order_requests_()
00060 , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1))
00061 {
00062 setupRequestListener();
00063 }
00064
00065 void artdaq::RequestReceiver::setupRequestListener()
00066 {
00067 TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_;
00068 request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00069 if (request_socket_ < 0)
00070 {
00071 TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno);
00072 exit(1);
00073 }
00074
00075 struct sockaddr_in si_me_request;
00076
00077 int yes = 1;
00078 if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00079 {
00080 TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno);
00081 exit(1);
00082 }
00083 memset(&si_me_request, 0, sizeof(si_me_request));
00084 si_me_request.sin_family = AF_INET;
00085 si_me_request.sin_port = htons(request_port_);
00086 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00087 if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00088 {
00089 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno);
00090 exit(1);
00091 }
00092
00093 if (request_addr_ != "localhost")
00094 {
00095 struct ip_mreq mreq;
00096 int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr);
00097 if (sts == -1)
00098 {
00099 TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno);
00100 exit(1);
00101 }
00102 sts = GetInterfaceForNetwork(multicast_out_addr_.c_str(), mreq.imr_interface);
00103 if (sts == -1)
00104 {
00105 TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_out_addr_;
00106 exit(1);
00107 }
00108 if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00109 {
00110 TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno);
00111 exit(1);
00112 }
00113 }
00114 TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank;
00115 }
00116
00117 artdaq::RequestReceiver::~RequestReceiver()
00118 {
00119 stopRequestReceiverThread(true);
00120 }
00121
00122 void artdaq::RequestReceiver::stopRequestReceiverThread(bool force)
00123 {
00124 std::unique_lock<std::mutex> lk(state_mutex_);
00125 if (!request_received_ && !force)
00126 {
00127 TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl
00128 << "Check that UDP port " << request_port_ << " is open in the firewall config.";
00129 }
00130 should_stop_ = true;
00131 TLOG(TLVL_DEBUG) << "Joining requestThread";
00132 if (requestThread_.joinable()) requestThread_.join();
00133 bool once = true;
00134 while (running_) {
00135 if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen";
00136 once = false;
00137 usleep(10000);
00138 }
00139
00140 if (request_socket_ != -1)
00141 {
00142 close(request_socket_);
00143 request_socket_ = -1;
00144 }
00145 request_received_ = false;
00146 highest_seen_request_ = 0;
00147 }
00148
00149 void artdaq::RequestReceiver::startRequestReceiverThread()
00150 {
00151 std::unique_lock<std::mutex> lk(state_mutex_);
00152 if (requestThread_.joinable()) requestThread_.join();
00153 should_stop_ = false;
00154 request_stop_requested_ = false;
00155
00156 if (request_socket_ == -1)
00157 {
00158 TLOG(TLVL_INFO) << "Connecting Request Reception socket";
00159 setupRequestListener();
00160 }
00161
00162 TLOG(TLVL_INFO) << "Starting Request Reception Thread";
00163 requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this);
00164 running_ = true;
00165 }
00166
00167 void artdaq::RequestReceiver::receiveRequestsLoop()
00168 {
00169 while (!should_stop_)
00170 {
00171 TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests";
00172
00173 if (request_socket_ == -1)
00174 {
00175 setupRequestListener();
00176 }
00177
00178 int ms_to_wait = 10;
00179 struct pollfd ufds[1];
00180 ufds[0].fd = request_socket_;
00181 ufds[0].events = POLLIN | POLLPRI | POLLERR;
00182 int rv = poll(ufds, 1, ms_to_wait);
00183
00184
00185 if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI))
00186 {
00187 if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP)))
00188 {
00189 close(request_socket_);
00190 request_socket_ = -1;
00191 }
00192 if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_)
00193 {
00194 break;
00195 }
00196 continue;
00197 }
00198
00199 TLOG(11) << "Recieved packet on Request channel";
00200 artdaq::detail::RequestHeader hdr_buffer;
00201 auto sts = recv(request_socket_, &hdr_buffer, sizeof(hdr_buffer), 0);
00202 if (sts < 0)
00203 {
00204 TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno);
00205 close(request_socket_);
00206 request_socket_ = -1;
00207 continue;
00208 }
00209 TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer.header;
00210 if (!hdr_buffer.isValid()) continue;
00211
00212 request_received_ = true;
00213 if (hdr_buffer.mode == artdaq::detail::RequestMessageMode::EndOfRun)
00214 {
00215 TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests...";
00216 request_stop_timeout_ = std::chrono::steady_clock::now();
00217 request_stop_requested_ = true;
00218 }
00219
00220 std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer.packet_count);
00221 size_t recvd = 0;
00222 while (recvd < sizeof(artdaq::detail::RequestPacket) * hdr_buffer.packet_count)
00223 {
00224 ssize_t this_recv = recv(request_socket_, reinterpret_cast<uint8_t*>(&pkt_buffer[0]) + recvd, sizeof(artdaq::detail::RequestPacket) * hdr_buffer.packet_count - recvd, 0);
00225 if (this_recv < 0)
00226 {
00227 TLOG(TLVL_ERROR) << "Error receiving request message data err=" << strerror(errno);
00228 close(request_socket_);
00229 request_socket_ = -1;
00230 continue;
00231
00232 }
00233 recvd += this_recv;
00234 }
00235 bool anyNew = false;
00236
00237 if (should_stop_) break;
00238
00239 for (auto& buffer : pkt_buffer)
00240 {
00241 TLOG(20) << "Request Packet: hdr=" << buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp;
00242 if (!buffer.isValid()) continue;
00243 if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp)
00244 {
00245 TLOG(TLVL_ERROR) << "Received conflicting request for SeqID "
00246 << buffer.sequence_id << "!"
00247 << " Old ts=" << requests_[buffer.sequence_id]
00248 << ", new ts=" << buffer.timestamp << ". Keeping OLD!";
00249 }
00250 else if (!requests_.count(buffer.sequence_id))
00251 {
00252 int delta = buffer.sequence_id - highest_seen_request_;
00253 TLOG(11) << "Recieved request for sequence ID " << buffer.sequence_id
00254 << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")";
00255 if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id))
00256 {
00257 TLOG(11) << "Already serviced this request! Ignoring...";
00258 }
00259 else
00260 {
00261 std::unique_lock<std::mutex> tlk(request_mutex_);
00262 requests_[buffer.sequence_id] = buffer.timestamp;
00263 request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now();
00264 anyNew = true;
00265 }
00266 }
00267 }
00268 if (anyNew)
00269 {
00270 request_cv_.notify_all();
00271 }
00272 }
00273 TLOG(TLVL_DEBUG) << "Ending Request Thread";
00274 running_ = false;
00275 }
00276
00277 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
00278 {
00279 TLOG(10) << "RemoveRequest: Removing request with id " << reqID;
00280 std::unique_lock<std::mutex> lk(request_mutex_);
00281 requests_.erase(reqID);
00282
00283 if (reqID > highest_seen_request_)
00284 {
00285 TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_;
00286 if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_)
00287 {
00288 out_of_order_requests_.insert(reqID);
00289
00290 auto it = out_of_order_requests_.begin();
00291 while (it != out_of_order_requests_.end() && !should_stop_)
00292 {
00293 if (*it == highest_seen_request_ + request_increment_)
00294 {
00295 highest_seen_request_ = *it;
00296 it = out_of_order_requests_.erase(it);
00297 }
00298 else
00299 {
00300 break;
00301 }
00302 }
00303 }
00304 else
00305 {
00306 highest_seen_request_ = reqID;
00307 }
00308 TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_;
00309 }
00310 if (metricMan)
00311 {
00312 metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average);
00313 }
00314 request_timing_.erase(reqID);
00315 }