00001 #define TRACE_NAME (app_name + "_RequestReceiver").c_str()
00002 #include "artdaq/DAQdata/Globals.hh"
00003
00004 #include "artdaq/DAQrate/RequestReceiver.hh"
00005 #include "artdaq/DAQdata/Globals.hh"
00006 #include "artdaq/DAQrate/detail/RequestMessage.hh"
00007
00008 #include <boost/exception/all.hpp>
00009 #include <boost/throw_exception.hpp>
00010
00011 #include <limits>
00012 #include <iterator>
00013
00014 #include "canvas/Utilities/Exception.h"
00015 #include "cetlib_except/exception.h"
00016 #include "fhiclcpp/ParameterSet.h"
00017
00018 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh"
00019 #include "artdaq-core/Data/Fragment.hh"
00020 #include "artdaq-core/Data/ContainerFragmentLoader.hh"
00021 #include "artdaq-core/Utilities/ExceptionHandler.hh"
00022 #include "artdaq-core/Utilities/TimeUtils.hh"
00023
00024 #include <fstream>
00025 #include <iomanip>
00026 #include <iterator>
00027 #include <iostream>
00028 #include <iomanip>
00029 #include <algorithm>
00030 #include <sys/poll.h>
00031 #include <arpa/inet.h>
00032 #include <netinet/in.h>
00033 #include "artdaq/DAQdata/TCPConnect.hh"
00034
00035 artdaq::RequestReceiver::RequestReceiver()
00036 : request_port_(3001)
00037 , request_addr_("227.128.12.26")
00038 , running_(false)
00039 , requests_()
00040 , request_timing_()
00041 , request_stop_requested_(false)
00042 , request_received_(false)
00043 , end_of_run_timeout_ms_(1000)
00044 , should_stop_(false)
00045 , highest_seen_request_(0)
00046 , out_of_order_requests_()
00047 , request_increment_(1)
00048 {}
00049
00050 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps)
00051 : request_port_(ps.get<int>("request_port", 3001))
00052 , request_addr_(ps.get<std::string>("request_address", "227.128.12.26"))
00053 , multicast_out_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0"))
00054 , running_(false)
00055 , requests_()
00056 , request_timing_()
00057 , request_stop_requested_(false)
00058 , request_received_(false)
00059 , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000))
00060 , should_stop_(false)
00061 , highest_seen_request_(0)
00062 , out_of_order_requests_()
00063 , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1))
00064 {
00065 setupRequestListener();
00066 }
00067
00068 void artdaq::RequestReceiver::setupRequestListener()
00069 {
00070 TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_;
00071 request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
00072 if (request_socket_ < 0)
00073 {
00074 TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno);
00075 exit(1);
00076 }
00077
00078 struct sockaddr_in si_me_request;
00079
00080 int yes = 1;
00081 if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
00082 {
00083 TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno);
00084 exit(1);
00085 }
00086 memset(&si_me_request, 0, sizeof(si_me_request));
00087 si_me_request.sin_family = AF_INET;
00088 si_me_request.sin_port = htons(request_port_);
00089 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY);
00090 if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1)
00091 {
00092 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno);
00093 exit(1);
00094 }
00095
00096 if (request_addr_ != "localhost")
00097 {
00098 struct ip_mreq mreq;
00099 int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr);
00100 if (sts == -1)
00101 {
00102 TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno);
00103 exit(1);
00104 }
00105 sts = GetInterfaceForNetwork(multicast_out_addr_.c_str(), mreq.imr_interface);
00106 if (sts == -1)
00107 {
00108 TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_out_addr_;
00109 exit(1);
00110 }
00111 if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
00112 {
00113 TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno);
00114 exit(1);
00115 }
00116 }
00117 TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank;
00118 }
00119
00120 artdaq::RequestReceiver::~RequestReceiver()
00121 {
00122 stopRequestReceiverThread(true);
00123 }
00124
00125 void artdaq::RequestReceiver::stopRequestReceiverThread(bool force)
00126 {
00127 std::unique_lock<std::mutex> lk(state_mutex_);
00128 if (!request_received_ && !force)
00129 {
00130 TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl
00131 << "Check that UDP port " << request_port_ << " is open in the firewall config.";
00132 }
00133 should_stop_ = true;
00134 if (running_) {
00135 TLOG(TLVL_DEBUG) << "Joining requestThread";
00136 if (requestThread_.joinable()) requestThread_.join();
00137 bool once = true;
00138 while (running_) {
00139 if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen";
00140 once = false;
00141 usleep(10000);
00142 }
00143 }
00144
00145 if (request_socket_ != -1)
00146 {
00147 close(request_socket_);
00148 request_socket_ = -1;
00149 }
00150 request_received_ = false;
00151 highest_seen_request_ = 0;
00152 }
00153
00154 void artdaq::RequestReceiver::startRequestReceiverThread()
00155 {
00156 std::unique_lock<std::mutex> lk(state_mutex_);
00157 if (requestThread_.joinable()) requestThread_.join();
00158 should_stop_ = false;
00159 request_stop_requested_ = false;
00160
00161 if (request_socket_ == -1)
00162 {
00163 TLOG(TLVL_INFO) << "Connecting Request Reception socket";
00164 setupRequestListener();
00165 }
00166
00167 TLOG(TLVL_INFO) << "Starting Request Reception Thread";
00168 try {
00169 requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this);
00170 }
00171 catch (const boost::exception& e)
00172 {
00173 TLOG(TLVL_ERROR) << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno;
00174 std::cerr << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl;
00175 exit(5);
00176 }
00177 running_ = true;
00178 }
00179
00180 void artdaq::RequestReceiver::receiveRequestsLoop()
00181 {
00182 while (!should_stop_)
00183 {
00184 TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests";
00185
00186 if (request_socket_ == -1)
00187 {
00188 setupRequestListener();
00189 }
00190
00191 int ms_to_wait = 10;
00192 struct pollfd ufds[1];
00193 ufds[0].fd = request_socket_;
00194 ufds[0].events = POLLIN | POLLPRI | POLLERR;
00195 int rv = poll(ufds, 1, ms_to_wait);
00196
00197
00198 if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI))
00199 {
00200 if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP)))
00201 {
00202 close(request_socket_);
00203 request_socket_ = -1;
00204 }
00205 if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_)
00206 {
00207 break;
00208 }
00209 continue;
00210 }
00211
00212 TLOG(11) << "Received packet on Request channel";
00213 std::vector<uint8_t> buffer(MAX_REQUEST_MESSAGE_SIZE);
00214 struct sockaddr_in from;
00215 socklen_t len = sizeof(from);
00216 auto sts = recvfrom(request_socket_, &buffer[0], MAX_REQUEST_MESSAGE_SIZE, 0, (struct sockaddr*)&from, &len);
00217 if (sts < 0)
00218 {
00219 TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno);
00220 close(request_socket_);
00221 request_socket_ = -1;
00222 continue;
00223 }
00224
00225 auto hdr_buffer = reinterpret_cast<artdaq::detail::RequestHeader*>(&buffer[0]);
00226 TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer->header << ", packet_count: " << hdr_buffer->packet_count << " from rank " << hdr_buffer->rank << ", " << inet_ntoa(from.sin_addr) << ":" << from.sin_port;
00227 if (!hdr_buffer->isValid()) continue;
00228
00229 request_received_ = true;
00230 if (hdr_buffer->mode == artdaq::detail::RequestMessageMode::EndOfRun)
00231 {
00232 TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests...";
00233 request_stop_timeout_ = std::chrono::steady_clock::now();
00234 request_stop_requested_ = true;
00235 }
00236
00237 std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer->packet_count);
00238 memcpy(&pkt_buffer[0], &buffer[sizeof(artdaq::detail::RequestHeader)], sizeof(artdaq::detail::RequestPacket) * hdr_buffer->packet_count);
00239 bool anyNew = false;
00240
00241 if (should_stop_) break;
00242
00243 for (auto& buffer : pkt_buffer)
00244 {
00245 TLOG(20) << "Request Packet: hdr=" << buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp;
00246 if (!buffer.isValid()) continue;
00247 std::unique_lock<std::mutex> tlk(request_mutex_);
00248 if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp)
00249 {
00250 TLOG(TLVL_ERROR) << "Received conflicting request for SeqID "
00251 << buffer.sequence_id << "!"
00252 << " Old ts=" << requests_[buffer.sequence_id]
00253 << ", new ts=" << buffer.timestamp << ". Keeping OLD!";
00254 }
00255 else if (!requests_.count(buffer.sequence_id))
00256 {
00257 int delta = buffer.sequence_id - highest_seen_request_;
00258 TLOG(11) << "Received request for sequence ID " << buffer.sequence_id
00259 << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")";
00260 if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id))
00261 {
00262 TLOG(11) << "Already serviced this request ( sequence ID " << buffer.sequence_id << ")! Ignoring...";
00263 }
00264 else
00265 {
00266 requests_[buffer.sequence_id] = buffer.timestamp;
00267 request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now();
00268 anyNew = true;
00269 }
00270 }
00271 }
00272 if (anyNew)
00273 {
00274 request_cv_.notify_all();
00275 }
00276 }
00277 TLOG(TLVL_DEBUG) << "Ending Request Thread";
00278 running_ = false;
00279 }
00280
00281 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID)
00282 {
00283 TLOG(10) << "RemoveRequest: Removing request for id " << reqID;
00284 std::unique_lock<std::mutex> lk(request_mutex_);
00285 requests_.erase(reqID);
00286
00287 if (reqID > highest_seen_request_)
00288 {
00289 TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_;
00290 if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_)
00291 {
00292 out_of_order_requests_.insert(reqID);
00293
00294 auto it = out_of_order_requests_.begin();
00295 while (it != out_of_order_requests_.end() && !should_stop_)
00296 {
00297 if (*it == highest_seen_request_ + request_increment_)
00298 {
00299 highest_seen_request_ = *it;
00300 it = out_of_order_requests_.erase(it);
00301 }
00302 else
00303 {
00304 break;
00305 }
00306 }
00307 }
00308 else
00309 {
00310 highest_seen_request_ = reqID;
00311 }
00312 TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_;
00313 }
00314 if (metricMan && request_timing_.count(reqID))
00315 {
00316 metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average);
00317 }
00318 request_timing_.erase(reqID);
00319 }