$treeview $search $mathjax $extrastylesheet
artdaq
v3_04_00
$projectbrief
|
$projectbrief
|
$searchbox |
00001 #define TRACE_NAME (app_name + "_RequestReceiver").c_str() 00002 #include "artdaq/DAQdata/Globals.hh" 00003 00004 #include "artdaq/DAQrate/RequestReceiver.hh" 00005 #include "artdaq/DAQdata/Globals.hh" 00006 #include "artdaq/DAQrate/detail/RequestMessage.hh" 00007 00008 #include <boost/exception/all.hpp> 00009 #include <boost/throw_exception.hpp> 00010 00011 #include <limits> 00012 #include <iterator> 00013 00014 #include "canvas/Utilities/Exception.h" 00015 #include "cetlib_except/exception.h" 00016 #include "fhiclcpp/ParameterSet.h" 00017 00018 #include "artdaq-core/Utilities/SimpleLookupPolicy.hh" 00019 #include "artdaq-core/Data/Fragment.hh" 00020 #include "artdaq-core/Data/ContainerFragmentLoader.hh" 00021 #include "artdaq-core/Utilities/ExceptionHandler.hh" 00022 #include "artdaq-core/Utilities/TimeUtils.hh" 00023 00024 #include <fstream> 00025 #include <iomanip> 00026 #include <iterator> 00027 #include <iostream> 00028 #include <iomanip> 00029 #include <algorithm> 00030 #include <sys/poll.h> 00031 #include <arpa/inet.h> 00032 #include <netinet/in.h> 00033 #include "artdaq/DAQdata/TCPConnect.hh" 00034 00035 artdaq::RequestReceiver::RequestReceiver() 00036 : request_port_(3001) 00037 , request_addr_("227.128.12.26") 00038 , running_(false) 00039 , run_number_(0) 00040 , requests_() 00041 , request_timing_() 00042 , request_stop_requested_(false) 00043 , request_received_(false) 00044 , end_of_run_timeout_ms_(1000) 00045 , should_stop_(false) 00046 , highest_seen_request_(0) 00047 , out_of_order_requests_() 00048 , request_increment_(1) 00049 {} 00050 00051 artdaq::RequestReceiver::RequestReceiver(const fhicl::ParameterSet& ps) 00052 : request_port_(ps.get<int>("request_port", 3001)) 00053 , request_addr_(ps.get<std::string>("request_address", "227.128.12.26")) 00054 , multicast_out_addr_(ps.get<std::string>("multicast_interface_ip", "0.0.0.0")) 00055 , running_(false) 00056 , run_number_(0) 00057 , requests_() 00058 , request_timing_() 00059 , request_stop_requested_(false) 00060 , request_received_(false) 00061 , end_of_run_timeout_ms_(ps.get<size_t>("end_of_run_quiet_timeout_ms", 1000)) 00062 , should_stop_(false) 00063 , highest_seen_request_(0) 00064 , out_of_order_requests_() 00065 , request_increment_(ps.get<artdaq::Fragment::sequence_id_t>("request_increment", 1)) 00066 { 00067 setupRequestListener(); 00068 } 00069 00070 void artdaq::RequestReceiver::setupRequestListener() 00071 { 00072 TLOG(TLVL_INFO) << "Setting up request listen socket, rank=" << my_rank << ", address=" << request_addr_ << ":" << request_port_; 00073 request_socket_ = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); 00074 if (request_socket_ < 0) 00075 { 00076 TLOG(TLVL_ERROR) << "Error creating socket for receiving data requests! err=" << strerror(errno); 00077 exit(1); 00078 } 00079 00080 struct sockaddr_in si_me_request; 00081 00082 int yes = 1; 00083 if (setsockopt(request_socket_, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) 00084 { 00085 TLOG(TLVL_ERROR) << "Unable to enable port reuse on request socket, err=" << strerror(errno); 00086 exit(1); 00087 } 00088 memset(&si_me_request, 0, sizeof(si_me_request)); 00089 si_me_request.sin_family = AF_INET; 00090 si_me_request.sin_port = htons(request_port_); 00091 si_me_request.sin_addr.s_addr = htonl(INADDR_ANY); 00092 if (bind(request_socket_, (struct sockaddr *)&si_me_request, sizeof(si_me_request)) == -1) 00093 { 00094 TLOG(TLVL_ERROR) << "Cannot bind request socket to port " << request_port_ << ", err=" << strerror(errno); 00095 exit(1); 00096 } 00097 00098 if (request_addr_ != "localhost") 00099 { 00100 struct ip_mreq mreq; 00101 int sts = ResolveHost(request_addr_.c_str(), mreq.imr_multiaddr); 00102 if (sts == -1) 00103 { 00104 TLOG(TLVL_ERROR) << "Unable to resolve multicast request address, err=" << strerror(errno); 00105 exit(1); 00106 } 00107 sts = GetInterfaceForNetwork(multicast_out_addr_.c_str(), mreq.imr_interface); 00108 if (sts == -1) 00109 { 00110 TLOG(TLVL_ERROR) << "Unable to resolve hostname for " << multicast_out_addr_; 00111 exit(1); 00112 } 00113 if (setsockopt(request_socket_, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) 00114 { 00115 TLOG(TLVL_ERROR) << "Unable to join multicast group, err=" << strerror(errno); 00116 exit(1); 00117 } 00118 } 00119 TLOG(TLVL_INFO) << "Done setting up request socket, rank=" << my_rank; 00120 } 00121 00122 artdaq::RequestReceiver::~RequestReceiver() 00123 { 00124 stopRequestReception(true); 00125 } 00126 00127 void artdaq::RequestReceiver::stopRequestReception(bool force) 00128 { 00129 std::unique_lock<std::mutex> lk(state_mutex_); 00130 if (!request_received_ && !force) 00131 { 00132 TLOG(TLVL_ERROR) << "Stop request received by RequestReceiver, but no requests have ever been received." << std::endl 00133 << "Check that UDP port " << request_port_ << " is open in the firewall config."; 00134 } 00135 should_stop_ = true; 00136 if (running_) { 00137 TLOG(TLVL_DEBUG) << "Joining requestThread"; 00138 if (requestThread_.joinable()) requestThread_.join(); 00139 bool once = true; 00140 while (running_) { 00141 if (once) TLOG(TLVL_ERROR) << "running_ is true after thread join! Should NOT happen"; 00142 once = false; 00143 usleep(10000); 00144 } 00145 } 00146 00147 if (request_socket_ != -1) 00148 { 00149 close(request_socket_); 00150 request_socket_ = -1; 00151 } 00152 request_received_ = false; 00153 highest_seen_request_ = 0; 00154 } 00155 00156 void artdaq::RequestReceiver::startRequestReception() 00157 { 00158 std::unique_lock<std::mutex> lk(state_mutex_); 00159 if (requestThread_.joinable()) requestThread_.join(); 00160 should_stop_ = false; 00161 request_stop_requested_ = false; 00162 00163 if (request_socket_ == -1) 00164 { 00165 TLOG(TLVL_INFO) << "Connecting Request Reception socket"; 00166 setupRequestListener(); 00167 } 00168 00169 TLOG(TLVL_INFO) << "Starting Request Reception Thread"; 00170 try { 00171 requestThread_ = boost::thread(&RequestReceiver::receiveRequestsLoop, this); 00172 } 00173 catch (const boost::exception& e) 00174 { 00175 TLOG(TLVL_ERROR) << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno; 00176 std::cerr << "Caught boost::exception starting Request Receiver thread: " << boost::diagnostic_information(e) << ", errno=" << errno << std::endl; 00177 exit(5); 00178 } 00179 running_ = true; 00180 } 00181 00182 void artdaq::RequestReceiver::receiveRequestsLoop() 00183 { 00184 while (!should_stop_) 00185 { 00186 TLOG(16) << "receiveRequestsLoop: Polling Request socket for new requests"; 00187 00188 if (request_socket_ == -1) 00189 { 00190 setupRequestListener(); 00191 } 00192 00193 int ms_to_wait = 10; 00194 struct pollfd ufds[1]; 00195 ufds[0].fd = request_socket_; 00196 ufds[0].events = POLLIN | POLLPRI | POLLERR; 00197 int rv = poll(ufds, 1, ms_to_wait); 00198 00199 // Continue loop if no message received or message does not have correct event ID 00200 if (rv <= 0 || (ufds[0].revents != POLLIN && ufds[0].revents != POLLPRI)) 00201 { 00202 if (rv == 1 && (ufds[0].revents & (POLLNVAL | POLLERR | POLLHUP))) 00203 { 00204 close(request_socket_); 00205 request_socket_ = -1; 00206 } 00207 if (request_stop_requested_ && TimeUtils::GetElapsedTimeMilliseconds(request_stop_timeout_) > end_of_run_timeout_ms_) 00208 { 00209 break; 00210 } 00211 continue; 00212 } 00213 00214 TLOG(11) << "Received packet on Request channel"; 00215 std::vector<uint8_t> buffer(MAX_REQUEST_MESSAGE_SIZE); 00216 struct sockaddr_in from; 00217 socklen_t len = sizeof(from); 00218 auto sts = recvfrom(request_socket_, &buffer[0], MAX_REQUEST_MESSAGE_SIZE, 0, (struct sockaddr*)&from, &len); 00219 if (sts < 0) 00220 { 00221 TLOG(TLVL_ERROR) << "Error receiving request message header err=" << strerror(errno); 00222 close(request_socket_); 00223 request_socket_ = -1; 00224 continue; 00225 } 00226 00227 auto hdr_buffer = reinterpret_cast<artdaq::detail::RequestHeader*>(&buffer[0]); 00228 TLOG(11) << "Request header word: 0x" << std::hex << hdr_buffer->header << std::dec << ", packet_count: " << hdr_buffer->packet_count << " from rank " << hdr_buffer->rank << ", " << inet_ntoa(from.sin_addr) << ":" << from.sin_port << ", run number: " << hdr_buffer->run_number; 00229 if (!hdr_buffer->isValid()) continue; 00230 00231 request_received_ = true; 00232 00233 // 19-Dec-2018, KAB: added check on current run number 00234 if (run_number_ != 0 && hdr_buffer->run_number != run_number_) 00235 { 00236 TLOG(TLVL_WARNING) << "Received a Request Message with the wrong run number (" 00237 << hdr_buffer->run_number << "), expected " << run_number_ 00238 << ", ignoring this request."; 00239 continue; 00240 } 00241 00242 if (hdr_buffer->mode == artdaq::detail::RequestMessageMode::EndOfRun) 00243 { 00244 TLOG(TLVL_INFO) << "Received Request Message with the EndOfRun marker. (Re)Starting 1-second timeout for receiving all outstanding requests..."; 00245 request_stop_timeout_ = std::chrono::steady_clock::now(); 00246 request_stop_requested_ = true; 00247 } 00248 00249 std::vector<artdaq::detail::RequestPacket> pkt_buffer(hdr_buffer->packet_count); 00250 memcpy(&pkt_buffer[0], &buffer[sizeof(artdaq::detail::RequestHeader)], sizeof(artdaq::detail::RequestPacket) * hdr_buffer->packet_count); 00251 bool anyNew = false; 00252 00253 if (should_stop_) break; 00254 00255 for (auto& buffer : pkt_buffer) 00256 { 00257 TLOG(20) << "Request Packet: hdr=" << /*std::dec <<*/ buffer.header << ", seq=" << buffer.sequence_id << ", ts=" << buffer.timestamp; 00258 if (!buffer.isValid()) continue; 00259 std::unique_lock<std::mutex> tlk(request_mutex_); 00260 if (requests_.count(buffer.sequence_id) && requests_[buffer.sequence_id] != buffer.timestamp) 00261 { 00262 TLOG(TLVL_ERROR) << "Received conflicting request for SeqID " 00263 << buffer.sequence_id << "!" 00264 << " Old ts=" << requests_[buffer.sequence_id] 00265 << ", new ts=" << buffer.timestamp << ". Keeping OLD!"; 00266 } 00267 else if (!requests_.count(buffer.sequence_id)) 00268 { 00269 int delta = buffer.sequence_id - highest_seen_request_; 00270 TLOG(11) << "Received request for sequence ID " << buffer.sequence_id 00271 << " and timestamp " << buffer.timestamp << " (delta: " << delta << ")"; 00272 if (delta <= 0 || out_of_order_requests_.count(buffer.sequence_id)) 00273 { 00274 TLOG(11) << "Already serviced this request ( sequence ID " << buffer.sequence_id << ")! Ignoring..."; 00275 } 00276 else 00277 { 00278 requests_[buffer.sequence_id] = buffer.timestamp; 00279 request_timing_[buffer.sequence_id] = std::chrono::steady_clock::now(); 00280 anyNew = true; 00281 } 00282 } 00283 } 00284 if (anyNew) 00285 { 00286 request_cv_.notify_all(); 00287 } 00288 } 00289 TLOG(TLVL_DEBUG) << "Ending Request Thread"; 00290 running_ = false; 00291 } 00292 00293 void artdaq::RequestReceiver::RemoveRequest(artdaq::Fragment::sequence_id_t reqID) 00294 { 00295 TLOG(10) << "RemoveRequest: Removing request for id " << reqID; 00296 std::unique_lock<std::mutex> lk(request_mutex_); 00297 requests_.erase(reqID); 00298 00299 if (reqID > highest_seen_request_) 00300 { 00301 TLOG(10) << "RemoveRequest: out_of_order_requests_.size() == " << out_of_order_requests_.size() << ", reqID=" << reqID << ", expected=" << highest_seen_request_ + request_increment_; 00302 if (out_of_order_requests_.size() || reqID != highest_seen_request_ + request_increment_) 00303 { 00304 out_of_order_requests_.insert(reqID); 00305 00306 auto it = out_of_order_requests_.begin(); 00307 while (it != out_of_order_requests_.end() && !should_stop_) // Stop accounting for requests after stop 00308 { 00309 if (*it == highest_seen_request_ + request_increment_) 00310 { 00311 highest_seen_request_ = *it; 00312 it = out_of_order_requests_.erase(it); 00313 } 00314 else 00315 { 00316 break; 00317 } 00318 } 00319 } 00320 else // no out-of-order requests and this request is highest seen + request_increment_ 00321 { 00322 highest_seen_request_ = reqID; 00323 } 00324 TLOG(10) << "RemoveRequest: reqID=" << reqID << " Setting highest_seen_request_ to " << highest_seen_request_; 00325 } 00326 if (metricMan && request_timing_.count(reqID)) 00327 { 00328 metricMan->sendMetric("Request Response Time", TimeUtils::GetElapsedTime(request_timing_[reqID]), "seconds", 2, MetricMode::Average); 00329 } 00330 request_timing_.erase(reqID); 00331 }