Skip to content

Commit c924166

Browse files
committed
tfbuilder: wait for all ucx endpoints to be created before publishing ready and closing the listener thread
1 parent 2f0c029 commit c924166

File tree

7 files changed

+48
-44
lines changed

7 files changed

+48
-44
lines changed

src/TfBuilder/TfBuilderDevice.cxx

Lines changed: 18 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -152,39 +152,18 @@ void TfBuilderDevice::InitTask()
152152
}
153153
}
154154

155-
// start the task
156-
if (!start()) {
157-
mShouldExit = true;
158-
throw std::runtime_error("Aborting InitTask(). Cannot configure.");
159-
}
160-
161-
// wait for the memory allocation and registration to finish
162-
lBuffersAllocatedFuture.wait();
163-
if (!lBuffersAllocatedFuture.get()) {
164-
EDDLOG("InitTask::MemorySegment allocation failed. Exiting...");
165-
throw "InitTask::MemorySegment allocation failed. Exiting...";
166-
return;
167-
}
168-
169-
DDDLOG("InitTask completed.");
170-
}
171-
172-
bool TfBuilderDevice::start()
173-
{
155+
// Connect all StfSender gRPCs
174156
while (!mRpc->start(mTfDataRegionSize, mFlpInputHandler->getStfRequestQueue(), mFlpInputHandler->getDataQueue())) {
175157
// check if should stop looking for TfScheduler
176158
if (mRpc->isTerminateRequested()) {
177-
mShouldExit = true;
178-
return false;
159+
return;
179160
}
180161

181162
// try to reach the scheduler unless we should exit
182163
if (IsRunningState() && NewStatePending()) {
183-
mShouldExit = true;
184-
return false;
164+
return;
185165
}
186-
187-
std::this_thread::sleep_for(1s);
166+
std::this_thread::sleep_for(250ms);
188167
}
189168

190169
// we reached the scheduler instance, initialize everything else
@@ -200,13 +179,23 @@ bool TfBuilderDevice::start()
200179
// start file sink
201180
mFileSink.start();
202181

203-
// Start input handlers
182+
// wait for the memory allocation and registration to finish
183+
lBuffersAllocatedFuture.wait();
184+
if (!lBuffersAllocatedFuture.get()) {
185+
EDDLOG("InitTask::MemorySegment allocation failed. Exiting...");
186+
throw std::runtime_error("InitTask::MemorySegment allocation failed. Exiting...");
187+
}
188+
189+
// Start input handlers after the memory is finished allocating
204190
if (!mFlpInputHandler->start()) {
205-
mShouldExit = true;
206-
EDDLOG("Could not initialize input connections. Exiting.");
207-
return false;
191+
throw std::runtime_error("Could not initialize input connections. Exiting.");
208192
}
209193

194+
DDDLOG("InitTask completed.");
195+
}
196+
197+
bool TfBuilderDevice::start()
198+
{
210199
return true;
211200
}
212201

src/TfBuilder/TfBuilderInput.cxx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,11 @@ TfBuilderInput::TfBuilderInput(TfBuilderDevice& pStfBuilderDev, std::shared_ptr<
3939
mRpc(pRpc),
4040
mOutStage(pOutStage)
4141
{
42-
// Select which backend is used
42+
// initialize request and data queues
4343
mStfRequestQueue = std::make_shared<ConcurrentQueue<std::string>>();
4444
mReceivedDataQueue = std::make_shared<ConcurrentQueue<ReceivedStfMeta>>();
4545

46+
// Select which backend is used
4647
auto lTransportOpt = mConfig->getStringParam(DataDistNetworkTransportKey, DataDistNetworkTransportDefault);
4748
if (lTransportOpt == "fmq" || lTransportOpt == "FMQ" || lTransportOpt == "fairmq" || lTransportOpt == "FAIRMQ") {
4849
mInputFairMQ = std::make_unique<TfBuilderInputFairMQ>(pRpc, pStfBuilderDev.TfBuilderI(), *mStfRequestQueue, *mReceivedDataQueue);

src/TfBuilder/TfBuilderInputUCX.cxx

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ void TfBuilderInputUCX::ListenerThread()
8181
// progress the listener worker
8282
const auto lProgress = ucp_worker_progress(listener_worker.ucp_worker);
8383

84-
const auto lSleep = lProgress > 0 ? 0us : 50000us;
84+
const auto lSleep = lProgress > 0 ? 1us : 5000us;
8585

8686
auto lConnInfoOpt = mConnRequestQueue.pop_wait_for(lSleep);
8787
if (!lConnInfoOpt.has_value()) {
@@ -123,7 +123,7 @@ void TfBuilderInputUCX::ListenerThread()
123123
mConnMap[lStfSenderId] = std::move(lConnStruct);
124124
}
125125

126-
IDDLOG("TfBuilderInputUCX:stop: Listener thread stopped.");
126+
DDDLOG("TfBuilderInputUCX: Listener thread stopped.");
127127
}
128128

129129
bool TfBuilderInputUCX::start()
@@ -257,11 +257,28 @@ bool TfBuilderInputUCX::start()
257257
// connection successful
258258
break;
259259

260-
} while(true);
260+
} while(true);
261261

262-
// Start all the threads
262+
// Wait until we have all endpoints for StfSenders
263+
do {
264+
std::size_t lNumConnected = 0;
265+
{
266+
std::scoped_lock lLock(mConnectionMapLock);
267+
lNumConnected = mConnMap.size();
268+
}
269+
270+
if (lNumConnected == lNumStfSenders) {
271+
break;
272+
}
273+
274+
std::this_thread::sleep_for(100ms);
275+
DDDLOG_RL(5000, "TfBuilderInputUCX::start: Waiting for all StfSender ucx endpoints. connected={} total={}", lNumConnected, lNumStfSenders);
276+
} while (true);
277+
278+
// This will stop the Listener thread
263279
mState = RUNNING;
264280

281+
DDDLOG("TfBuilderInputUCX::start: Finished");
265282
return true;
266283
}
267284

src/TfBuilder/TfBuilderRpc.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ ::grpc::Status TfBuilderRpcImpl::BuildTfRequest(::grpc::ServerContext* /*context
295295
}
296296

297297
sNumTfRequests++;
298-
DDDLOG_RL(5000, "Requesting SubTimeFrames. tf_id={} tf_size={} total_requests={}", lTfId, lTfSize, sNumTfRequests);
298+
DDDLOG_GRL(5000, "Requesting SubTimeFrames. tf_id={} tf_size={} total_requests={}", lTfId, lTfSize, sNumTfRequests);
299299

300300
StfDataRequestMessage lStfRequest;
301301
const auto &lTfBuilderId = mDiscoveryConfig->status().info().process_id();
@@ -384,7 +384,7 @@ void TfBuilderRpcImpl::StfRequestThread()
384384
while (mRunning && !lReqVector.empty()) {
385385
// wait for the stf slots to become free
386386
if (mNumReqInFlight.load() >= mMaxNumReqInFlight) {
387-
std::this_thread::sleep_for(5ms);
387+
std::this_thread::sleep_for(500us);
388388
continue; // reevaluate the max TF conditions
389389
}
390390

src/common/discovery/StfSenderRpcClient.cxx

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
#include <string>
1717
#include <chrono>
1818

19-
namespace o2
20-
{
21-
namespace DataDistribution
19+
namespace o2::DataDistribution
2220
{
2321

2422
StfSenderRpcClient::StfSenderRpcClient(const std::string &pEndpoint) {
@@ -57,4 +55,3 @@ std::string StfSenderRpcClient::grpc_status() {
5755

5856

5957
}
60-
}

src/common/discovery/StfSenderRpcClient.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ class StfSenderRpcClientCollection {
150150
}
151151

152152
if (lStfSenderStatus.rpc_endpoint().empty()) {
153-
EDDLOG("StfSender rpc_endpoint field empty. stfs_id={}", lStfSenderId);
153+
DDDLOG("StfSender rpc_endpoint field empty. stfs_id={}", lStfSenderId);
154154
continue;
155155
}
156156

@@ -169,14 +169,14 @@ class StfSenderRpcClientCollection {
169169

170170
if (mClients.size() < lNumStfSenders) {
171171
lWaitForStfSenders = true;
172-
IDDLOG_RL(1000, "gRPC: Connected to {} out of {} StfSenders", mClients.size(), lNumStfSenders);
172+
IDDLOG_RL(10000, "gRPC: Connected to {} out of {} StfSenders", mClients.size(), lNumStfSenders);
173173
}
174174

175175
// check the connection on existing clients
176176
for (auto &[ mCliId, lClient] : mClients) {
177177
if (!lClient->is_ready()) {
178178
lAllConnReady = false;
179-
IDDLOG_RL(1000, "StfSender gRPC client connection is not ready. stfs_id={} grpc_status={}", mCliId, lClient->grpc_status());
179+
IDDLOG_RL(10000, "StfSender gRPC client connection is not ready. stfs_id={} grpc_status={}", mCliId, lClient->grpc_status());
180180
}
181181
}
182182
}

src/common/discovery/TfSchedulerRpcClient.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ class TfSchedulerRpcClient {
4141
template <typename ConsulCli>
4242
bool start(std::shared_ptr<ConsulCli> pConfig) {
4343

44-
if (!mShouldRetryStart) {
44+
if (!should_retry_start()) {
4545
return false;
4646
}
4747

0 commit comments

Comments
 (0)