Commit 7d8c771

HDDS-12469. Mark statemachine unhealthy for write operation timeout. (#8022)
1 parent 9aa41fd commit 7d8c771

File tree

4 files changed: +197 -52 lines changed

hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java

+3 -1

@@ -632,7 +632,9 @@ public final class ScmConfigKeys {
 
   public static final String NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY =
       "net.topology.node.switch.mapping.impl";
-
+  public static final String HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL
+      = "hdds.container.ratis.statemachine.write.wait.interval";
+  public static final long HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL_NS_DEFAULT = 10 * 60 * 1000_000_000L;
   /**
    * Never constructed.
    */
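
Datanodes resolve this key like any other Ozone time-duration setting; the ContainerStateMachine constructor change below reads it in nanoseconds. A minimal sketch of that lookup, assuming only an OzoneConfiguration in hand:

    import java.util.concurrent.TimeUnit;
    import org.apache.hadoop.hdds.conf.OzoneConfiguration;
    import org.apache.hadoop.hdds.scm.ScmConfigKeys;

    OzoneConfiguration conf = new OzoneConfiguration();
    // Falls back to the 10-minute default (10 * 60 * 1_000_000_000 ns) when the key is unset.
    long writeChunkWaitMaxNs = conf.getTimeDuration(
        ScmConfigKeys.HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL,
        ScmConfigKeys.HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL_NS_DEFAULT,
        TimeUnit.NANOSECONDS);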

hadoop-hdds/common/src/main/resources/ozone-default.xml

+8
@@ -3548,6 +3548,14 @@
       Timeout for the request submitted directly to Ratis in datanode.
     </description>
   </property>
+  <property>
+    <name>hdds.container.ratis.statemachine.write.wait.interval</name>
+    <tag>OZONE, DATANODE</tag>
+    <value>10m</value>
+    <description>
+      Timeout for the write path for container blocks.
+    </description>
+  </property>
   <property>
     <name>hdds.datanode.slow.op.warning.threshold</name>
     <tag>OZONE, DATANODE, PERFORMANCE</tag>
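
The 10m default here is the same duration as the nanosecond constant added to ScmConfigKeys above; a quick sanity check of that equivalence:

    import java.util.concurrent.TimeUnit;
    import org.apache.hadoop.hdds.scm.ScmConfigKeys;

    // 10 minutes expressed in nanoseconds: 600_000_000_000.
    long tenMinutesNs = TimeUnit.MINUTES.toNanos(10);
    assert tenMinutesNs == ScmConfigKeys.HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL_NS_DEFAULT;

Overrides in ozone-site.xml accept the usual duration suffixes (for example 15m or 30s), since getTimeDuration parses the unit from the string value.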

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java

+132 -51

@@ -35,17 +35,22 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.NavigableMap;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.Set;
+import java.util.SortedMap;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CompletionException;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.Consumer;
 import org.apache.hadoop.hdds.HddsUtils;
@@ -187,13 +192,38 @@ long getStartTime() {
     }
   }
 
+  static class WriteFutures {
+    private final Future<ContainerCommandResponseProto> writeChunkFuture;
+    private final CompletableFuture<Message> raftFuture;
+    private final long startTime;
+
+    WriteFutures(Future<ContainerCommandResponseProto> writeChunkFuture,
+        CompletableFuture<Message> raftFuture, long startTime) {
+      this.writeChunkFuture = writeChunkFuture;
+      this.raftFuture = raftFuture;
+      this.startTime = startTime;
+    }
+
+    public Future<ContainerCommandResponseProto> getWriteChunkFuture() {
+      return writeChunkFuture;
+    }
+
+    public CompletableFuture<Message> getRaftFuture() {
+      return raftFuture;
+    }
+
+    long getStartTime() {
+      return startTime;
+    }
+  }
+
   private final SimpleStateMachineStorage storage =
       new SimpleStateMachineStorage();
   private final ContainerDispatcher dispatcher;
   private final ContainerController containerController;
   private final XceiverServerRatis ratisServer;
-  private final ConcurrentHashMap<Long,
-      CompletableFuture<ContainerCommandResponseProto>> writeChunkFutureMap;
+  private final NavigableMap<Long, WriteFutures> writeChunkFutureMap;
+  private final long writeChunkWaitMaxNs;
 
   // keeps track of the containers created per pipeline
   private final Map<Long, Long> container2BCSIDMap;
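
Swapping ConcurrentHashMap for ConcurrentSkipListMap is what makes the new timeout check and the flush rewrite below possible: the skip list keeps entries sorted by log index, so the oldest in-flight write is firstEntry() and "everything up to index" is a headMap view. A standalone sketch of those two operations (toy values, not the Ozone types):

    import java.util.NavigableMap;
    import java.util.concurrent.ConcurrentSkipListMap;

    NavigableMap<Long, String> pending = new ConcurrentSkipListMap<>();
    pending.put(7L, "write@7");
    pending.put(3L, "write@3");
    pending.put(5L, "write@5");
    System.out.println(pending.firstEntry().getKey());       // 3 -- oldest log index
    System.out.println(pending.headMap(5L, true).keySet());  // [3, 5] -- entries at or below 5

Both calls are lock-free reads; the trade-off versus ConcurrentHashMap is O(log n) rather than O(1) per operation, which should be negligible at the number of in-flight writes a single pipeline sees.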
@@ -229,7 +259,7 @@ public ContainerStateMachine(HddsDatanodeService hddsDatanodeService, RaftGroupI
     this.containerController = containerController;
     this.ratisServer = ratisServer;
     metrics = CSMMetrics.create(gid);
-    this.writeChunkFutureMap = new ConcurrentHashMap<>();
+    this.writeChunkFutureMap = new ConcurrentSkipListMap<>();
     applyTransactionCompletionMap = new ConcurrentHashMap<>();
     this.unhealthyContainers = ConcurrentHashMap.newKeySet();
     long pendingRequestsBytesLimit = (long)conf.getStorageSize(
@@ -273,6 +303,8 @@ public ContainerStateMachine(HddsDatanodeService hddsDatanodeService, RaftGroupI
     this.waitOnBothFollowers = conf.getObject(
         DatanodeConfiguration.class).waitOnAllFollowers();
 
+    this.writeChunkWaitMaxNs = conf.getTimeDuration(ScmConfigKeys.HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL,
+        ScmConfigKeys.HDDS_CONTAINER_RATIS_STATEMACHINE_WRITE_WAIT_INTERVAL_NS_DEFAULT, TimeUnit.NANOSECONDS);
   }
 
   private void validatePeers() throws IOException {
@@ -542,6 +574,16 @@ private ContainerCommandResponseProto dispatchCommand(
   private CompletableFuture<Message> writeStateMachineData(
       ContainerCommandRequestProto requestProto, long entryIndex, long term,
       long startTime) {
+    final WriteFutures previous = writeChunkFutureMap.get(entryIndex);
+    if (previous != null) {
+      // The state machine generally waits forever; as a precaution, this check handles a retry of the same index.
+      return previous.getRaftFuture();
+    }
+    try {
+      validateLongRunningWrite();
+    } catch (StorageContainerException e) {
+      return completeExceptionally(e);
+    }
     final WriteChunkRequestProto write = requestProto.getWriteChunk();
     RaftServer server = ratisServer.getServer();
     Preconditions.checkArgument(!write.getData().isEmpty());
@@ -564,19 +606,22 @@ private CompletableFuture<Message> writeStateMachineData(
         .setContainer2BCSIDMap(container2BCSIDMap)
         .build();
     CompletableFuture<Message> raftFuture = new CompletableFuture<>();
-    // ensure the write chunk happens asynchronously in writeChunkExecutor pool
-    // thread.
-    CompletableFuture<ContainerCommandResponseProto> writeChunkFuture =
-        CompletableFuture.supplyAsync(() -> {
+    // ensure the write chunk happens asynchronously in writeChunkExecutor pool thread.
+    Future<ContainerCommandResponseProto> future = getChunkExecutor(
+        requestProto.getWriteChunk()).submit(() -> {
       try {
         try {
           checkContainerHealthy(write.getBlockID().getContainerID(), true);
         } catch (StorageContainerException e) {
-          return ContainerUtils.logAndReturnError(LOG, e, requestProto);
+          ContainerCommandResponseProto result = ContainerUtils.logAndReturnError(LOG, e, requestProto);
+          handleCommandResult(requestProto, entryIndex, startTime, result, write, raftFuture);
+          return result;
         }
         metrics.recordWriteStateMachineQueueingLatencyNs(
             Time.monotonicNowNanos() - startTime);
-        return dispatchCommand(requestProto, context);
+        ContainerCommandResponseProto result = dispatchCommand(requestProto, context);
+        handleCommandResult(requestProto, entryIndex, startTime, result, write, raftFuture);
+        return result;
       } catch (Exception e) {
         LOG.error("{}: writeChunk writeStateMachineData failed: blockId" +
             "{} logIndex {} chunkName {}", getGroupId(), write.getBlockID(),
@@ -588,55 +633,87 @@ private CompletableFuture<Message> writeStateMachineData(
         stateMachineHealthy.set(false);
         raftFuture.completeExceptionally(e);
         throw e;
+      } finally {
+        // Remove the future once it finishes execution from the writeChunkFutureMap.
+        writeChunkFutureMap.remove(entryIndex);
       }
-    }, getChunkExecutor(requestProto.getWriteChunk()));
+    });
 
-    writeChunkFutureMap.put(entryIndex, writeChunkFuture);
+    writeChunkFutureMap.put(entryIndex, new WriteFutures(future, raftFuture, startTime));
     if (LOG.isDebugEnabled()) {
       LOG.debug("{}: writeChunk writeStateMachineData : blockId" +
           "{} logIndex {} chunkName {}", getGroupId(), write.getBlockID(),
           entryIndex, write.getChunkData().getChunkName());
     }
-    // Remove the future once it finishes execution from the
-    // writeChunkFutureMap.
-    writeChunkFuture.thenApply(r -> {
-      if (r.getResult() != ContainerProtos.Result.SUCCESS
-          && r.getResult() != ContainerProtos.Result.CONTAINER_NOT_OPEN
-          && r.getResult() != ContainerProtos.Result.CLOSED_CONTAINER_IO
-          // After concurrent flushes are allowed on the same key, chunk file inconsistencies can happen and
-          // that should not crash the pipeline.
-          && r.getResult() != ContainerProtos.Result.CHUNK_FILE_INCONSISTENCY) {
-        StorageContainerException sce =
-            new StorageContainerException(r.getMessage(), r.getResult());
-        LOG.error(getGroupId() + ": writeChunk writeStateMachineData failed: blockId" +
+    return raftFuture;
+  }
+
+  private void handleCommandResult(ContainerCommandRequestProto requestProto, long entryIndex, long startTime,
+      ContainerCommandResponseProto r, WriteChunkRequestProto write,
+      CompletableFuture<Message> raftFuture) {
+    if (r.getResult() != ContainerProtos.Result.SUCCESS
+        && r.getResult() != ContainerProtos.Result.CONTAINER_NOT_OPEN
+        && r.getResult() != ContainerProtos.Result.CLOSED_CONTAINER_IO
+        // After concurrent flushes are allowed on the same key, chunk file inconsistencies can happen and
+        // that should not crash the pipeline.
+        && r.getResult() != ContainerProtos.Result.CHUNK_FILE_INCONSISTENCY) {
+      StorageContainerException sce =
+          new StorageContainerException(r.getMessage(), r.getResult());
+      LOG.error(getGroupId() + ": writeChunk writeStateMachineData failed: blockId" +
+          write.getBlockID() + " logIndex " + entryIndex + " chunkName " +
+          write.getChunkData().getChunkName() + " Error message: " +
+          r.getMessage() + " Container Result: " + r.getResult());
+      metrics.incNumWriteDataFails();
+      // If the write fails currently we mark the stateMachine as unhealthy.
+      // This leads to pipeline close. Any change in that behavior requires
+      // handling the entry for the write chunk in cache.
+      stateMachineHealthy.set(false);
+      unhealthyContainers.add(write.getBlockID().getContainerID());
+      raftFuture.completeExceptionally(sce);
+    } else {
+      metrics.incNumBytesWrittenCount(
+          requestProto.getWriteChunk().getChunkData().getLen());
+      if (LOG.isDebugEnabled()) {
+        LOG.debug(getGroupId() +
+            ": writeChunk writeStateMachineData completed: blockId" +
            write.getBlockID() + " logIndex " + entryIndex + " chunkName " +
-            write.getChunkData().getChunkName() + " Error message: " +
-            r.getMessage() + " Container Result: " + r.getResult());
-        metrics.incNumWriteDataFails();
-        // If the write fails currently we mark the stateMachine as unhealthy.
-        // This leads to pipeline close. Any change in that behavior requires
-        // handling the entry for the write chunk in cache.
-        stateMachineHealthy.set(false);
-        unhealthyContainers.add(write.getBlockID().getContainerID());
-        raftFuture.completeExceptionally(sce);
-      } else {
-        metrics.incNumBytesWrittenCount(
-            requestProto.getWriteChunk().getChunkData().getLen());
-        if (LOG.isDebugEnabled()) {
-          LOG.debug(getGroupId() +
-              ": writeChunk writeStateMachineData completed: blockId" +
-            write.getBlockID() + " logIndex " + entryIndex + " chunkName " +
-            write.getChunkData().getChunkName());
-        }
-        raftFuture.complete(r::toByteString);
-        metrics.recordWriteStateMachineCompletionNs(
-            Time.monotonicNowNanos() - startTime);
+            write.getChunkData().getChunkName());
       }
+      raftFuture.complete(r::toByteString);
+      metrics.recordWriteStateMachineCompletionNs(
+          Time.monotonicNowNanos() - startTime);
+    }
+  }
 
-      writeChunkFutureMap.remove(entryIndex);
-      return r;
-    });
-    return raftFuture;
+  private void validateLongRunningWrite() throws StorageContainerException {
+    // get min valid write chunk operation's future context
+    Map.Entry<Long, WriteFutures> writeFutureContextEntry = null;
+    for (boolean found = false; !found;) {
+      writeFutureContextEntry = writeChunkFutureMap.firstEntry();
+      if (null == writeFutureContextEntry) {
+        return;
+      }
+      if (writeFutureContextEntry.getValue().getWriteChunkFuture().isDone()) {
+        // there is a possibility that writeChunkFutureMap may have dangling entry, as remove is done before add future
+        writeChunkFutureMap.remove(writeFutureContextEntry.getKey());
+      } else {
+        found = true;
+      }
+    }
+    // validate for timeout in nanoseconds
+    long waitTime = Time.monotonicNowNanos() - writeFutureContextEntry.getValue().getStartTime();
+    if (waitTime > writeChunkWaitMaxNs) {
+      LOG.error("Write chunk has taken {}ns crossing threshold {}ns for index {} groupId {}, " +
+          "cancelling pending write chunk for this group", waitTime, writeChunkWaitMaxNs,
+          writeFutureContextEntry.getKey(), getGroupId());
+      stateMachineHealthy.set(false);
+      writeChunkFutureMap.forEach((key, value) -> {
+        value.getWriteChunkFuture().cancel(true);
+      });
+      throw new StorageContainerException("Write chunk has taken " + waitTime + "ns crossing threshold "
+          + writeChunkWaitMaxNs + "ns for index " + writeFutureContextEntry.getKey() + " groupId " + getGroupId(),
+          ContainerProtos.Result.CONTAINER_INTERNAL_ERROR);
+    }
   }
 
   private StateMachine.DataChannel getStreamDataChannel(
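
Stripped of the Ozone types, validateLongRunningWrite implements a simple policy: inspect the oldest still-running write; if it has been in flight longer than the threshold, cancel every pending write and fail the caller. A hedged standalone sketch of that policy (Pending and IllegalStateException are stand-ins for WriteFutures and StorageContainerException, not the real classes):

    import java.util.Map;
    import java.util.NavigableMap;
    import java.util.concurrent.Future;

    // Stand-in for WriteFutures: a running task plus its start timestamp.
    record Pending(Future<?> task, long startNs) { }

    static void failIfOldestIsStale(NavigableMap<Long, Pending> pending, long maxWaitNs) {
      Map.Entry<Long, Pending> oldest = pending.firstEntry();
      // Skip entries whose task already finished; the map can briefly hold
      // dangling entries because removal races with insertion.
      while (oldest != null && oldest.getValue().task().isDone()) {
        pending.remove(oldest.getKey());
        oldest = pending.firstEntry();
      }
      if (oldest == null) {
        return;  // nothing in flight
      }
      long waitedNs = System.nanoTime() - oldest.getValue().startNs();
      if (waitedNs > maxWaitNs) {
        // One stuck write poisons the whole group: cancel everything pending
        // (the real code also marks the state machine unhealthy, which closes the pipeline).
        pending.values().forEach(p -> p.task().cancel(true));
        throw new IllegalStateException(
            "write stuck for " + waitedNs + "ns at index " + oldest.getKey());
      }
    }

Cancellation interrupts the executor thread running the chunk write, which is why the first write in the test below surfaces an InterruptedException while subsequent writes are rejected up front.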
@@ -819,9 +896,13 @@ private ByteString readStateMachineData(
    */
   @Override
   public CompletableFuture<Void> flush(long index) {
-    return CompletableFuture.allOf(
-        writeChunkFutureMap.entrySet().stream().filter(x -> x.getKey() <= index)
-            .map(Map.Entry::getValue).toArray(CompletableFuture[]::new));
+    final SortedMap<Long, WriteFutures> head = writeChunkFutureMap.headMap(index, true);
+    if (head.isEmpty()) {
+      return CompletableFuture.completedFuture(null);
+    }
+    return CompletableFuture.allOf(head.values().stream()
+        .map(WriteFutures::getRaftFuture)
+        .toArray(CompletableFuture[]::new));
   }
 
   /**
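
The rewritten flush(index) waits only on writes at or below the given log index, and headMap(index, true) hands those back as a live view instead of the old full-map stream-and-filter. One detail worth noting: CompletableFuture.allOf over an empty array already returns a completed future, so the explicit isEmpty early-out skips building the array rather than fixing correctness. The idiom in isolation (a sketch assuming a map of raft futures keyed by log index):

    static CompletableFuture<Void> flushUpTo(
        NavigableMap<Long, CompletableFuture<Void>> pending, long index) {
      // View of every entry with key <= index; no copying of the map.
      return CompletableFuture.allOf(
          pending.headMap(index, true).values().toArray(new CompletableFuture[0]));
    }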

hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/TestContainerStateMachine.java

+54

@@ -22,6 +22,7 @@
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.reset;
 import static org.mockito.Mockito.times;
@@ -30,8 +31,10 @@
 
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import java.io.IOException;
+import java.lang.reflect.Field;
 import java.util.List;
 import java.util.UUID;
+import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.ThreadPoolExecutor;
@@ -57,6 +60,7 @@
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.TestInstance;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
@@ -246,4 +250,54 @@ public void testApplyTransactionFailure(boolean failWithException) throws Execut
         ContainerProtos.ContainerCommandResponseProto.parseFrom(succcesfulTransaction.getContent());
     assertEquals(ContainerProtos.Result.SUCCESS, resp.getResult());
   }
+
+  @Test
+  public void testWriteTimeout() throws Exception {
+    RaftProtos.LogEntryProto entry = mock(RaftProtos.LogEntryProto.class);
+    when(entry.getTerm()).thenReturn(1L);
+    when(entry.getIndex()).thenReturn(1L);
+    RaftProtos.LogEntryProto entryNext = mock(RaftProtos.LogEntryProto.class);
+    when(entryNext.getTerm()).thenReturn(1L);
+    when(entryNext.getIndex()).thenReturn(2L);
+    TransactionContext trx = mock(TransactionContext.class);
+    ContainerStateMachine.Context context = mock(ContainerStateMachine.Context.class);
+    when(trx.getStateMachineContext()).thenReturn(context);
+    doAnswer(e -> {
+      try {
+        Thread.sleep(200000);
+      } catch (InterruptedException ie) {
+        Thread.currentThread().interrupt();
+        throw ie;
+      }
+      return null;
+    }).when(dispatcher).dispatch(any(), any());
+
+    when(context.getRequestProto()).thenReturn(ContainerProtos.ContainerCommandRequestProto.newBuilder()
+        .setCmdType(ContainerProtos.Type.WriteChunk).setWriteChunk(
+            ContainerProtos.WriteChunkRequestProto.newBuilder().setData(ByteString.copyFromUtf8("Test Data"))
+                .setBlockID(
+                    ContainerProtos.DatanodeBlockID.newBuilder().setContainerID(1).setLocalID(1).build()).build())
+        .setContainerID(1)
+        .setDatanodeUuid(UUID.randomUUID().toString()).build());
+    AtomicReference<Throwable> throwable = new AtomicReference<>(null);
+    Function<Throwable, ? extends Message> throwableSetter = t -> {
+      throwable.set(t);
+      return null;
+    };
+    Field writeChunkWaitMaxNs = stateMachine.getClass().getDeclaredField("writeChunkWaitMaxNs");
+    writeChunkWaitMaxNs.setAccessible(true);
+    writeChunkWaitMaxNs.set(stateMachine, 1000_000_000);
+    CompletableFuture<Message> firstWrite = stateMachine.write(entry, trx);
+    Thread.sleep(2000);
+    CompletableFuture<Message> secondWrite = stateMachine.write(entryNext, trx);
+    firstWrite.exceptionally(throwableSetter).get();
+    assertNotNull(throwable.get());
+    assertInstanceOf(InterruptedException.class, throwable.get());
+
+    secondWrite.exceptionally(throwableSetter).get();
+    assertNotNull(throwable.get());
+    assertInstanceOf(StorageContainerException.class, throwable.get());
+    StorageContainerException sce = (StorageContainerException) throwable.get();
+    assertEquals(ContainerProtos.Result.CONTAINER_INTERNAL_ERROR, sce.getResult());
+  }
 }
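
The test lowers the private writeChunkWaitMaxNs field to one second via reflection, blocks the mocked dispatcher for 200 seconds so the first write can never finish, and then issues a second write after the threshold has elapsed. The assertions capture both sides of the new behavior: the stuck first write is cancelled (surfacing as an InterruptedException), and the second write is rejected before dispatch by validateLongRunningWrite with CONTAINER_INTERNAL_ERROR.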
