From 0bee7e1f837e1500c9534206123eb439f52bce9b Mon Sep 17 00:00:00 2001
From: chayanikabhandary
Date: Thu, 26 Jun 2025 15:39:12 +0530
Subject: [PATCH 1/2] HDFS-17803: Compute correct checksum type when file is empty

---
 .../hadoop/hdfs/FileChecksumHelper.java       | 34 +++++++--
 .../hadoop/hdfs/TestFileChecksumHelper.java   | 75 +++++++++++++++++++
 2 files changed, 101 insertions(+), 8 deletions(-)
 create mode 100644 hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java

diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/FileChecksumHelper.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/FileChecksumHelper.java
index 21bc885358076..6239ddfe1a41e 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/FileChecksumHelper.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/FileChecksumHelper.java
@@ -240,20 +240,38 @@ void compute() throws IOException {
        * magic entry that matches what previous hdfs versions return.
        */
       if (locatedBlocks == null || locatedBlocks.isEmpty()) {
-        // Explicitly specified here in case the default DataOutputBuffer
-        // buffer length value is changed in future. This matters because the
-        // fixed value 32 has to be used to repeat the magic value for previous
-        // HDFS version.
-        final int lenOfZeroBytes = 32;
-        byte[] emptyBlockMd5 = new byte[lenOfZeroBytes];
-        MD5Hash fileMD5 = MD5Hash.digest(emptyBlockMd5);
-        fileChecksum = new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
+        fileChecksum = makeEmptyBlockResult();
       } else {
         checksumBlocks();
         fileChecksum = makeFinalResult();
       }
     }
 
+    /**
+     * Returns a zero byte checksum based on the combine mode and CRC type.
+     * @return checksum reported for a file that has no located blocks.
+     */
+    FileChecksum makeEmptyBlockResult() {
+      // Explicitly specified here in case the default DataOutputBuffer
+      // buffer length value is changed in future. This matters because the
+      // fixed value 32 has to be used to repeat the magic value for previous
+      // HDFS version.
+      final int lenOfZeroBytes = 32;
+      final MD5Hash fileMD5 = MD5Hash.digest(new byte[lenOfZeroBytes]);
+
+      switch (combineMode) {
+      case MD5MD5CRC:
+        if (getCrcType() == DataChecksum.Type.CRC32C) {
+          return new MD5MD5CRC32CastagnoliFileChecksum(0, 0, fileMD5);
+        }
+        return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
+      case COMPOSITE_CRC:
+        return new CompositeCrcFileChecksum(0, getCrcType(), bytesPerCRC);
+      default:
+        return new MD5MD5CRC32GzipFileChecksum(0, 0, fileMD5);
+      }
+    }
+
     /**
      * Compute block checksums block by block and append the raw bytes of the
      * block checksums into getBlockChecksumBuf().
diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java
new file mode 100644
index 0000000000000..288a7cea927ca
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java
@@ -0,0 +1,75 @@
+package org.apache.hadoop.hdfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileChecksum;
+import org.apache.hadoop.fs.MD5MD5CRC32CastagnoliFileChecksum;
+import org.apache.hadoop.fs.MD5MD5CRC32GzipFileChecksum;
+import org.apache.hadoop.fs.CompositeCrcFileChecksum;
+import org.apache.hadoop.fs.Options.ChecksumCombineMode;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
+import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.mockito.Mockito.mock;
+
+
+import org.apache.hadoop.util.DataChecksum;
+
+/**
+ * Verifies the checksum class that {@link FileChecksumHelper} produces for a
+ * file without blocks, for each combine mode and CRC type listed in data().
+ */
+@RunWith(Parameterized.class)
+public class TestFileChecksumHelper {
+
+  private final ChecksumCombineMode combineMode;
+  private final DataChecksum.Type crcType;
+  private final Class<? extends FileChecksum> expectedChecksumClass;
+
+  public TestFileChecksumHelper(ChecksumCombineMode combineMode,
+                                DataChecksum.Type crcType,
+                                Class<? extends FileChecksum> expectedChecksumClass) {
+    this.combineMode = combineMode;
+    this.crcType = crcType;
+    this.expectedChecksumClass = expectedChecksumClass;
+  }
+
+  @Parameterized.Parameters(name = "{index}: Mode={0}, CRC={1}, Expect={2}")
+  public static Collection<Object[]> data() {
+    return Arrays.asList(new Object[][]{
+        {ChecksumCombineMode.MD5MD5CRC, DataChecksum.Type.CRC32, MD5MD5CRC32GzipFileChecksum.class},
+        {ChecksumCombineMode.MD5MD5CRC, DataChecksum.Type.CRC32C, MD5MD5CRC32CastagnoliFileChecksum.class},
+        {ChecksumCombineMode.COMPOSITE_CRC, DataChecksum.Type.CRC32, CompositeCrcFileChecksum.class},
+        {ChecksumCombineMode.COMPOSITE_CRC, DataChecksum.Type.CRC32C, CompositeCrcFileChecksum.class},
+    });
+  }
+
+  @Test
+  public void testComputeReturnsCorrectChecksumForEmptyBlocks() throws Exception {
+    LocatedBlocks emptyBlocks = new LocatedBlocks(); // No blocks
+
+    DFSClient mockClient = mock(DFSClient.class);
+    ClientProtocol mockNamenode = mock(ClientProtocol.class);
+
+    FileChecksumHelper.ReplicatedFileChecksumComputer checker =
+        new FileChecksumHelper.ReplicatedFileChecksumComputer(
+            "/empty-file", 0L, emptyBlocks,
+            mockNamenode, mockClient, combineMode
+        );
+
+    checker.setCrcType(crcType);
+    checker.setBytesPerCRC(512);
+    checker.compute();
+    FileChecksum checksum = checker.getFileChecksum();
+
+    assertNotNull("Checksum must not be null", checksum);
+    assertEquals("Unexpected checksum class", expectedChecksumClass, checksum.getClass());
+  }
+}
From 1c94a94d7631fb4072e38b3dd2e47e3dc6dc199b Mon Sep 17 00:00:00
2001 From: chayanikabhandary Date: Fri, 27 Jun 2025 10:08:01 +0530 Subject: [PATCH 2/2] add license --- .../hadoop/hdfs/TestFileChecksumHelper.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java b/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java index 288a7cea927ca..ea5ac9519390e 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs-client/src/test/java/org/apache/hadoop/hdfs/TestFileChecksumHelper.java @@ -1,3 +1,20 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.hadoop.hdfs; import org.apache.hadoop.conf.Configuration;