From 061ba3de702b3fae3cab54fc7dbb22791366495b Mon Sep 17 00:00:00 2001 From: Abhishek Dixit Date: Thu, 2 May 2024 12:12:41 +0530 Subject: [PATCH 1/2] update test and fix --- .../converter/ParquetMetadataConverter.java | 3 +- .../TestParquetMetadataConverter.java | 45 ++++++++++++++++--- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java index e752b4ceea..e5eaa22f02 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java @@ -556,8 +556,7 @@ private void addRowGroup( columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset()); - if (columnMetaData.getEncodingStats() != null - && columnMetaData.getEncodingStats().hasDictionaryPages()) { + if (columnMetaData.hasDictionaryPage()) { metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset()); } long bloomFilterOffset = columnMetaData.getBloomFilterOffset(); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 4dcede624f..017801749d 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -225,6 +225,32 @@ public void testParquetMetadataConverterWithDictionary() throws IOException { Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted); } + @Test + public void testParquetMetadataConverterWithDictionaryWithoutStats() throws IOException { + ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, false); + + ParquetMetadataConverter converter = new ParquetMetadataConverter(); + FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData); + + // Flag should be true + fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> { + assertTrue(column.meta_data.isSetDictionary_page_offset()); + })); + + ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream(); + Util.writeFileMetaData(fmd1, metaDataOutputStream); + ByteArrayInputStream metaDataInputStream = new ByteArrayInputStream(metaDataOutputStream.toByteArray()); + FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream); + ParquetMetadata parquetMetaDataConverted = converter.fromParquetMetadata(fmd2); + + long dicOffsetOriginal = + parquetMetaData.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset(); + long dicOffsetConverted = + parquetMetaDataConverted.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset(); + + Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted); + } + @Test public void testParquetMetadataConverterWithoutDictionary() throws IOException { ParquetMetadata parquetMetaData = createParquetMetaData(null, Encoding.PLAIN); @@ -1248,17 +1274,26 @@ private static Statistics createStatsTyped(PrimitiveType type, BigInteger min } private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) { + return createParquetMetaData(dicEncoding, dataEncoding, true); + } + + + private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding, + boolean includeDicStats) { MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }"); org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap(), null); List blockMetaDataList = new ArrayList(); BlockMetaData blockMetaData = new BlockMetaData(); - EncodingStats.Builder builder = new EncodingStats.Builder(); - if (dicEncoding != null) { - builder.addDictEncoding(dicEncoding).build(); + EncodingStats es = null; + if (includeDicStats) { + EncodingStats.Builder builder = new EncodingStats.Builder(); + if (dicEncoding != null) { + builder.addDictEncoding(dicEncoding).build(); + } + builder.addDataEncoding(dataEncoding); + es = builder.build(); } - builder.addDataEncoding(dataEncoding); - EncodingStats es = builder.build(); Set e = new HashSet(); PrimitiveTypeName t = PrimitiveTypeName.INT32; ColumnPath p = ColumnPath.get("col"); From f71202f2f6271bfeaa6b20d280ba9e51aba6f0b6 Mon Sep 17 00:00:00 2001 From: Abhishek Dixit Date: Mon, 6 May 2024 13:50:40 +0530 Subject: [PATCH 2/2] fix styling error --- .../format/converter/TestParquetMetadataConverter.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java index 017801749d..344785593f 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/format/converter/TestParquetMetadataConverter.java @@ -1277,9 +1277,8 @@ private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encod return createParquetMetaData(dicEncoding, dataEncoding, true); } - - private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding, - boolean includeDicStats) { + private static ParquetMetadata createParquetMetaData( + Encoding dicEncoding, Encoding dataEncoding, boolean includeDicStats) { MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }"); org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap(), null);