diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java index b782c8488c6..7c8f47636e5 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java @@ -197,6 +197,10 @@ private RowSet computeIndex() { for (int rgi = 0; rgi < rowGroups.length; ++rgi) { final long subRegionSize = rowGroups[rgi].getNum_rows(); + if (subRegionSize == 0) { + // Skip empty row groups + continue; + } final long subRegionFirstKey = (long) rgi << regionParameters.regionMaskNumBits; final long subRegionLastKey = subRegionFirstKey + subRegionSize - 1; sequentialBuilder.appendRange(subRegionFirstKey, subRegionLastKey); diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index a6d683afcc0..065f708e5b3 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -1735,6 +1735,47 @@ public void testAllNonPartitioningColumnTypes() { } } + @Test + public void testReadingParquetDataWithEmptyRowGroups() { + { + // Single parquet file with empty row group + final String path = + TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup1.parquet").getFile(); + final Table fromDisk = + readTable(path, EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE)).select(); + assertEquals(0, fromDisk.size()); + assertTrue(fromDisk.getRowSet().isEmpty()); + } + + { + // Single parquet file with three row groups, first and third row group are non-empty, and second row group + // is empty. To generate this file, the following branch was used: + // https://github.com/malhotrashivam/deephaven-core/tree/sm-ref-branch + final String path = + TestParquetTools.class.getResource("/ReferenceParquetWithEmptyRowGroup2.parquet").getFile(); + final Table fromDisk = + readTable(path, EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.SINGLE_FILE)).select(); + assertEquals(20, fromDisk.size()); + final Table table = TableTools.emptyTable(10).update("integers = (int)(ii%3)"); + final Table expected = merge(table, table); + assertTableEquals(expected, fromDisk); + } + + { + // Parquet dataset with three files, first and third file have three row groups, two non-empty followed by + // an empty row group, and second file has just one empty row group. + final String dirPath = TestParquetTools.class.getResource("/datasetWithEmptyRowgroups").getFile(); + assertFalse(readTable(dirPath + "/file1.parquet").isEmpty()); + assertTrue(readTable(dirPath + "/file2.parquet").isEmpty()); + assertFalse(readTable(dirPath + "/file3.parquet").isEmpty()); + + final Table table = readTable(dirPath).select(); + assertEquals(2138182, table.size()); + assertEquals(4, table.numColumns()); + assertEquals(1068950, table.selectDistinct("price").size()); + } + } + @Test public void decimalLogicalTypeTest() { final Table expected = TableTools.emptyTable(100_000).update( diff --git a/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup1.parquet b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup1.parquet new file mode 100644 index 00000000000..11c4bc2ce8c --- /dev/null +++ b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d4d437881c25c71de1e1d2dccc1727cef00871defce35bd68c81f22cba6d25 +size 553 diff --git a/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet new file mode 100644 index 00000000000..2ce0097288d --- /dev/null +++ b/extensions/parquet/table/src/test/resources/ReferenceParquetWithEmptyRowGroup2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d13c8bb7cb58dba15328674290eb12ae559aae5d175dd33f9a769137acb9f31 +size 489 diff --git a/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file1.parquet b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file1.parquet new file mode 100644 index 00000000000..2b3343ecf6f --- /dev/null +++ b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file1.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cbf99224de156d82106cc716affbc9bf156a95697de545112f86dfd6be56a4b +size 11443200 diff --git a/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file2.parquet b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file2.parquet new file mode 100644 index 00000000000..11c4bc2ce8c --- /dev/null +++ b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file2.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d4d437881c25c71de1e1d2dccc1727cef00871defce35bd68c81f22cba6d25 +size 553 diff --git a/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file3.parquet b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file3.parquet new file mode 100644 index 00000000000..2b3343ecf6f --- /dev/null +++ b/extensions/parquet/table/src/test/resources/datasetWithEmptyRowgroups/file3.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cbf99224de156d82106cc716affbc9bf156a95697de545112f86dfd6be56a4b +size 11443200