From 46f7352f33317ba2eda089bd604b5d3c4b560084 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 22 May 2026 13:57:40 +0800 Subject: [PATCH 1/6] [format] Add row-oriented file format with O(1) row-number lookups Introduce a new .row file format optimized for fast point lookups by row number, designed for deletion vector applications and changelog materialization. The format stores data in ZSTD-compressed blocks with a block index enabling binary search by row number. Key components: - RowFormatWriter/Reader: block-level write and read with projection and selection (RoaringBitmap) pushdown - BlockPrefetcher: concurrent IO with range coalescing (merges adjacent blocks within 256KB gap, up to 2MB per range) and prefetch sliding window - InputStreamPool: lazy stream pool that opens streams on demand for concurrent reads - RowBlockWriter/Reader: compact row serialization supporting all Paimon types including nested ARRAY, MAP, ROW, and VARIANT - RowBlockIndex: delta+zigzag+varint encoded block metadata - Documentation: rowformat.md specification and fileformat.md updates Co-Authored-By: Claude Opus 4.6 --- docs/docs/concepts/spec/fileformat.md | 19 +- docs/docs/concepts/spec/rowformat.md | 213 ++++ docs/sidebars.js | 1 + .../apache/paimon/format/row/BlockInput.java | 131 +++ .../apache/paimon/format/row/BlockOutput.java | 121 +++ .../paimon/format/row/BlockPrefetcher.java | 225 ++++ .../paimon/format/row/InputStreamPool.java | 76 ++ .../paimon/format/row/RowBlockIndex.java | 143 +++ .../paimon/format/row/RowBlockReader.java | 338 ++++++ .../paimon/format/row/RowBlockWriter.java | 351 +++++++ .../paimon/format/row/RowFileFooter.java | 128 +++ .../paimon/format/row/RowFileFormat.java | 95 ++ .../format/row/RowFileFormatFactory.java | 38 + .../format/row/RowFileRecordIterator.java | 118 +++ .../paimon/format/row/RowFormatReader.java | 147 +++ .../format/row/RowFormatReaderFactory.java | 65 ++ .../paimon/format/row/RowFormatWriter.java | 115 +++ 
.../format/row/RowFormatWriterFactory.java | 45 + ...org.apache.paimon.format.FileFormatFactory | 1 + .../format/row/BlockPrefetcherTest.java | 577 +++++++++++ .../format/row/RowFormatReadWriteTest.java | 970 ++++++++++++++++++ 21 files changed, 3915 insertions(+), 2 deletions(-) create mode 100644 docs/docs/concepts/spec/rowformat.md create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java create mode 100644 paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java create mode 100644 
paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java diff --git a/docs/docs/concepts/spec/fileformat.md b/docs/docs/concepts/spec/fileformat.md index baac8f2b0536..a38a667cbb83 100644 --- a/docs/docs/concepts/spec/fileformat.md +++ b/docs/docs/concepts/spec/fileformat.md @@ -24,9 +24,10 @@ under the License. # File Format -Currently, supports Parquet, Avro, ORC, CSV, JSON, and Lance file formats. +Currently, Paimon supports the Parquet, Avro, ORC, CSV, JSON, Lance, and Row file formats. - Recommended column format is Parquet, which has a high compression rate and fast column projection queries. -- Recommended row based format is Avro, which has good performance n reading and writing full row (all columns). +- Recommended row based format is Avro, which has good performance for reading and writing full rows (all columns). +- Recommended format for row-number based O(1) lookups is Row, which stores data in row-oriented blocks with ZSTD compression and supports fast random access by row number. - Recommended testing format is CSV, which has better readability but the worst read-write performance. - Recommended format for ML workloads is Lance, which is optimized for vector search and machine learning use cases. @@ -754,6 +755,20 @@ Limitations: 1. Lance file format does not support `MAP` type. 2. Lance file format does not support `TIMESTAMP_LOCAL_ZONE` type. +## ROW + +The Row format is a row-oriented storage format designed for O(1) random access by row number. Data is organized in blocks with ZSTD Level 1 compression. Each block contains complete rows serialized in a compact binary format with an offset array for direct row positioning. 
+ +Key features: +- **O(1) Row Lookup**: Block index + in-block offset array enables direct access to any row by its global row number +- **Block-level ZSTD Compression**: Each block is independently compressed for good compression ratio with fast decompression +- **Compact Serialization**: Rows are serialized with a null bitmap followed by field values in sequence, minimizing overhead +- **Selection Pushdown**: Supports RoaringBitmap-based row selection, skipping entire blocks that contain no selected rows + +The Row format supports all Paimon data types: BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, CHAR, VARCHAR, BINARY, VARBINARY, DECIMAL, DATE, TIME, TIMESTAMP, TIMESTAMP_LOCAL_ZONE, VARIANT, ARRAY, MAP, ROW. + +For detailed file layout and binary format specification, see [Row Format](./rowformat). + ## BLOB The BLOB format is a specialized format for storing large binary objects such as images, videos, and other multimodal data. Unlike other formats that store data inline, BLOB format stores large binary data in separate files with an optimized layout for random access. diff --git a/docs/docs/concepts/spec/rowformat.md b/docs/docs/concepts/spec/rowformat.md new file mode 100644 index 000000000000..cddbbda22c7f --- /dev/null +++ b/docs/docs/concepts/spec/rowformat.md @@ -0,0 +1,213 @@ +--- +title: "Row Format" +sidebar_position: 8 +--- + + + +# Row Format Specification + +The Row format (`.row`) is a row-oriented file format optimized for O(1) random access by row number. It is designed for scenarios where fast point lookups by row position are critical, such as deletion vector applications and changelog materialization. + +## File Layout + +A `.row` file consists of three sections: + +``` ++====================================================================+ +| ROW FILE (.row) | ++====================================================================+ +| Data Block 0 (ZSTD compressed) | +| Data Block 1 (ZSTD compressed) | +| ... 
| +| Data Block K (ZSTD compressed) | ++--------------------------------------------------------------------+ +| Block Index (Delta+ZigZag+Varint encoded) | ++--------------------------------------------------------------------+ +| Footer (fixed 32 bytes) | ++====================================================================+ +``` + +## Data Block + +Each data block is independently ZSTD Level 1 compressed. The uncompressed content has the following layout: + +``` ++-----------------------------------------------------------+ +| row_0_bytes | row_1_bytes | ... | row_N_bytes | ++-----------------------------------------------------------+ +| offset[0] (int32 LE) | offset[1] | ... | offset[N] | ++-----------------------------------------------------------+ +| row_count (int32 LE) | ++-----------------------------------------------------------+ +``` + +- **Row data region**: Each row is serialized sequentially using the compact row format (see below). +- **Offset array**: An array of int32 little-endian values, one per row, storing the byte offset of each row within the uncompressed block. +- **Row count**: A single int32 little-endian value at the very end of the block, storing the number of rows in this block. + +A new block is flushed when the estimated uncompressed size reaches the configured block size threshold (default 64 KB, configurable via `file.block-size`). + +### Row Serialization Format + +Each row is serialized as: + +``` ++-----------------------------------------------+ +| null_bitmap | field_0 | field_1 | ... | field_N | ++-----------------------------------------------+ +``` + +**Null bitmap**: `ceil(arity / 8)` bytes. Bit `i` is set (1) if field `i` is null. The bit position is `byte[i/8] & (1 << (i%8))`. Non-null fields are serialized in order; null fields occupy no space beyond the bitmap bit. + +### Primitive Type Encoding + +All multi-byte primitives use **little-endian** byte order. 
+ +| Paimon Type | Encoding | +|---|---| +| BOOLEAN | 1 byte: 0 = false, 1 = true | +| TINYINT | 1 byte signed | +| SMALLINT | 2 bytes int16 LE | +| INT / DATE / TIME | 4 bytes int32 LE | +| BIGINT | 8 bytes int64 LE | +| FLOAT | 4 bytes IEEE 754 LE | +| DOUBLE | 8 bytes IEEE 754 LE | +| CHAR / VARCHAR | varint(length) + UTF-8 bytes | +| BINARY / VARBINARY | varint(length) + raw bytes | +| DECIMAL(P, S) where P <= 18 | 8 bytes int64 LE (unscaled long) | +| DECIMAL(P, S) where P > 18 | varint(length) + unscaled bytes (big-endian two's complement) | +| TIMESTAMP(P) where P <= 3 | 8 bytes int64 LE (epoch millis) | +| TIMESTAMP(P) where P > 3 | 8 bytes int64 LE (epoch millis) + varint(nanoOfMillisecond) | +| VARIANT | varint(len1) + value bytes + varint(len2) + metadata bytes | + +### Varint Encoding + +Variable-length integer encoding (unsigned LEB128): +- Each byte uses 7 bits for data and 1 bit (MSB) as continuation flag. +- If MSB = 1, more bytes follow. If MSB = 0, this is the last byte. +- Maximum 5 bytes for int32 values. + +### Complex Type Encoding + +**ARRAY**: + +``` +varint(size) | null_bitmap[ceil(size/8) bytes] | element_0 | element_1 | ... | element_N +``` + +Null bitmap uses the same bit layout as row nulls. Non-null elements are serialized in order using the element type's encoding. + +**MAP**: + +A map is serialized as two arrays (keys array followed by values array): + +``` +[keys array] [values array] +``` + +Each array follows the ARRAY encoding above (varint size + null bitmap + elements). Both keys and values support null entries. + +**ROW (nested)**: + +Nested rows use the same format as top-level rows: + +``` +null_bitmap[ceil(arity/8) bytes] | field_0 | field_1 | ... | field_N +``` + +## Block Index + +The block index stores metadata for all blocks, enabling binary search to locate the block containing a given row number. 
+ +``` ++--------------------------------------------------------------------+ +| varint(len_0) | encoded_block_compressed_sizes | +| varint(len_1) | encoded_block_uncompressed_sizes | +| varint(len_2) | encoded_block_row_starts | ++--------------------------------------------------------------------+ +``` + +Each of the three arrays is encoded using **Delta + ZigZag + Varint** compression: +1. Compute deltas between consecutive values +2. ZigZag encode each delta (maps signed to unsigned) +3. Varint encode each ZigZag value + +This is highly efficient for monotonically increasing sequences (row starts) and similar-valued sequences (sizes). + +The arrays are: +- **blockCompressedSizes**: Compressed size of each block. Block offsets are derived by prefix sum (first block starts at file position 0). +- **blockUncompressedSizes**: Uncompressed size of each block (needed to allocate decompression buffer) +- **blockRowStarts**: Cumulative row count at the start of each block (for binary search) + +## Footer + +The footer is a fixed 32-byte structure at the end of the file: + +``` ++-----------------------------------------------+ +| totalRowCount | int64 | 8 bytes | LE | +| blockCount | int32 | 4 bytes | LE | +| indexOffset | int64 | 8 bytes | LE | +| indexLength | int32 | 4 bytes | LE | +| version | int8 | 1 byte | | +| reserved | | 3 bytes | | +| magic | int32 | 4 bytes | LE | ++-----------------------------------------------+ +``` + +- **totalRowCount**: Total number of rows in the file. +- **blockCount**: Number of data blocks. +- **indexOffset**: Byte offset in the file where the block index starts. +- **indexLength**: Length in bytes of the block index section. +- **version**: Format version, currently `1`. +- **reserved**: 3 bytes reserved for future use (must be 0). +- **magic**: `0x524F5753` (ASCII "ROWS"), used for format validation. + +## Row Number Lookup Algorithm + +To read a specific row by its global row number: + +1. 
**Read Footer**: Seek to file end - 32 bytes, read the 32-byte footer. Validate magic number. +2. **Read Block Index**: Seek to `indexOffset`, read `indexLength` bytes, decode the three arrays. Compute block offsets by prefix sum of `blockCompressedSizes[]`. +3. **Binary Search**: Search `blockRowStarts[]` to find block `b` where `blockRowStarts[b] <= rowNum < blockRowStarts[b+1]` (for the last block, which has no `blockRowStarts[b+1]` entry, use `totalRowCount` from the footer as the upper bound). +4. **Read Block**: Seek to `blockOffset(b)`, read `blockCompressedSizes[b]` bytes. +5. **Decompress**: ZSTD decompress into a buffer of size `blockUncompressedSizes[b]`. +6. **Locate Row**: Compute `localIdx = rowNum - blockRowStarts[b]`. Read `row_count` from the last 4 bytes of the decompressed block, then read `offsets[localIdx]` from the offset array, which occupies the `4 * row_count` bytes immediately before `row_count`. +7. **Deserialize**: Read the row starting at the computed offset using the row serialization format. + +## Projection + +Column projection is applied after full row deserialization. Since the compact row format serializes fields sequentially without per-field offset metadata, individual fields cannot be skipped during deserialization. After the complete row is deserialized, a projection mapping selects the requested columns. + +## Selection (Deletion Vectors) + +Row selection via `RoaringBitmap32` enables efficient filtering: + +1. For each block, check if the selection bitmap intersects with `[blockRowStart, blockRowEnd)`. +2. If no intersection, skip the entire block (no I/O or decompression). +3. If there is an intersection, decompress the block and only deserialize the selected rows using their local indices. + +## Configuration + +| Option | Default | Description | +|---|---|---| +| `file.block-size` | 64 KB | Uncompressed block size threshold. Larger blocks improve compression ratio but increase read amplification for point lookups. | +| ZSTD Level | 1 | Fixed at level 1 for fast compression with reasonable ratio. 
| diff --git a/docs/sidebars.js b/docs/sidebars.js index 0c959ab96325..634d0830e970 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -49,6 +49,7 @@ const sidebars = { "concepts/spec/manifest", "concepts/spec/datafile", "concepts/spec/fileformat", + "concepts/spec/rowformat", "concepts/spec/tableindex", "concepts/spec/fileindex" ] diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java new file mode 100644 index 000000000000..e169ce239875 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.Timestamp; + +/** A cursor over a byte array that reads primitives in little-endian order. 
*/ +class BlockInput { + + final byte[] data; + int position; + + BlockInput(byte[] data) { + this.data = data; + this.position = 0; + } + + boolean readBoolean() { + return data[position++] != 0; + } + + byte readByte() { + return data[position++]; + } + + short readShort() { + short v = (short) ((data[position] & 0xFF) | ((data[position + 1] & 0xFF) << 8)); + position += 2; + return v; + } + + int readInt() { + int v = + (data[position] & 0xFF) + | ((data[position + 1] & 0xFF) << 8) + | ((data[position + 2] & 0xFF) << 16) + | ((data[position + 3] & 0xFF) << 24); + position += 4; + return v; + } + + long readLong() { + long v = + (data[position] & 0xFFL) + | ((data[position + 1] & 0xFFL) << 8) + | ((data[position + 2] & 0xFFL) << 16) + | ((data[position + 3] & 0xFFL) << 24) + | ((data[position + 4] & 0xFFL) << 32) + | ((data[position + 5] & 0xFFL) << 40) + | ((data[position + 6] & 0xFFL) << 48) + | ((data[position + 7] & 0xFFL) << 56); + position += 8; + return v; + } + + float readFloat() { + return Float.intBitsToFloat(readInt()); + } + + double readDouble() { + return Double.longBitsToDouble(readLong()); + } + + int readVarInt() { + int result = 0; + int shift = 0; + while (true) { + byte b = data[position++]; + result |= (b & 0x7F) << shift; + if ((b & 0x80) == 0) { + return result; + } + shift += 7; + } + } + + BinaryString readString() { + int length = readVarInt(); + BinaryString s = BinaryString.fromBytes(data, position, length); + position += length; + return s; + } + + byte[] readBytes() { + int length = readVarInt(); + byte[] result = new byte[length]; + System.arraycopy(data, position, result, 0, length); + position += length; + return result; + } + + Decimal readDecimal(int precision, int scale) { + return Decimal.isCompact(precision) + ? 
Decimal.fromUnscaledLong(readLong(), precision, scale) + : Decimal.fromUnscaledBytes(readBytes(), precision, scale); + } + + Timestamp readTimestamp(int precision) { + if (Timestamp.isCompact(precision)) { + return Timestamp.fromEpochMillis(readLong()); + } + long millis = readLong(); + int nanos = readVarInt(); + return Timestamp.fromEpochMillis(millis, nanos); + } + + static int readIntLE(byte[] buf, int offset) { + return (buf[offset] & 0xFF) + | ((buf[offset + 1] & 0xFF) << 8) + | ((buf[offset + 2] & 0xFF) << 16) + | ((buf[offset + 3] & 0xFF) << 24); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java new file mode 100644 index 000000000000..3e457a7832ce --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.Timestamp; + +/** A resizable byte buffer with little-endian primitive write operations. 
*/ +class BlockOutput { + + byte[] buffer; + int position; + + BlockOutput(int initialCapacity) { + this.buffer = new byte[initialCapacity]; + this.position = 0; + } + + void writeBoolean(boolean value) { + ensureCapacity(1); + buffer[position++] = (byte) (value ? 1 : 0); + } + + void writeByte(byte value) { + ensureCapacity(1); + buffer[position++] = value; + } + + void writeShort(short value) { + ensureCapacity(2); + buffer[position++] = (byte) (value & 0xFF); + buffer[position++] = (byte) ((value >>> 8) & 0xFF); + } + + void writeInt(int value) { + ensureCapacity(4); + buffer[position++] = (byte) (value & 0xFF); + buffer[position++] = (byte) ((value >>> 8) & 0xFF); + buffer[position++] = (byte) ((value >>> 16) & 0xFF); + buffer[position++] = (byte) ((value >>> 24) & 0xFF); + } + + void writeLong(long value) { + ensureCapacity(8); + buffer[position++] = (byte) (value & 0xFF); + buffer[position++] = (byte) ((value >>> 8) & 0xFF); + buffer[position++] = (byte) ((value >>> 16) & 0xFF); + buffer[position++] = (byte) ((value >>> 24) & 0xFF); + buffer[position++] = (byte) ((value >>> 32) & 0xFF); + buffer[position++] = (byte) ((value >>> 40) & 0xFF); + buffer[position++] = (byte) ((value >>> 48) & 0xFF); + buffer[position++] = (byte) ((value >>> 56) & 0xFF); + } + + void writeFloat(float value) { + writeInt(Float.floatToRawIntBits(value)); + } + + void writeDouble(double value) { + writeLong(Double.doubleToRawLongBits(value)); + } + + void writeVarInt(int value) { + ensureCapacity(5); + while ((value & ~0x7F) != 0) { + buffer[position++] = (byte) ((value & 0x7F) | 0x80); + value >>>= 7; + } + buffer[position++] = (byte) value; + } + + void writeBytes(byte[] value) { + writeVarInt(value.length); + ensureCapacity(value.length); + System.arraycopy(value, 0, buffer, position, value.length); + position += value.length; + } + + void writeDecimal(Decimal value, int precision) { + if (Decimal.isCompact(precision)) { + writeLong(value.toUnscaledLong()); + } else { + 
writeBytes(value.toUnscaledBytes()); + } + } + + void writeTimestamp(Timestamp value, int precision) { + if (Timestamp.isCompact(precision)) { + writeLong(value.getMillisecond()); + } else { + writeLong(value.getMillisecond()); + writeVarInt(value.getNanoOfMillisecond()); + } + } + + void ensureCapacity(int additional) { + int required = position + additional; + if (required > buffer.length) { + int newSize = Math.max(buffer.length * 2, required); + byte[] newBuffer = new byte[newSize]; + System.arraycopy(buffer, 0, newBuffer, 0, position); + buffer = newBuffer; + } + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java new file mode 100644 index 000000000000..5ceff2529f4f --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java @@ -0,0 +1,225 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockDecompressor; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.utils.IOUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * Prefetches and decompresses blocks with IO coalescing and concurrent reading. + * + *

Adjacent blocks are merged into larger IO requests to reduce QPS. Multiple merged ranges are + * prefetched concurrently using separate InputStreams. + */ +class BlockPrefetcher implements Closeable { + + private static final int HOLE_SIZE_LIMIT = 256 * 1024; + private static final int RANGE_SIZE_LIMIT = 2 * 1024 * 1024; + private static final int PREFETCH_COUNT = 4; + + private static final ExecutorService IO_POOL = Executors.newCachedThreadPool(); + + private final InputStreamPool streamPool; + private final RowBlockIndex blockIndex; + private final ZstdBlockDecompressor decompressor; + + private final List mergedRanges; + private final Queue> prefetchQueue; + private int nextRangeToSubmit; + private int currentRangeIdx; + private byte[] currentRangeData; + private int currentBlockInRange; + + BlockPrefetcher(InputStreamPool streamPool, RowBlockIndex blockIndex, int[] blocksToRead) { + this.streamPool = streamPool; + this.blockIndex = blockIndex; + this.decompressor = new ZstdBlockDecompressor(); + this.mergedRanges = coalesceRanges(blocksToRead, blockIndex); + this.prefetchQueue = new ArrayDeque<>(PREFETCH_COUNT); + this.nextRangeToSubmit = 0; + this.currentRangeIdx = -1; + this.currentBlockInRange = 0; + + fillPrefetch(); + } + + byte[] nextBlock() throws IOException { + if (currentRangeIdx < 0 + || currentBlockInRange >= mergedRanges.get(currentRangeIdx).blockIndices.length) { + advanceToNextRange(); + } + if (currentRangeIdx >= mergedRanges.size()) { + return null; + } + + MergedRange range = mergedRanges.get(currentRangeIdx); + int blockIdx = range.blockIndices[currentBlockInRange]; + int offsetInBuf = (int) (blockIndex.blockOffset(blockIdx) - range.offset); + int compressedSize = (int) blockIndex.blockCompressedSize(blockIdx); + int uncompressedSize = (int) blockIndex.blockUncompressedSize(blockIdx); + + byte[] decompressed = new byte[uncompressedSize]; + decompressor.decompress(currentRangeData, offsetInBuf, compressedSize, decompressed, 0); + + 
currentBlockInRange++; + return decompressed; + } + + int currentBlockIdx() { + if (currentRangeIdx < 0 || currentRangeIdx >= mergedRanges.size()) { + return -1; + } + MergedRange range = mergedRanges.get(currentRangeIdx); + return range.blockIndices[currentBlockInRange - 1]; + } + + @Override + public void close() throws IOException { + for (CompletableFuture f : prefetchQueue) { + f.cancel(true); + } + prefetchQueue.clear(); + streamPool.close(); + } + + private void advanceToNextRange() throws IOException { + currentRangeIdx++; + currentBlockInRange = 0; + + if (currentRangeIdx >= mergedRanges.size()) { + currentRangeData = null; + return; + } + + CompletableFuture future = prefetchQueue.poll(); + if (future == null) { + currentRangeData = readRange(mergedRanges.get(currentRangeIdx)); + } else { + try { + currentRangeData = future.get(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted while waiting for prefetch", e); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof IOException) { + throw (IOException) cause; + } + throw new IOException("Prefetch failed", cause); + } + } + fillPrefetch(); + } + + private void fillPrefetch() { + while (prefetchQueue.size() < PREFETCH_COUNT && nextRangeToSubmit < mergedRanges.size()) { + int rangeIdx = nextRangeToSubmit++; + MergedRange range = mergedRanges.get(rangeIdx); + prefetchQueue.add( + CompletableFuture.supplyAsync( + () -> { + try { + return readRange(range); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }, + IO_POOL)); + } + } + + private byte[] readRange(MergedRange range) throws IOException { + byte[] buf = new byte[range.length]; + SeekableInputStream in = streamPool.borrow(); + try { + in.seek(range.offset); + IOUtils.readFully(in, buf); + } finally { + streamPool.returnStream(in); + } + return buf; + } + + // ======================== Range Coalescing ======================== + + static 
List coalesceRanges(int[] blocksToRead, RowBlockIndex blockIndex) { + List result = new ArrayList<>(); + if (blocksToRead.length == 0) { + return result; + } + + int rangeStart = 0; + long rangeOffset = blockIndex.blockOffset(blocksToRead[0]); + long rangeEnd = rangeOffset + blockIndex.blockCompressedSize(blocksToRead[0]); + + for (int i = 1; i < blocksToRead.length; i++) { + int blockIdx = blocksToRead[i]; + long blockOffset = blockIndex.blockOffset(blockIdx); + long blockEnd = blockOffset + blockIndex.blockCompressedSize(blockIdx); + long gap = blockOffset - rangeEnd; + long newLength = blockEnd - rangeOffset; + + if (gap < HOLE_SIZE_LIMIT && newLength <= RANGE_SIZE_LIMIT) { + rangeEnd = blockEnd; + } else { + result.add(buildRange(blocksToRead, rangeStart, i, rangeOffset, rangeEnd)); + rangeStart = i; + rangeOffset = blockOffset; + rangeEnd = blockEnd; + } + } + result.add( + buildRange(blocksToRead, rangeStart, blocksToRead.length, rangeOffset, rangeEnd)); + return result; + } + + private static MergedRange buildRange( + int[] blocksToRead, int from, int to, long rangeOffset, long rangeEnd) { + int[] indices = new int[to - from]; + System.arraycopy(blocksToRead, from, indices, 0, indices.length); + return new MergedRange(rangeOffset, (int) (rangeEnd - rangeOffset), indices); + } + + // ======================== MergedRange ======================== + + static class MergedRange { + final long offset; + final int length; + final int[] blockIndices; + + MergedRange(long offset, int length, int[] blockIndices) { + this.offset = offset; + this.length = length; + this.blockIndices = blockIndices; + } + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java b/paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java new file mode 100644 index 000000000000..7ef811505b5c --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java @@ -0,0 +1,76 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.fs.FileIO;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.SeekableInputStream;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * A lazy pool of {@link SeekableInputStream} instances that opens streams on demand.
+ *
+ * <p>The pool starts with one caller-supplied stream and opens further streams for the same path
+ * only when {@link #borrow()} finds no idle stream, up to {@code maxSize} streams in total. Once
+ * the limit is reached, {@link #borrow()} blocks until a stream is handed back through {@link
+ * #returnStream(SeekableInputStream)}.
+ *
+ * <p>{@link #close()} only closes streams that are idle in the pool at that moment; streams that
+ * are still borrowed must be returned before the pool is closed.
+ */
+class InputStreamPool implements Closeable {
+
+    private final FileIO fileIO;
+    private final Path path;
+    private final int maxSize;
+    /** Number of streams that exist (idle or borrowed), including the initial one. */
+    private final AtomicInteger created;
+    /** Streams that are currently idle and can be handed out immediately. */
+    private final LinkedBlockingQueue<SeekableInputStream> available;
+
+    /**
+     * @param fileIO file system used to open additional streams
+     * @param path file every pooled stream reads from
+     * @param maxSize upper bound on the number of streams, counting {@code initialStream}
+     * @param initialStream an already opened stream that seeds the pool
+     */
+    InputStreamPool(FileIO fileIO, Path path, int maxSize, SeekableInputStream initialStream) {
+        this.fileIO = fileIO;
+        this.path = path;
+        this.maxSize = maxSize;
+        this.created = new AtomicInteger(1);
+        this.available = new LinkedBlockingQueue<>();
+        this.available.add(initialStream);
+    }
+
+    /**
+     * Hands out an idle stream, opens a new one if the pool is below {@code maxSize}, or blocks
+     * until another borrower returns a stream.
+     *
+     * @throws IOException if a new stream cannot be opened or the wait is interrupted
+     */
+    SeekableInputStream borrow() throws IOException {
+        SeekableInputStream idle = available.poll();
+        if (idle != null) {
+            return idle;
+        }
+
+        // Optimistically claim a slot; the claim is rolled back on every path that does not
+        // actually produce a new stream.
+        if (created.getAndIncrement() < maxSize) {
+            try {
+                return fileIO.newInputStream(path);
+            } catch (IOException | RuntimeException e) {
+                // Without this rollback a failed open would permanently shrink the pool.
+                created.decrementAndGet();
+                throw e;
+            }
+        }
+        created.decrementAndGet();
+
+        try {
+            return available.take();
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new IOException("Interrupted while waiting for stream", e);
+        }
+    }
+
+    /** Puts a previously borrowed stream back so that other borrowers can reuse it. */
+    void returnStream(SeekableInputStream in) {
+        available.add(in);
+    }
+
+    /**
+     * Closes every idle stream. All idle streams are attempted even if one of them fails; the
+     * first failure is rethrown with later ones attached as suppressed exceptions.
+     */
+    @Override
+    public void close() throws IOException {
+        IOException failure = null;
+        SeekableInputStream in;
+        while ((in = available.poll()) != null) {
+            try {
+                in.close();
+            } catch (IOException e) {
+                if (failure == null) {
+                    failure = e;
+                } else {
+                    failure.addSuppressed(e);
+                }
+            }
+        }
+        if (failure != null) {
+            throw failure;
+        }
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java
new file mode 100644
index 000000000000..294eac1bcee4
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.utils.DeltaVarintCompressor;
+import org.apache.paimon.utils.IOUtils;
+import org.apache.paimon.utils.VarLengthIntUtils;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Block index that maps row numbers to block locations.
+ *
+ * <p>The serialized form consists of three sections written back to back: compressed sizes,
+ * uncompressed sizes and row starts. Each section is a varint byte length followed by that many
+ * bytes produced by {@link DeltaVarintCompressor}. Block offsets are not stored; they are the
+ * running sum of the compressed sizes, starting at offset 0.
+ */
+class RowBlockIndex {
+
+    private final long[] blockOffsets;
+    private final long[] blockCompressedSizes;
+    private final long[] blockUncompressedSizes;
+    private final long[] blockRowStarts;
+
+    RowBlockIndex(
+            long[] blockCompressedSizes, long[] blockUncompressedSizes, long[] blockRowStarts) {
+        this.blockCompressedSizes = blockCompressedSizes;
+        this.blockUncompressedSizes = blockUncompressedSizes;
+        this.blockRowStarts = blockRowStarts;
+        this.blockOffsets = computeOffsets(blockCompressedSizes);
+    }
+
+    int blockCount() {
+        return blockCompressedSizes.length;
+    }
+
+    long blockOffset(int blockIdx) {
+        return blockOffsets[blockIdx];
+    }
+
+    long blockCompressedSize(int blockIdx) {
+        return blockCompressedSizes[blockIdx];
+    }
+
+    long blockUncompressedSize(int blockIdx) {
+        return blockUncompressedSizes[blockIdx];
+    }
+
+    long blockRowStart(int blockIdx) {
+        return blockRowStarts[blockIdx];
+    }
+
+    /** Appends the three length-prefixed sections to {@code out}. */
+    void writeTo(PositionOutputStream out) throws IOException {
+        writeArray(out, DeltaVarintCompressor.compress(blockCompressedSizes));
+        writeArray(out, DeltaVarintCompressor.compress(blockUncompressedSizes));
+        writeArray(out, DeltaVarintCompressor.compress(blockRowStarts));
+    }
+
+    /**
+     * Reads an index previously produced by {@link #writeTo(PositionOutputStream)}.
+     *
+     * @param in stream positioned anywhere; it is sought to {@code indexOffset}
+     * @param indexOffset file offset of the first index byte
+     * @param indexLength total number of index bytes
+     * @throws IOException if the bytes cannot be read or are not a well-formed index
+     */
+    static RowBlockIndex readFrom(SeekableInputStream in, long indexOffset, int indexLength)
+            throws IOException {
+        if (indexLength < 0) {
+            throw new IOException("Invalid block index length: " + indexLength);
+        }
+        in.seek(indexOffset);
+        byte[] indexData = new byte[indexLength];
+        IOUtils.readFully(in, indexData);
+
+        // Sections must be consumed in the order writeTo emits them.
+        IndexCursor cursor = new IndexCursor(indexData);
+        long[] blockCompressedSizes = cursor.readArray();
+        long[] blockUncompressedSizes = cursor.readArray();
+        long[] blockRowStarts = cursor.readArray();
+
+        return new RowBlockIndex(blockCompressedSizes, blockUncompressedSizes, blockRowStarts);
+    }
+
+    /** Derives each block's start offset as the sum of the compressed sizes before it. */
+    private static long[] computeOffsets(long[] compressedSizes) {
+        long[] offsets = new long[compressedSizes.length];
+        long offset = 0;
+        for (int i = 0; i < compressedSizes.length; i++) {
+            offsets[i] = offset;
+            offset += compressedSizes[i];
+        }
+        return offsets;
+    }
+
+    /** Writes {@code encoded} preceded by its length as a varint. */
+    private static void writeArray(PositionOutputStream out, byte[] encoded) throws IOException {
+        byte[] lenBuf = new byte[VarLengthIntUtils.MAX_VAR_INT_SIZE];
+        int lenBytes = VarLengthIntUtils.encodeInt(lenBuf, 0, encoded.length);
+        out.write(lenBuf, 0, lenBytes);
+        out.write(encoded);
+    }
+
+    /** Sequential, bounds-checked reader over the raw index bytes. */
+    private static final class IndexCursor {
+
+        private final byte[] data;
+        private int pos;
+
+        IndexCursor(byte[] data) {
+            this.data = data;
+        }
+
+        /** Reads one length-prefixed section and decodes it into a long array. */
+        long[] readArray() throws IOException {
+            int length = readVarInt();
+            if (length < 0 || length > data.length - pos) {
+                throw new IOException(
+                        "Corrupt block index: section of "
+                                + length
+                                + " bytes at position "
+                                + pos
+                                + " exceeds index size "
+                                + data.length);
+            }
+            byte[] encoded = Arrays.copyOfRange(data, pos, pos + length);
+            pos += length;
+            return DeltaVarintCompressor.decompress(encoded);
+        }
+
+        /**
+         * Reads a varint of 7-bit groups, least significant group first, where the high bit of
+         * each byte marks continuation. The cursor advances by the bytes actually consumed.
+         */
+        int readVarInt() throws IOException {
+            int result = 0;
+            // An int needs at most five 7-bit groups.
+            for (int shift = 0; shift < 35; shift += 7) {
+                if (pos >= data.length) {
+                    throw new IOException("Corrupt block index: truncated varint");
+                }
+                byte b = data[pos++];
+                result |= (b & 0x7F) << shift;
+                if ((b & 0x80) == 0) {
+                    return result;
+                }
+            }
+            throw new IOException("Corrupt block index: varint is longer than 5 bytes");
+        }
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java
new file mode 100644
index 000000000000..2a87d9ba1e00
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java
@@ -0,0 +1,338 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.data.GenericArray;
+import org.apache.paimon.data.GenericMap;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalArray;
+import org.apache.paimon.data.InternalMap;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.variant.GenericVariant;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.IntType;
+import org.apache.paimon.types.MapType;
+import org.apache.paimon.types.MultisetType;
+import org.apache.paimon.types.RowType;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.apache.paimon.types.DataTypeChecks.getPrecision;
+import static org.apache.paimon.types.DataTypeChecks.getScale;
+
+/**
+ * Reads rows from a decompressed block by local row index.
+ *
+ * <p>The block ends with an int32 (little-endian) row count, preceded by one int32 start offset
+ * per row. Every row, top-level or nested, starts with a null bitmap of {@code (arity + 7) / 8}
+ * bytes in which bit {@code i % 8} of byte {@code i / 8} marks field {@code i} as null; the
+ * non-null field values follow in field order.
+ *
+ * <p>All reads go through the shared {@link BlockInput} cursor, so an instance must not be used
+ * by several threads at once.
+ */
+class RowBlockReader {
+
+    private final BlockInput buf;
+    private final int rowCount;
+    private final int offsetArrayStart;
+    private final FieldReader[] fieldReaders;
+    private final int headerSizeInBytes;
+
+    /**
+     * @param buf decompressed block bytes
+     * @param rowType type the rows were written with
+     * @throws IllegalArgumentException if the block trailer is inconsistent with the block size
+     */
+    RowBlockReader(BlockInput buf, RowType rowType) {
+        this.buf = buf;
+        int len = buf.data.length;
+        if (len < 4) {
+            throw new IllegalArgumentException(
+                    "Block of " + len + " bytes is too small to contain a row count");
+        }
+        int count = BlockInput.readIntLE(buf.data, len - 4);
+        // Computed in long so that a corrupt row count cannot wrap around to a valid offset.
+        long offsetsStart = len - 4L - 4L * count;
+        if (count < 0 || offsetsStart < 0) {
+            throw new IllegalArgumentException(
+                    "Corrupt block: row count " + count + " does not fit in " + len + " bytes");
+        }
+        this.rowCount = count;
+        this.offsetArrayStart = (int) offsetsStart;
+
+        this.fieldReaders = createFieldReaders(rowType);
+        this.headerSizeInBytes = nullBitmapSize(fieldReaders.length);
+    }
+
+    int rowCount() {
+        return rowCount;
+    }
+
+    /**
+     * Decodes the row at {@code localRowIndex}.
+     *
+     * @throws IndexOutOfBoundsException if the index is not within {@code [0, rowCount())}
+     */
+    InternalRow readRow(int localRowIndex) {
+        if (localRowIndex < 0 || localRowIndex >= rowCount) {
+            // Without this check an out-of-range index would decode trailer bytes as an offset.
+            throw new IndexOutOfBoundsException(
+                    "Row index " + localRowIndex + " is out of range [0, " + rowCount + ")");
+        }
+        buf.position = BlockInput.readIntLE(buf.data, offsetArrayStart + localRowIndex * 4);
+        return readRow(headerSizeInBytes, fieldReaders);
+    }
+
+    // ======================== Row Reading ========================
+
+    private InternalRow readRow(int headerSize, FieldReader[] readers) {
+        int headerStart = buf.position;
+        buf.position += headerSize;
+
+        GenericRow row = new GenericRow(readers.length);
+        for (int i = 0; i < readers.length; i++) {
+            row.setField(i, isNull(buf.data, headerStart, i) ? null : readers[i].read());
+        }
+        return row;
+    }
+
+    private static int nullBitmapSize(int fieldCount) {
+        return (fieldCount + 7) / 8;
+    }
+
+    private static boolean isNull(byte[] data, int bitmapStart, int index) {
+        return (data[bitmapStart + index / 8] & (1 << (index % 8))) != 0;
+    }
+
+    // ======================== Field Reader Factory ========================
+
+    private FieldReader[] createFieldReaders(RowType rowType) {
+        FieldReader[] readers = new FieldReader[rowType.getFieldCount()];
+        for (int i = 0; i < readers.length; i++) {
+            readers[i] = createFieldReader(rowType.getTypeAt(i));
+        }
+        return readers;
+    }
+
+    private FieldReader createFieldReader(DataType type) {
+        switch (type.getTypeRoot()) {
+            case CHAR:
+            case VARCHAR:
+                return () -> buf.readString();
+            case BOOLEAN:
+                return () -> buf.readBoolean();
+            case BINARY:
+            case VARBINARY:
+                return () -> buf.readBytes();
+            case DECIMAL:
+                {
+                    int precision = getPrecision(type);
+                    int scale = getScale(type);
+                    return () -> buf.readDecimal(precision, scale);
+                }
+            case TINYINT:
+                return () -> buf.readByte();
+            case SMALLINT:
+                return () -> buf.readShort();
+            case INTEGER:
+            case DATE:
+            case TIME_WITHOUT_TIME_ZONE:
+                return () -> buf.readInt();
+            case BIGINT:
+                return () -> buf.readLong();
+            case FLOAT:
+                return () -> buf.readFloat();
+            case DOUBLE:
+                return () -> buf.readDouble();
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                {
+                    int precision = getPrecision(type);
+                    return () -> buf.readTimestamp(precision);
+                }
+            case VARIANT:
+                return () -> {
+                    // The value bytes are stored before the metadata bytes.
+                    byte[] value = buf.readBytes();
+                    byte[] metadata = buf.readBytes();
+                    return new GenericVariant(value, metadata);
+                };
+            case ARRAY:
+                {
+                    FieldReader elementReader =
+                            createFieldReader(((ArrayType) type).getElementType());
+                    return () -> readArray(elementReader);
+                }
+            case MULTISET:
+                {
+                    // A multiset is read as a map from element to an INT value.
+                    FieldReader elementReader =
+                            createFieldReader(((MultisetType) type).getElementType());
+                    FieldReader countReader = createFieldReader(new IntType());
+                    return () -> readMap(elementReader, countReader);
+                }
+            case MAP:
+                {
+                    MapType mapType = (MapType) type;
+                    FieldReader keyReader = createFieldReader(mapType.getKeyType());
+                    FieldReader valueReader = createFieldReader(mapType.getValueType());
+                    return () -> readMap(keyReader, valueReader);
+                }
+            case ROW:
+                {
+                    FieldReader[] nestedReaders = createFieldReaders((RowType) type);
+                    int nestedHeaderSize = nullBitmapSize(nestedReaders.length);
+                    return () -> readRow(nestedHeaderSize, nestedReaders);
+                }
+            default:
+                throw new UnsupportedOperationException("Unsupported type: " + type.getTypeRoot());
+        }
+    }
+
+    // ======================== Complex Types ========================
+
+    /** Reads a varint element count, a null bitmap and then the non-null elements. */
+    private Object[] readElements(FieldReader elementReader) {
+        int size = buf.readVarInt();
+        int nullStart = buf.position;
+        buf.position += nullBitmapSize(size);
+
+        Object[] elements = new Object[size];
+        for (int i = 0; i < size; i++) {
+            elements[i] = isNull(buf.data, nullStart, i) ? null : elementReader.read();
+        }
+        return elements;
+    }
+
+    private InternalArray readArray(FieldReader elementReader) {
+        return new GenericArray(readElements(elementReader));
+    }
+
+    /** Reads the key array followed by the value array and pairs them up by position. */
+    private InternalMap readMap(FieldReader keyReader, FieldReader valueReader) {
+        Object[] keys = readElements(keyReader);
+        Object[] values = readElements(valueReader);
+        Map<Object, Object> map = new HashMap<>(keys.length);
+        for (int i = 0; i < keys.length; i++) {
+            map.put(keys[i], values[i]);
+        }
+        return new GenericMap(map);
+    }
+
+    // ======================== Interface ========================
+
+    /** Decodes one non-null value at the current position of the block cursor. */
+    interface FieldReader {
+        Object read();
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java
new file mode 100644
index 000000000000..a88c1f05994d
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.data.DataGetters;
+import org.apache.paimon.data.InternalArray;
+import org.apache.paimon.data.InternalMap;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.variant.Variant;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.IntType;
+import org.apache.paimon.types.MapType;
+import org.apache.paimon.types.MultisetType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.utils.IntArrayList;
+
+import java.util.Arrays;
+
+import static org.apache.paimon.types.DataTypeChecks.getPrecision;
+import static org.apache.paimon.types.DataTypeChecks.getScale;
+
+/**
+ * Accumulates rows by serializing directly into a block buffer.
+ *
+ * <p>Block layout (uncompressed):
+ *
+ * <pre>
+ * [row_0 bytes][row_1 bytes]...[row_N bytes]
+ * [offset_0 (int32 LE)][offset_1]...[offset_N]
+ * [row_count (int32 LE)]
+ * </pre>
+ *
+ * <p>Every row, top-level or nested, starts with a null bitmap of {@code (arity + 7) / 8} bytes
+ * in which bit {@code i % 8} of byte {@code i / 8} marks field {@code i} as null; only non-null
+ * fields are written after it.
+ */
+class RowBlockWriter {
+
+    private final BlockOutput buf;
+    /** Start position of every row written since the last {@link #reset()}. */
+    private final IntArrayList offsets;
+    private final FieldWriter[] fieldWriters;
+    private final int headerSizeInBytes;
+
+    RowBlockWriter(BlockOutput buf, RowType rowType) {
+        this.buf = buf;
+        this.offsets = new IntArrayList(64);
+        this.fieldWriters = createFieldWriters(rowType);
+        this.headerSizeInBytes = nullBitmapSize(fieldWriters.length);
+    }
+
+    /** Appends one row to the block and records where it starts. */
+    void writeRow(InternalRow row) {
+        offsets.add(buf.position);
+        writeRow(row, headerSizeInBytes, fieldWriters);
+    }
+
+    private void writeRow(InternalRow row, int headerSize, FieldWriter[] writers) {
+        int headerStart = reserveNullBitmap(headerSize);
+        for (int i = 0; i < writers.length; i++) {
+            if (row.isNullAt(i)) {
+                markNull(headerStart, i);
+            } else {
+                writers[i].write(row, i);
+            }
+        }
+    }
+
+    int rowCount() {
+        return offsets.size();
+    }
+
+    /** Size the block would have if {@link #finish()} were called now. */
+    int estimatedSize() {
+        return buf.position + offsets.size() * 4 + 4;
+    }
+
+    /**
+     * Appends the offset array and the row count, then returns a copy of the complete block.
+     * Call {@link #reset()} before writing the next block.
+     */
+    byte[] finish() {
+        int rows = offsets.size();
+        buf.ensureCapacity(rows * 4 + 4);
+        for (int i = 0; i < rows; i++) {
+            RowFileFooter.writeIntLE(buf.buffer, buf.position, offsets.get(i));
+            buf.position += 4;
+        }
+        RowFileFooter.writeIntLE(buf.buffer, buf.position, rows);
+        buf.position += 4;
+
+        // buf.position now equals row bytes + offset array + row count.
+        return Arrays.copyOf(buf.buffer, buf.position);
+    }
+
+    void reset() {
+        offsets.clear();
+        buf.position = 0;
+    }
+
+    // ======================== Null Bitmap Helpers ========================
+
+    private static int nullBitmapSize(int fieldCount) {
+        return (fieldCount + 7) / 8;
+    }
+
+    /** Writes {@code sizeInBytes} zero bytes at the current position and returns their start. */
+    private int reserveNullBitmap(int sizeInBytes) {
+        buf.ensureCapacity(sizeInBytes);
+        int start = buf.position;
+        // The buffer is reused across blocks, so the bitmap must be cleared explicitly.
+        Arrays.fill(buf.buffer, start, start + sizeInBytes, (byte) 0);
+        buf.position = start + sizeInBytes;
+        return start;
+    }
+
+    private void markNull(int bitmapStart, int index) {
+        // buf.buffer is re-read on every call because nested writes may have grown the buffer.
+        buf.buffer[bitmapStart + index / 8] |= (byte) (1 << (index % 8));
+    }
+
+    // ======================== Factory ========================
+
+    private FieldWriter[] createFieldWriters(RowType rowType) {
+        FieldWriter[] writers = new FieldWriter[rowType.getFieldCount()];
+        for (int i = 0; i < writers.length; i++) {
+            writers[i] = createFieldWriter(rowType.getTypeAt(i));
+        }
+        return writers;
+    }
+
+    private FieldWriter createFieldWriter(DataType type) {
+        switch (type.getTypeRoot()) {
+            case CHAR:
+            case VARCHAR:
+                return (data, i) -> buf.writeBytes(data.getString(i).toBytes());
+            case BOOLEAN:
+                return (data, i) -> buf.writeBoolean(data.getBoolean(i));
+            case BINARY:
+            case VARBINARY:
+                return (data, i) -> buf.writeBytes(data.getBinary(i));
+            case DECIMAL:
+                {
+                    int precision = getPrecision(type);
+                    int scale = getScale(type);
+                    return (data, i) ->
+                            buf.writeDecimal(data.getDecimal(i, precision, scale), precision);
+                }
+            case TINYINT:
+                return (data, i) -> buf.writeByte(data.getByte(i));
+            case SMALLINT:
+                return (data, i) -> buf.writeShort(data.getShort(i));
+            case INTEGER:
+            case DATE:
+            case TIME_WITHOUT_TIME_ZONE:
+                return (data, i) -> buf.writeInt(data.getInt(i));
+            case BIGINT:
+                return (data, i) -> buf.writeLong(data.getLong(i));
+            case FLOAT:
+                return (data, i) -> buf.writeFloat(data.getFloat(i));
+            case DOUBLE:
+                return (data, i) -> buf.writeDouble(data.getDouble(i));
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                {
+                    int precision = getPrecision(type);
+                    return (data, i) ->
+                            buf.writeTimestamp(data.getTimestamp(i, precision), precision);
+                }
+            case VARIANT:
+                return (data, i) -> {
+                    // The value bytes are stored before the metadata bytes.
+                    Variant v = data.getVariant(i);
+                    buf.writeBytes(v.value());
+                    buf.writeBytes(v.metadata());
+                };
+            case ARRAY:
+                {
+                    FieldWriter elementWriter =
+                            createFieldWriter(((ArrayType) type).getElementType());
+                    return (data, i) -> writeArray(data.getArray(i), elementWriter);
+                }
+            case MULTISET:
+                {
+                    // A multiset is written as a map from element to an INT value.
+                    FieldWriter elementWriter =
+                            createFieldWriter(((MultisetType) type).getElementType());
+                    FieldWriter countWriter = createFieldWriter(new IntType());
+                    return (data, i) -> writeMap(data.getMap(i), elementWriter, countWriter);
+                }
+            case MAP:
+                {
+                    MapType mapType = (MapType) type;
+                    FieldWriter keyWriter = createFieldWriter(mapType.getKeyType());
+                    FieldWriter valueWriter = createFieldWriter(mapType.getValueType());
+                    return (data, i) -> writeMap(data.getMap(i), keyWriter, valueWriter);
+                }
+            case ROW:
+                {
+                    FieldWriter[] nestedWriters = createFieldWriters((RowType) type);
+                    int numFields = nestedWriters.length;
+                    int nestedHeaderSize = nullBitmapSize(numFields);
+                    return (data, i) ->
+                            writeRow(data.getRow(i, numFields), nestedHeaderSize, nestedWriters);
+                }
+            default:
+                throw new UnsupportedOperationException("Unsupported type: " + type.getTypeRoot());
+        }
+    }
+
+    // ======================== Complex Type Helpers ========================
+
+    /** Writes a varint element count, a null bitmap and then the non-null elements. */
+    private void writeArray(InternalArray array, FieldWriter elementWriter) {
+        int size = array.size();
+        buf.writeVarInt(size);
+        int nullStart = reserveNullBitmap(nullBitmapSize(size));
+        for (int i = 0; i < size; i++) {
+            if (array.isNullAt(i)) {
+                markNull(nullStart, i);
+            } else {
+                elementWriter.write(array, i);
+            }
+        }
+    }
+
+    /** Writes the key array followed by the value array. */
+    private void writeMap(InternalMap map, FieldWriter keyWriter, FieldWriter valueWriter) {
+        writeArray(map.keyArray(), keyWriter);
+        writeArray(map.valueArray(), valueWriter);
+    }
+
+    // ======================== Interface ========================
+
+    /** Serializes the non-null value at position {@code i} of a row or array. */
+    interface FieldWriter {
+        void write(DataGetters data, int i);
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java
new file mode 100644
index 000000000000..adc3edd0ce5f
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.SeekableInputStream;
+
+import java.io.IOException;
+
+/** Fixed 32-byte footer at the end of a row file.
*/
+class RowFileFooter {
+
+    // Footer layout, all multi-byte values little-endian:
+    //   bytes  0-7   totalRowCount (int64)
+    //   bytes  8-11  blockCount    (int32)
+    //   bytes 12-19  indexOffset   (int64)
+    //   bytes 20-23  indexLength   (int32)
+    //   byte  24     VERSION
+    //   bytes 25-27  reserved (zeros)
+    //   bytes 28-31  MAGIC         (int32)
+
+    static final int FOOTER_SIZE = 32;
+    // "ROWS" when read as a big-endian int; written little-endian, so the file ends in "SWOR".
+    static final int MAGIC = 0x524F5753;
+    static final byte VERSION = 1;
+
+    private static final int TOTAL_ROW_COUNT_POS = 0;
+    private static final int BLOCK_COUNT_POS = 8;
+    private static final int INDEX_OFFSET_POS = 12;
+    private static final int INDEX_LENGTH_POS = 20;
+    private static final int VERSION_POS = 24;
+    private static final int MAGIC_POS = 28;
+
+    final long totalRowCount;
+    final int blockCount;
+    final long indexOffset;
+    final int indexLength;
+
+    RowFileFooter(long totalRowCount, int blockCount, long indexOffset, int indexLength) {
+        this.totalRowCount = totalRowCount;
+        this.blockCount = blockCount;
+        this.indexOffset = indexOffset;
+        this.indexLength = indexLength;
+    }
+
+    /** Serializes this footer into exactly {@link #FOOTER_SIZE} bytes. */
+    void writeTo(PositionOutputStream out) throws IOException {
+        byte[] buf = new byte[FOOTER_SIZE];
+        writeLongLE(buf, TOTAL_ROW_COUNT_POS, totalRowCount);
+        writeIntLE(buf, BLOCK_COUNT_POS, blockCount);
+        writeLongLE(buf, INDEX_OFFSET_POS, indexOffset);
+        writeIntLE(buf, INDEX_LENGTH_POS, indexLength);
+        buf[VERSION_POS] = VERSION;
+        // bytes 25-27 reserved (zeros)
+        writeIntLE(buf, MAGIC_POS, MAGIC);
+        out.write(buf);
+    }
+
+    /**
+     * Reads and validates the footer stored in the last {@link #FOOTER_SIZE} bytes of the file.
+     *
+     * @param in stream over the file; it is sought to the footer position
+     * @param fileSize total size of the file in bytes
+     * @throws IOException if the file is too short, or the magic, version or field values are
+     *     invalid
+     */
+    static RowFileFooter readFrom(SeekableInputStream in, long fileSize) throws IOException {
+        if (fileSize < FOOTER_SIZE) {
+            // Fail with a clear message instead of seeking to a negative position.
+            throw new IOException(
+                    "File of "
+                            + fileSize
+                            + " bytes is too small to contain a "
+                            + FOOTER_SIZE
+                            + "-byte row file footer");
+        }
+        in.seek(fileSize - FOOTER_SIZE);
+        byte[] buf = new byte[FOOTER_SIZE];
+        readFully(in, buf);
+
+        // The magic is checked first so that arbitrary files are rejected with the right error.
+        int magic = readIntLE(buf, MAGIC_POS);
+        if (magic != MAGIC) {
+            throw new IOException(
+                    String.format(
+                            "Invalid row file magic: expected 0x%08X, got 0x%08X", MAGIC, magic));
+        }
+
+        byte version = buf[VERSION_POS];
+        if (version != VERSION) {
+            throw new IOException("Unsupported row file version: " + version);
+        }
+
+        long totalRowCount = readLongLE(buf, TOTAL_ROW_COUNT_POS);
+        int blockCount = readIntLE(buf, BLOCK_COUNT_POS);
+        long indexOffset = readLongLE(buf, INDEX_OFFSET_POS);
+        int indexLength = readIntLE(buf, INDEX_LENGTH_POS);
+        if (totalRowCount < 0 || blockCount < 0 || indexOffset < 0 || indexLength < 0) {
+            throw new IOException(
+                    "Corrupt row file footer: totalRowCount="
+                            + totalRowCount
+                            + ", blockCount="
+                            + blockCount
+                            + ", indexOffset="
+                            + indexOffset
+                            + ", indexLength="
+                            + indexLength);
+        }
+
+        return new RowFileFooter(totalRowCount, blockCount, indexOffset, indexLength);
+    }
+
+    /** Fills {@code buf} completely, failing if the stream ends first. */
+    private static void readFully(SeekableInputStream in, byte[] buf) throws IOException {
+        int filled = 0;
+        while (filled < buf.length) {
+            int read = in.read(buf, filled, buf.length - filled);
+            if (read < 0) {
+                throw new IOException("Unexpected end of file");
+            }
+            filled += read;
+        }
+    }
+
+    /** Stores {@code value} at {@code offset} as 4 bytes, least significant byte first. */
+    static void writeIntLE(byte[] buf, int offset, int value) {
+        for (int i = 0; i < 4; i++) {
+            buf[offset + i] = (byte) (value >>> (8 * i));
+        }
+    }
+
+    /** Reads 4 bytes at {@code offset}, least significant byte first. */
+    static int readIntLE(byte[] buf, int offset) {
+        int value = 0;
+        for (int i = 0; i < 4; i++) {
+            value |= (buf[offset + i] & 0xFF) << (8 * i);
+        }
+        return value;
+    }
+
+    /** Stores {@code value} at {@code offset} as 8 bytes, least significant byte first. */
+    static void writeLongLE(byte[] buf, int offset, long value) {
+        for (int i = 0; i < 8; i++) {
+            buf[offset + i] = (byte) (value >>> (8 * i));
+        }
+    }
+
+    /** Reads 8 bytes at {@code offset}, least significant byte first. */
+    static long readLongLE(byte[] buf, int offset) {
+        long value = 0;
+        for (int i = 0; i < 8; i++) {
+            value |= (buf[offset + i] & 0xFFL) << (8 * i);
+        }
+        return value;
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java
new file mode 100644
index 000000000000..38518804c0da
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.format.FileFormat;
+import org.apache.paimon.format.FileFormatFactory.FormatContext;
+import org.apache.paimon.format.FormatReaderFactory;
+import org.apache.paimon.format.FormatWriterFactory;
+import org.apache.paimon.options.MemorySize;
+import org.apache.paimon.predicate.Predicate;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.RowType;
+
+import javax.annotation.Nullable;
+
+import java.util.List;
+
+/** Row-store file format with block-level ZSTD compression and O(1) row-number lookup. */
+public class RowFileFormat extends FileFormat {
+
+    private static final int DEFAULT_BLOCK_SIZE = 65536;
+
+    private final int blockSize;
+    private final int zstdLevel;
+
+    /**
+     * @param formatContext supplies the ZSTD level and the optional block size; when no block
+     *     size is configured, {@code DEFAULT_BLOCK_SIZE} bytes is used
+     * @throws IllegalArgumentException if the configured block size does not fit in an int
+     */
+    public RowFileFormat(FormatContext formatContext) {
+        super(RowFileFormatFactory.IDENTIFIER);
+        this.zstdLevel = formatContext.zstdLevel();
+        this.blockSize = resolveBlockSize(formatContext.blockSize());
+    }
+
+    /** Creates a reader factory for the projected fields; {@code filters} are not used. */
+    @Override
+    public FormatReaderFactory createReaderFactory(
+            RowType dataSchemaRowType,
+            RowType projectedRowType,
+            @Nullable List<Predicate> filters) {
+        int[] projectionMapping = computeProjectionMapping(dataSchemaRowType, projectedRowType);
+        return new RowFormatReaderFactory(dataSchemaRowType, projectionMapping);
+    }
+
+    @Override
+    public FormatWriterFactory createWriterFactory(RowType type) {
+        return new RowFormatWriterFactory(type, blockSize, zstdLevel);
+    }
+
+    @Override
+    public void validateDataFields(RowType rowType) {
+        // No upfront validation: a type without a field writer/reader is rejected with an
+        // UnsupportedOperationException when RowBlockWriter / RowBlockReader is created.
+    }
+
+    private static int resolveBlockSize(@Nullable MemorySize configured) {
+        if (configured == null) {
+            return DEFAULT_BLOCK_SIZE;
+        }
+        long bytes = configured.getBytes();
+        if (bytes > Integer.MAX_VALUE) {
+            // A plain (int) cast would silently wrap to a meaningless, possibly negative size.
+            throw new IllegalArgumentException(
+                    "Block size of "
+                            + bytes
+                            + " bytes exceeds the maximum of "
+                            + Integer.MAX_VALUE
+                            + " bytes");
+        }
+        return (int) bytes;
+    }
+
+    /**
+     * Maps every projected field to the position of the data-schema field with the same id.
+     *
+     * @return {@code null} when both row types are equal (no projection needed), otherwise an
+     *     array with one data-schema position per projected field, in projection order
+     * @throws IllegalArgumentException if a projected field id is absent from the data schema
+     */
+    @Nullable
+    private static int[] computeProjectionMapping(
+            RowType dataSchemaRowType, RowType projectedRowType) {
+        if (dataSchemaRowType.equals(projectedRowType)) {
+            return null;
+        }
+
+        List<DataField> dataFields = dataSchemaRowType.getFields();
+        List<DataField> projectedFields = projectedRowType.getFields();
+
+        int[] mapping = new int[projectedFields.size()];
+        for (int p = 0; p < mapping.length; p++) {
+            int fieldId = projectedFields.get(p).id();
+            int position = indexOfFieldId(dataFields, fieldId);
+            if (position < 0) {
+                // Skipping the field would shift every later column to the wrong position.
+                throw new IllegalArgumentException(
+                        "Projected field with id "
+                                + fieldId
+                                + " does not exist in data schema "
+                                + dataSchemaRowType);
+            }
+            mapping[p] = position;
+        }
+        return mapping;
+    }
+
+    /** Returns the position of the first field with the given id, or -1 if there is none. */
+    private static int indexOfFieldId(List<DataField> fields, int fieldId) {
+        for (int i = 0; i < fields.size(); i++) {
+            if (fields.get(i).id() == fieldId) {
+                return i;
+            }
+        }
+        return -1;
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java
new file mode 100644
index 000000000000..a9f3364d313b
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FileFormatFactory; + +/** Factory for the row-store file format. */ +public class RowFileFormatFactory implements FileFormatFactory { + + public static final String IDENTIFIER = "row"; + + @Override + public String identifier() { + return IDENTIFIER; + } + + @Override + public FileFormat create(FormatContext formatContext) { + return new RowFileFormat(formatContext); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java new file mode 100644 index 000000000000..650d61ac43b9 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.Path; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.utils.ProjectedRow; + +import javax.annotation.Nullable; + +import java.io.IOException; + +/** Iterator over rows within a single decompressed block. */ +class RowFileRecordIterator implements FileRecordIterator { + + private final Path filePath; + private final RowBlockReader blockReader; + @Nullable private final ProjectedRow projectedRow; + private final long blockStartRow; + @Nullable private final int[] selectedLocalIndices; + + private int cursor; + private long currentPosition; + + RowFileRecordIterator( + Path filePath, + RowBlockReader blockReader, + @Nullable int[] projectionMapping, + long blockStartRow) { + this(filePath, blockReader, projectionMapping, blockStartRow, null); + } + + RowFileRecordIterator( + Path filePath, + RowBlockReader blockReader, + @Nullable int[] projectionMapping, + long blockStartRow, + @Nullable int[] selectedLocalIndices) { + this.filePath = filePath; + this.blockReader = blockReader; + this.projectedRow = projectionMapping != null ? 
ProjectedRow.from(projectionMapping) : null; + this.blockStartRow = blockStartRow; + this.selectedLocalIndices = selectedLocalIndices; + this.cursor = 0; + this.currentPosition = blockStartRow; + } + + @Override + public long returnedPosition() { + return currentPosition; + } + + @Override + public Path filePath() { + return filePath; + } + + @Nullable + @Override + public InternalRow next() throws IOException { + if (selectedLocalIndices != null) { + return nextSelected(selectedLocalIndices); + } else { + return nextSequential(); + } + } + + @Nullable + private InternalRow nextSequential() { + if (cursor >= blockReader.rowCount()) { + return null; + } + currentPosition = blockStartRow + cursor; + InternalRow row = blockReader.readRow(cursor); + cursor++; + return applyProjection(row); + } + + @Nullable + private InternalRow nextSelected(int[] selectedLocalIndices) { + if (cursor >= selectedLocalIndices.length) { + return null; + } + int localIdx = selectedLocalIndices[cursor]; + currentPosition = blockStartRow + localIdx; + InternalRow row = blockReader.readRow(localIdx); + cursor++; + return applyProjection(row); + } + + private InternalRow applyProjection(InternalRow row) { + if (projectedRow != null) { + projectedRow.replaceRow(row); + return projectedRow; + } + return row; + } + + @Override + public void releaseBatch() {} +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java new file mode 100644 index 000000000000..0a1904f8e4cf --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.Path; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.RoaringBitmap32; + +import javax.annotation.Nullable; + +import java.io.IOException; + +/** Reader for row-store format files. Reads block by block and returns row iterators. 
*/ +public class RowFormatReader implements FileRecordReader { + + private final Path filePath; + private final RowFileFooter footer; + private final RowBlockIndex blockIndex; + private final RowType rowType; + @Nullable private final int[] projectionMapping; + @Nullable private final RoaringBitmap32 selection; + private final BlockPrefetcher prefetcher; + + RowFormatReader( + InputStreamPool streamPool, + Path filePath, + RowFileFooter footer, + RowBlockIndex blockIndex, + RowType rowType, + @Nullable int[] projectionMapping, + @Nullable RoaringBitmap32 selection) { + this.filePath = filePath; + this.footer = footer; + this.blockIndex = blockIndex; + this.rowType = rowType; + this.projectionMapping = projectionMapping; + this.selection = selection; + this.prefetcher = + new BlockPrefetcher( + streamPool, + blockIndex, + computeBlocksToRead(blockIndex, footer.totalRowCount, selection)); + } + + @Nullable + @Override + public FileRecordIterator readBatch() throws IOException { + byte[] decompressed = prefetcher.nextBlock(); + if (decompressed == null) { + return null; + } + + int blockIdx = prefetcher.currentBlockIdx(); + long blockStartRow = blockIndex.blockRowStart(blockIdx); + RowBlockReader blockReader = new RowBlockReader(new BlockInput(decompressed), rowType); + + if (selection != null) { + long blockEndRow = blockEndRow(blockIdx); + int[] localIndices = computeSelectedLocalIndices(selection, blockStartRow, blockEndRow); + return new RowFileRecordIterator( + filePath, blockReader, projectionMapping, blockStartRow, localIndices); + } else { + return new RowFileRecordIterator( + filePath, blockReader, projectionMapping, blockStartRow); + } + } + + @Override + public void close() throws IOException { + prefetcher.close(); + } + + private long blockEndRow(int blockIdx) { + if (blockIdx + 1 < blockIndex.blockCount()) { + return blockIndex.blockRowStart(blockIdx + 1); + } + return footer.totalRowCount; + } + + private static int[] computeBlocksToRead( + 
RowBlockIndex blockIndex, long totalRowCount, @Nullable RoaringBitmap32 selection) { + int blockCount = blockIndex.blockCount(); + if (selection == null) { + int[] all = new int[blockCount]; + for (int i = 0; i < blockCount; i++) { + all[i] = i; + } + return all; + } + + int[] blocks = new int[blockCount]; + int count = 0; + for (int i = 0; i < blockCount; i++) { + long blockStart = blockIndex.blockRowStart(i); + long blockEnd = (i + 1 < blockCount) ? blockIndex.blockRowStart(i + 1) : totalRowCount; + if (selection.intersects(blockStart, blockEnd)) { + blocks[count++] = i; + } + } + + int[] result = new int[count]; + System.arraycopy(blocks, 0, result, 0, count); + return result; + } + + private static int[] computeSelectedLocalIndices( + RoaringBitmap32 selection, long blockStartRow, long blockEndRow) { + int capacity = (int) (blockEndRow - blockStartRow); + int[] indices = new int[capacity]; + int count = 0; + + long current = selection.nextValue((int) blockStartRow); + while (current >= 0 && current < blockEndRow) { + indices[count++] = (int) (current - blockStartRow); + if (current == Integer.MAX_VALUE) { + break; + } + current = selection.nextValue((int) current + 1); + } + + if (count == capacity) { + return indices; + } + int[] result = new int[count]; + System.arraycopy(indices, 0, result, 0, count); + return result; + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java new file mode 100644 index 000000000000..08ec2ee9702d --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.RowType; + +import javax.annotation.Nullable; + +import java.io.IOException; + +/** Factory for creating {@link RowFormatReader}. 
*/ +public class RowFormatReaderFactory implements FormatReaderFactory { + + private final RowType rowType; + @Nullable private final int[] projectionMapping; + + public RowFormatReaderFactory(RowType rowType, @Nullable int[] projectionMapping) { + this.rowType = rowType; + this.projectionMapping = projectionMapping; + } + + @Override + public FileRecordReader createReader(Context context) throws IOException { + FileIO fileIO = context.fileIO(); + Path path = context.filePath(); + + SeekableInputStream in = fileIO.newInputStream(path); + RowFileFooter footer = RowFileFooter.readFrom(in, context.fileSize()); + RowBlockIndex blockIndex = + RowBlockIndex.readFrom(in, footer.indexOffset, footer.indexLength); + + InputStreamPool streamPool = new InputStreamPool(fileIO, path, 4, in); + + return new RowFormatReader( + streamPool, + path, + footer, + blockIndex, + rowType, + projectionMapping, + context.selection()); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java new file mode 100644 index 000000000000..ff9ee1112bd6 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockCompressor; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.LongArrayList; + +import java.io.IOException; + +/** Writer that produces row-store format files with block-level ZSTD compression. */ +public class RowFormatWriter implements FormatWriter { + + private final PositionOutputStream out; + private final ZstdBlockCompressor compressor; + private final int blockSizeThreshold; + private final RowBlockWriter blockWriter; + + private final LongArrayList blockCompressedSizes; + private final LongArrayList blockUncompressedSizes; + private final LongArrayList blockRowStarts; + + private long totalRowCount; + + public RowFormatWriter( + PositionOutputStream out, RowType rowType, int blockSize, int zstdLevel) { + this.out = out; + this.compressor = new ZstdBlockCompressor(zstdLevel); + this.blockSizeThreshold = blockSize; + this.blockWriter = new RowBlockWriter(new BlockOutput(blockSize), rowType); + this.blockCompressedSizes = new LongArrayList(128); + this.blockUncompressedSizes = new LongArrayList(128); + this.blockRowStarts = new LongArrayList(128); + } + + @Override + public void addElement(InternalRow element) throws IOException { + blockWriter.writeRow(element); + totalRowCount++; + + if (blockWriter.estimatedSize() >= blockSizeThreshold) { + flushBlock(); + } + } + + @Override + public boolean reachTargetSize(boolean suggestedCheck, long targetSize) throws IOException { + if (!suggestedCheck) { + return false; + } + return out.getPos() >= targetSize; + } + + @Override + public void close() throws IOException { + flushBlock(); + + long indexOffset = out.getPos(); + RowBlockIndex index = + new 
RowBlockIndex( + blockCompressedSizes.toArray(), + blockUncompressedSizes.toArray(), + blockRowStarts.toArray()); + index.writeTo(out); + int indexLength = (int) (out.getPos() - indexOffset); + + RowFileFooter footer = + new RowFileFooter( + totalRowCount, blockCompressedSizes.size(), indexOffset, indexLength); + footer.writeTo(out); + + out.flush(); + out.close(); + } + + private void flushBlock() throws IOException { + if (blockWriter.rowCount() == 0) { + return; + } + + blockRowStarts.add(totalRowCount - blockWriter.rowCount()); + + byte[] uncompressed = blockWriter.finish(); + blockUncompressedSizes.add(uncompressed.length); + + int maxCompressedSize = compressor.getMaxCompressedSize(uncompressed.length); + byte[] compressed = new byte[maxCompressedSize]; + int compressedLen = + compressor.compress(uncompressed, 0, uncompressed.length, compressed, 0); + + out.write(compressed, 0, compressedLen); + blockCompressedSizes.add(compressedLen); + + blockWriter.reset(); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java new file mode 100644 index 000000000000..e208d7102d99 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; + +import java.io.IOException; + +/** Factory for creating {@link RowFormatWriter}. */ +public class RowFormatWriterFactory implements FormatWriterFactory { + + private final RowType rowType; + private final int blockSize; + private final int zstdLevel; + + public RowFormatWriterFactory(RowType rowType, int blockSize, int zstdLevel) { + this.rowType = rowType; + this.blockSize = blockSize; + this.zstdLevel = zstdLevel; + } + + @Override + public FormatWriter create(PositionOutputStream out, String compression) throws IOException { + return new RowFormatWriter(out, rowType, blockSize, zstdLevel); + } +} diff --git a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory index 80cfe4b946b8..f34a5af57e48 100644 --- a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory +++ b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory @@ -20,3 +20,4 @@ org.apache.paimon.format.csv.CsvFileFormatFactory org.apache.paimon.format.text.TextFileFormatFactory org.apache.paimon.format.json.JsonFileFormatFactory org.apache.paimon.format.blob.BlobFileFormatFactory +org.apache.paimon.format.row.RowFileFormatFactory diff 
--git a/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java b/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java new file mode 100644 index 000000000000..007187c5b54c --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java @@ -0,0 +1,577 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FormatReaderContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.VarCharType; +import org.apache.paimon.utils.RoaringBitmap32; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link BlockPrefetcher}. 
*/ +public class BlockPrefetcherTest { + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testCoalesceAdjacentBlocks() { + long[] compressedSizes = {100, 100, 100, 100, 100}; + long[] uncompressedSizes = {200, 200, 200, 200, 200}; + long[] rowStarts = {0, 10, 20, 30, 40}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0, 1, 2, 3, 4}; + List ranges = + BlockPrefetcher.coalesceRanges(blocksToRead, index); + + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).offset).isEqualTo(0); + assertThat(ranges.get(0).length).isEqualTo(500); + assertThat(ranges.get(0).blockIndices).containsExactly(0, 1, 2, 3, 4); + } + + @Test + public void testCoalesceWithLargeGap() { + long[] compressedSizes = {100, 100, 100}; + long[] uncompressedSizes = {200, 200, 200}; + long[] rowStarts = {0, 10, 20}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0, 2}; + List ranges = + BlockPrefetcher.coalesceRanges(blocksToRead, index); + + // gap between block 0 end (100) and block 2 start (200) is 100, within HOLE_SIZE_LIMIT + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).blockIndices).containsExactly(0, 2); + } + + @Test + public void testCoalesceSplitsByHoleSize() { + // Create blocks with large gaps (> 256KB) + int numBlocks = 3; + long[] compressedSizes = new long[numBlocks]; + long[] uncompressedSizes = new long[numBlocks]; + long[] rowStarts = new long[numBlocks]; + Arrays.fill(compressedSizes, 1024); + Arrays.fill(uncompressedSizes, 2048); + + // Block offsets: 0, 300*1024 (gap=299KB > 256KB), 600*1024 + compressedSizes[0] = 1024; + compressedSizes[1] = 1024; + compressedSizes[2] = 1024; + + // Override to create large gaps: block 0 at offset 0 size 1024, + // block 1 at offset 300*1024 (but offsets are derived from prefix sum) + // So we need block 0 compressedSize = 300*1024 to put block 1 at that offset + 
compressedSizes[0] = 300 * 1024; + compressedSizes[1] = 300 * 1024; + compressedSizes[2] = 1024; + rowStarts[0] = 0; + rowStarts[1] = 100; + rowStarts[2] = 200; + + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + // blocksToRead = [0, 2]: gap between block 0 end and block 2 start + // block 0 end = 300*1024, block 2 start = 600*1024, gap = 300*1024 > 256KB + int[] blocksToRead = {0, 2}; + List ranges = + BlockPrefetcher.coalesceRanges(blocksToRead, index); + + assertThat(ranges).hasSize(2); + assertThat(ranges.get(0).blockIndices).containsExactly(0); + assertThat(ranges.get(1).blockIndices).containsExactly(2); + } + + @Test + public void testCoalesceSplitsByRangeSize() { + // Create many blocks that exceed RANGE_SIZE_LIMIT (2MB) when merged + int numBlocks = 30; + long[] compressedSizes = new long[numBlocks]; + long[] uncompressedSizes = new long[numBlocks]; + long[] rowStarts = new long[numBlocks]; + Arrays.fill(compressedSizes, 100 * 1024); // 100KB each + Arrays.fill(uncompressedSizes, 200 * 1024); + for (int i = 0; i < numBlocks; i++) { + rowStarts[i] = i * 100L; + } + + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = new int[numBlocks]; + for (int i = 0; i < numBlocks; i++) { + blocksToRead[i] = i; + } + + List ranges = + BlockPrefetcher.coalesceRanges(blocksToRead, index); + + // 2MB / 100KB = 20 blocks per range, so 30 blocks should split into 2 ranges + assertThat(ranges.size()).isGreaterThan(1); + for (BlockPrefetcher.MergedRange range : ranges) { + assertThat(range.length).isLessThanOrEqualTo(2 * 1024 * 1024); + } + } + + @Test + public void testCoalesceEmptyInput() { + long[] compressedSizes = {100}; + long[] uncompressedSizes = {200}; + long[] rowStarts = {0}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + List ranges = + BlockPrefetcher.coalesceRanges(new int[0], index); + + 
assertThat(ranges).isEmpty(); + } + + @Test + public void testCoalesceSingleBlock() { + long[] compressedSizes = {1024}; + long[] uncompressedSizes = {2048}; + long[] rowStarts = {0}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0}; + List ranges = + BlockPrefetcher.coalesceRanges(blocksToRead, index); + + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).offset).isEqualTo(0); + assertThat(ranges.get(0).length).isEqualTo(1024); + assertThat(ranges.get(0).blockIndices).containsExactly(0); + } + + @Test + public void testCoalesceNonContiguousBlocks() { + long[] compressedSizes = {100, 100, 100, 100, 100}; + long[] uncompressedSizes = {200, 200, 200, 200, 200}; + long[] rowStarts = {0, 10, 20, 30, 40}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0, 2, 4}; + List ranges = + BlockPrefetcher.coalesceRanges(blocksToRead, index); + + // All gaps are small (100 bytes), so everything merges into one range + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).offset).isEqualTo(0); + assertThat(ranges.get(0).length).isEqualTo(500); + assertThat(ranges.get(0).blockIndices).containsExactly(0, 2, 4); + } + + @Test + public void testPrefetcherReadsNonContiguousBlocks() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "prefetch_non_contig.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 500; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + // Select only specific non-contiguous rows that span multiple blocks + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(50); + selection.add(200); + 
selection.add(499); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 50, 200, 499); + } + + @Test + public void testPrefetcherWithManyMergedRanges() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(1000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "prefetch_multi_range.row"); + Options options = new Options(); + options.setString("file.block-size", "512b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 2000; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString("value_" + i))); + } + writeRows(format, rowType, path, rows); + + // Read all rows through the prefetcher (no selection) + List result = readAllRows(format, rowType, path); + assertThat(result).hasSize(2000); + for (int i = 0; i < 2000; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getString(1).toString()).isEqualTo("value_" + i); + } + } + + @Test + public void testPrefetcherWithSelectionAcrossManyRanges() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(2000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "prefetch_sel_ranges.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + // Write rows with large strings to ensure many blocks + List rows = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + for (int j = 
0; j < 200; j++) { + sb.append('x'); + } + String padding = sb.toString(); + for (int i = 0; i < 1000; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString(padding + i))); + } + writeRows(format, rowType, path, rows); + + // Select rows spread widely across the file + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(100); + selection.add(500); + selection.add(750); + selection.add(999); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 100, 500, 750, 999); + } + + @Test + public void testPrefetcherSingleBlockFile() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "single_block.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + List result = readAllRows(format, rowType, path); + assertThat(result).hasSize(10); + for (int i = 0; i < 10; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testPrefetcherEmptyBlocksToRead() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "empty_sel.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + 
rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + // Selection that matches no rows + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(9999); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).isEmpty(); + } + + @Test + public void testInputStreamPoolLazyCreation() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "pool_lazy.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + LocalFileIO fileIO = new LocalFileIO(); + SeekableInputStream initialStream = fileIO.newInputStream(path); + + // Pool with max 4, but only initialStream is opened eagerly + InputStreamPool pool = new InputStreamPool(fileIO, path, 4, initialStream); + + // First borrow returns the initial stream (no new stream opened) + SeekableInputStream s1 = pool.borrow(); + assertThat(s1).isSameAs(initialStream); + + // Second borrow creates a new stream lazily + SeekableInputStream s2 = pool.borrow(); + assertThat(s2).isNotSameAs(initialStream); + + // Third borrow creates another + SeekableInputStream s3 = pool.borrow(); + assertThat(s3).isNotSameAs(s1); + assertThat(s3).isNotSameAs(s2); + + // Return all + pool.returnStream(s1); + pool.returnStream(s2); + pool.returnStream(s3); + + // Re-borrow should reuse returned streams + SeekableInputStream s4 = pool.borrow(); + assertThat(s4 == s1 || s4 == s2 || s4 == 
s3).isTrue(); + + pool.returnStream(s4); + pool.close(); + } + + @Test + public void testInputStreamPoolConcurrentAccess() throws Exception { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "pool_concurrent.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + LocalFileIO fileIO = new LocalFileIO(); + SeekableInputStream initialStream = fileIO.newInputStream(path); + InputStreamPool pool = new InputStreamPool(fileIO, path, 4, initialStream); + + int numThreads = 8; + int opsPerThread = 50; + ExecutorService executor = Executors.newFixedThreadPool(numThreads); + CountDownLatch latch = new CountDownLatch(numThreads); + AtomicInteger errors = new AtomicInteger(0); + + for (int t = 0; t < numThreads; t++) { + executor.submit( + () -> { + try { + for (int i = 0; i < opsPerThread; i++) { + SeekableInputStream in = pool.borrow(); + in.seek(0); + pool.returnStream(in); + } + } catch (Exception e) { + errors.incrementAndGet(); + } finally { + latch.countDown(); + } + }); + } + + latch.await(); + executor.shutdown(); + assertThat(errors.get()).isEqualTo(0); + pool.close(); + } + + @Test + public void testPrefetchSlidingWindow() throws IOException { + // Many ranges exceeding PREFETCH_COUNT to verify sliding window works + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(5000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "sliding_window.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + // Large rows to ensure many blocks, each block ~ 1 row + List rows = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < 500; j++) { 
+ sb.append('a'); + } + String padding = sb.toString(); + for (int i = 0; i < 200; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString(padding + i))); + } + writeRows(format, rowType, path, rows); + + // Read all rows - will create many merged ranges, well beyond PREFETCH_COUNT + List result = readAllRows(format, rowType, path); + assertThat(result).hasSize(200); + for (int i = 0; i < 200; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testPrefetcherWithSparseSelection() throws IOException { + // Select 1 row per block across many blocks to stress non-contiguous iteration + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "sparse_sel.row"); + Options options = new Options(); + options.setString("file.block-size", "64b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + // Select every 100th row + RoaringBitmap32 selection = new RoaringBitmap32(); + List expectedValues = new ArrayList<>(); + for (int i = 0; i < 1000; i += 100) { + selection.add(i); + expectedValues.add(i); + } + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactlyElementsOf(expectedValues); + } + + // ======================== Helpers ======================== + + private void writeRows(FileFormat format, RowType rowType, Path path, List rows) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + 
PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + for (InternalRow row : rows) { + writer.addElement(row); + } + writer.close(); + } + + private List readAllRows(FileFormat format, RowType rowType, Path path) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + GenericRow copy = new GenericRow(rowType.getFieldCount()); + for (int i = 0; i < rowType.getFieldCount(); i++) { + if (row.isNullAt(i)) { + copy.setField(i, null); + } else { + switch (rowType.getTypeAt(i).getTypeRoot()) { + case INTEGER: + copy.setField(i, row.getInt(i)); + break; + case VARCHAR: + copy.setField(i, row.getString(i)); + break; + default: + throw new UnsupportedOperationException(); + } + } + } + result.add(copy); + }); + reader.close(); + return result; + } +} diff --git a/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java new file mode 100644 index 000000000000..0780f32921df --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java @@ -0,0 +1,970 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.GenericArray; +import org.apache.paimon.data.GenericMap; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.data.variant.GenericVariant; +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FormatReaderContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.BooleanType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DateType; +import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.MapType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.SmallIntType; +import org.apache.paimon.types.TimestampType; +import org.apache.paimon.types.TinyIntType; +import org.apache.paimon.types.VarBinaryType; +import 
org.apache.paimon.types.VarCharType; +import org.apache.paimon.types.VariantType; +import org.apache.paimon.utils.RoaringBitmap32; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for the row-store file format. */ +public class RowFormatReadWriteTest { + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testBasicReadWrite() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(100))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "test.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, BinaryString.fromString("hello"))); + expected.add(GenericRow.of(2, BinaryString.fromString("world"))); + expected.add(GenericRow.of(3, BinaryString.fromString("paimon"))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(expected.size()); + for (int i = 0; i < expected.size(); i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(expected.get(i).getInt(0)); + assertThat(result.get(i).getString(1)).isEqualTo(expected.get(i).getString(1)); + } + } + + @Test + public void testAllPrimitiveTypes() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_boolean", new BooleanType()), + new DataField(1, "f_tinyint", new TinyIntType()), + new DataField(2, "f_smallint", new SmallIntType()), + new DataField(3, "f_int", new IntType()), + new DataField(4, "f_bigint", new BigIntType()), + new DataField(5, "f_float", new FloatType()), + new DataField(6, "f_double", 
new DoubleType()), + new DataField(7, "f_string", new VarCharType(200)), + new DataField(8, "f_binary", new VarBinaryType(200)), + new DataField(9, "f_date", new DateType()), + new DataField(10, "f_decimal_compact", new DecimalType(10, 2)), + new DataField(11, "f_decimal_large", new DecimalType(30, 10)), + new DataField(12, "f_timestamp_compact", new TimestampType(3)), + new DataField(13, "f_timestamp_full", new TimestampType(9)))); + + Path path = new Path(tempDir.toUri().toString(), "all_types.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add( + GenericRow.of( + true, + (byte) 127, + (short) 32000, + Integer.MAX_VALUE, + Long.MAX_VALUE, + 3.14f, + 2.718281828, + BinaryString.fromString("hello world"), + new byte[] {1, 2, 3, 4, 5}, + 18000, + Decimal.fromBigDecimal(new BigDecimal("12345678.99"), 10, 2), + Decimal.fromBigDecimal( + new BigDecimal("12345678901234567890.1234567890"), 30, 10), + Timestamp.fromEpochMillis(1700000000000L), + Timestamp.fromEpochMillis(1700000000000L, 123456))); + + expected.add( + GenericRow.of( + false, + (byte) -128, + (short) -32000, + Integer.MIN_VALUE, + Long.MIN_VALUE, + -0.0f, + Double.MAX_VALUE, + BinaryString.fromString(""), + new byte[0], + 0, + Decimal.fromBigDecimal(new BigDecimal("-99999999.99"), 10, 2), + Decimal.fromBigDecimal( + new BigDecimal("-12345678901234567890.1234567890"), 30, 10), + Timestamp.fromEpochMillis(0L), + Timestamp.fromEpochMillis(0L, 0))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + for (int rowIdx = 0; rowIdx < 2; rowIdx++) { + InternalRow actual = result.get(rowIdx); + InternalRow exp = expected.get(rowIdx); + assertThat(actual.getBoolean(0)).isEqualTo(exp.getBoolean(0)); + assertThat(actual.getByte(1)).isEqualTo(exp.getByte(1)); + assertThat(actual.getShort(2)).isEqualTo(exp.getShort(2)); + 
assertThat(actual.getInt(3)).isEqualTo(exp.getInt(3)); + assertThat(actual.getLong(4)).isEqualTo(exp.getLong(4)); + assertThat(actual.getFloat(5)).isEqualTo(exp.getFloat(5)); + assertThat(actual.getDouble(6)).isEqualTo(exp.getDouble(6)); + assertThat(actual.getString(7)).isEqualTo(exp.getString(7)); + assertThat(actual.getBinary(8)).isEqualTo(exp.getBinary(8)); + assertThat(actual.getInt(9)).isEqualTo(exp.getInt(9)); + assertThat(actual.getDecimal(10, 10, 2)).isEqualTo(exp.getDecimal(10, 10, 2)); + assertThat(actual.getDecimal(11, 30, 10)).isEqualTo(exp.getDecimal(11, 30, 10)); + assertThat(actual.getTimestamp(12, 3)).isEqualTo(exp.getTimestamp(12, 3)); + assertThat(actual.getTimestamp(13, 9)).isEqualTo(exp.getTimestamp(13, 9)); + } + } + + @Test + public void testNullValues() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_int", new IntType(true)), + new DataField(1, "f_string", new VarCharType(true, 100)), + new DataField(2, "f_double", new DoubleType(true)), + new DataField(3, "f_binary", new VarBinaryType(true, 100)), + new DataField(4, "f_decimal", new DecimalType(true, 10, 2)), + new DataField(5, "f_timestamp", new TimestampType(true, 9)))); + + Path path = new Path(tempDir.toUri().toString(), "nulls.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(null, null, null, null, null, null)); + expected.add( + GenericRow.of( + 42, + BinaryString.fromString("not null"), + 3.14, + new byte[] {1}, + Decimal.fromBigDecimal(new BigDecimal("1.23"), 10, 2), + Timestamp.fromEpochMillis(1000L, 999))); + expected.add( + GenericRow.of(null, BinaryString.fromString("partial"), null, null, null, null)); + expected.add( + GenericRow.of(100, null, 2.0, null, null, Timestamp.fromEpochMillis(2000L, 0))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + 
assertThat(result.size()).isEqualTo(4); + + assertThat(result.get(0).isNullAt(0)).isTrue(); + assertThat(result.get(0).isNullAt(1)).isTrue(); + assertThat(result.get(0).isNullAt(2)).isTrue(); + assertThat(result.get(0).isNullAt(3)).isTrue(); + assertThat(result.get(0).isNullAt(4)).isTrue(); + assertThat(result.get(0).isNullAt(5)).isTrue(); + + assertThat(result.get(1).getInt(0)).isEqualTo(42); + assertThat(result.get(1).getString(1)).isEqualTo(BinaryString.fromString("not null")); + assertThat(result.get(1).getDouble(2)).isEqualTo(3.14); + assertThat(result.get(1).getBinary(3)).isEqualTo(new byte[] {1}); + assertThat(result.get(1).getDecimal(4, 10, 2)) + .isEqualTo(Decimal.fromBigDecimal(new BigDecimal("1.23"), 10, 2)); + assertThat(result.get(1).getTimestamp(5, 9)) + .isEqualTo(Timestamp.fromEpochMillis(1000L, 999)); + + assertThat(result.get(2).isNullAt(0)).isTrue(); + assertThat(result.get(2).getString(1)).isEqualTo(BinaryString.fromString("partial")); + assertThat(result.get(2).isNullAt(2)).isTrue(); + + assertThat(result.get(3).getInt(0)).isEqualTo(100); + assertThat(result.get(3).isNullAt(1)).isTrue(); + assertThat(result.get(3).getDouble(2)).isEqualTo(2.0); + } + + @Test + public void testArrayType() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_int_array", new ArrayType(new IntType())), + new DataField( + 1, "f_string_array", new ArrayType(new VarCharType(100))), + new DataField( + 2, + "f_nullable_array", + new ArrayType(true, new IntType(true))))); + + Path path = new Path(tempDir.toUri().toString(), "arrays.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add( + GenericRow.of( + new GenericArray(new Object[] {1, 2, 3}), + new GenericArray( + new Object[] { + BinaryString.fromString("a"), BinaryString.fromString("b") + }), + new GenericArray(new Object[] {1, null, 3, null, 5}))); + expected.add( + GenericRow.of( + new 
GenericArray(new Object[0]), + new GenericArray(new Object[] {BinaryString.fromString("")}), + null)); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + + assertThat(result.get(0).getArray(0).size()).isEqualTo(3); + assertThat(result.get(0).getArray(0).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getArray(0).getInt(1)).isEqualTo(2); + assertThat(result.get(0).getArray(0).getInt(2)).isEqualTo(3); + + assertThat(result.get(0).getArray(1).size()).isEqualTo(2); + assertThat(result.get(0).getArray(1).getString(0)).isEqualTo(BinaryString.fromString("a")); + assertThat(result.get(0).getArray(1).getString(1)).isEqualTo(BinaryString.fromString("b")); + + assertThat(result.get(0).getArray(2).size()).isEqualTo(5); + assertThat(result.get(0).getArray(2).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getArray(2).isNullAt(1)).isTrue(); + assertThat(result.get(0).getArray(2).getInt(2)).isEqualTo(3); + assertThat(result.get(0).getArray(2).isNullAt(3)).isTrue(); + assertThat(result.get(0).getArray(2).getInt(4)).isEqualTo(5); + + assertThat(result.get(1).getArray(0).size()).isEqualTo(0); + assertThat(result.get(1).getArray(1).size()).isEqualTo(1); + assertThat(result.get(1).isNullAt(2)).isTrue(); + } + + @Test + public void testMapType() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField( + 0, + "f_map", + new MapType(new VarCharType(100), new IntType(true))))); + + Path path = new Path(tempDir.toUri().toString(), "maps.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Map map1 = new HashMap<>(); + map1.put(BinaryString.fromString("a"), 1); + map1.put(BinaryString.fromString("b"), 2); + map1.put(BinaryString.fromString("c"), null); + + Map map2 = new HashMap<>(); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(new GenericMap(map1))); + expected.add(GenericRow.of(new GenericMap(map2))); + 
+ writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + assertThat(result.get(0).getMap(0).size()).isEqualTo(3); + assertThat(result.get(1).getMap(0).size()).isEqualTo(0); + } + + @Test + public void testMapWithNullKeys() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField( + 0, + "f_map", + new MapType( + new VarCharType(true, 100), new IntType(true))))); + + Path path = new Path(tempDir.toUri().toString(), "null_key_maps.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Map map = new HashMap<>(); + map.put(null, 100); + map.put(BinaryString.fromString("key"), null); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(new GenericMap(map))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(1); + assertThat(result.get(0).getMap(0).size()).isEqualTo(2); + } + + @Test + public void testNestedRow() throws IOException { + RowType innerType = + new RowType( + Arrays.asList( + new DataField(0, "x", new IntType()), + new DataField(1, "y", new VarCharType(100)))); + + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "nested", innerType), + new DataField(2, "nullable_nested", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, GenericRow.of(10, BinaryString.fromString("inner")), null)); + expected.add( + GenericRow.of( + 2, + GenericRow.of(20, null), + GenericRow.of(30, BinaryString.fromString("deep")))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + + 
assertThat(result.get(0).getInt(0)).isEqualTo(1); + InternalRow nested0 = result.get(0).getRow(1, 2); + assertThat(nested0.getInt(0)).isEqualTo(10); + assertThat(nested0.getString(1)).isEqualTo(BinaryString.fromString("inner")); + assertThat(result.get(0).isNullAt(2)).isTrue(); + + assertThat(result.get(1).getInt(0)).isEqualTo(2); + InternalRow nested1 = result.get(1).getRow(1, 2); + assertThat(nested1.getInt(0)).isEqualTo(20); + assertThat(nested1.isNullAt(1)).isTrue(); + InternalRow nested2 = result.get(1).getRow(2, 2); + assertThat(nested2.getInt(0)).isEqualTo(30); + assertThat(nested2.getString(1)).isEqualTo(BinaryString.fromString("deep")); + } + + @Test + public void testDeeplyNestedTypes() throws IOException { + RowType innerRowType = + new RowType( + Arrays.asList( + new DataField(0, "v", new IntType()), + new DataField(1, "arr", new ArrayType(new IntType())))); + + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField( + 1, + "nested_array", + new ArrayType(new ArrayType(new IntType()))), + new DataField( + 2, + "map_of_arrays", + new MapType( + new VarCharType(50), new ArrayType(new IntType()))), + new DataField(3, "array_of_rows", new ArrayType(innerRowType)))); + + Path path = new Path(tempDir.toUri().toString(), "deeply_nested.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + GenericArray innerArr1 = new GenericArray(new Object[] {1, 2, 3}); + GenericArray innerArr2 = new GenericArray(new Object[] {4, 5}); + GenericArray nestedArray = new GenericArray(new Object[] {innerArr1, innerArr2}); + + Map mapOfArrays = new HashMap<>(); + mapOfArrays.put(BinaryString.fromString("x"), new GenericArray(new Object[] {10, 20})); + mapOfArrays.put(BinaryString.fromString("y"), new GenericArray(new Object[] {30})); + + GenericRow innerRow1 = GenericRow.of(100, new GenericArray(new Object[] {1, 2})); + GenericRow innerRow2 = GenericRow.of(200, new GenericArray(new Object[] {3})); + 
GenericArray arrayOfRows = new GenericArray(new Object[] {innerRow1, innerRow2}); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, nestedArray, new GenericMap(mapOfArrays), arrayOfRows)); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(1); + InternalRow row = result.get(0); + assertThat(row.getInt(0)).isEqualTo(1); + + assertThat(row.getArray(1).size()).isEqualTo(2); + assertThat(row.getArray(1).getArray(0).getInt(0)).isEqualTo(1); + assertThat(row.getArray(1).getArray(0).getInt(2)).isEqualTo(3); + assertThat(row.getArray(1).getArray(1).getInt(0)).isEqualTo(4); + + assertThat(row.getMap(2).size()).isEqualTo(2); + + assertThat(row.getArray(3).size()).isEqualTo(2); + InternalRow readInner0 = row.getArray(3).getRow(0, 2); + assertThat(readInner0.getInt(0)).isEqualTo(100); + assertThat(readInner0.getArray(1).getInt(0)).isEqualTo(1); + assertThat(readInner0.getArray(1).getInt(1)).isEqualTo(2); + InternalRow readInner1 = row.getArray(3).getRow(1, 2); + assertThat(readInner1.getInt(0)).isEqualTo(200); + } + + @Test + public void testVariantType() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "v", new VariantType()))); + + Path path = new Path(tempDir.toUri().toString(), "variant.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + GenericVariant v1 = GenericVariant.fromJson("{\"key\": 123}"); + GenericVariant v2 = GenericVariant.fromJson("[1, 2, 3]"); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, v1)); + expected.add(GenericRow.of(2, v2)); + expected.add(GenericRow.of(3, null)); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(3); + assertThat(result.get(0).getInt(0)).isEqualTo(1); + 
assertThat(result.get(0).getVariant(1).value()).isEqualTo(v1.value()); + assertThat(result.get(0).getVariant(1).metadata()).isEqualTo(v1.metadata()); + assertThat(result.get(1).getVariant(1).value()).isEqualTo(v2.value()); + assertThat(result.get(1).getVariant(1).metadata()).isEqualTo(v2.metadata()); + assertThat(result.get(2).isNullAt(1)).isTrue(); + } + + @Test + public void testMultipleBlocks() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "multi_block.row"); + Options options = new Options(); + options.setString("file.block-size", "1kb"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List expected = new ArrayList<>(); + for (int i = 0; i < 10000; i++) { + expected.add(GenericRow.of(i)); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(expected.size()); + for (int i = 0; i < expected.size(); i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testRowPositionTracking() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "positions.row"); + Options options = new Options(); + options.setString("file.block-size", "512b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List expected = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + expected.add(GenericRow.of(i)); + } + + writeRows(format, rowType, path, expected); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + long expectedPosition = 0; + FileRecordIterator batch; + while ((batch = 
reader.readBatch()) != null) { + InternalRow row; + while ((row = batch.next()) != null) { + assertThat(batch.returnedPosition()).isEqualTo(expectedPosition); + assertThat(row.getInt(0)).isEqualTo((int) expectedPosition); + expectedPosition++; + } + batch.releaseBatch(); + } + assertThat(expectedPosition).isEqualTo(1000); + reader.close(); + } + + @Test + public void testProjection() throws IOException { + RowType fullType = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType(100)), + new DataField(2, "c", new IntType()))); + + RowType projectedType = new RowType(Arrays.asList(new DataField(2, "c", new IntType()))); + + Path path = new Path(tempDir.toUri().toString(), "projection.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, BinaryString.fromString("a"), 100)); + rows.add(GenericRow.of(2, BinaryString.fromString("b"), 200)); + rows.add(GenericRow.of(3, BinaryString.fromString("c"), 300)); + + writeRows(format, fullType, path, rows); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(fullType, projectedType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(GenericRow.of(row.getInt(0)))); + reader.close(); + + assertThat(result.size()).isEqualTo(3); + assertThat(result.get(0).getInt(0)).isEqualTo(100); + assertThat(result.get(1).getInt(0)).isEqualTo(200); + assertThat(result.get(2).getInt(0)).isEqualTo(300); + } + + @Test + public void testProjectionMultipleColumns() throws IOException { + RowType fullType = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType(100)), + new DataField(2, "c", new DoubleType()), + new 
DataField(3, "d", new BigIntType()))); + + RowType projectedType = + new RowType( + Arrays.asList( + new DataField(2, "c", new DoubleType()), + new DataField(0, "a", new IntType()))); + + Path path = new Path(tempDir.toUri().toString(), "projection_multi.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, BinaryString.fromString("x"), 1.1, 100L)); + rows.add(GenericRow.of(2, BinaryString.fromString("y"), 2.2, 200L)); + + writeRows(format, fullType, path, rows); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(fullType, projectedType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(GenericRow.of(row.getDouble(0), row.getInt(1)))); + reader.close(); + + assertThat(result.size()).isEqualTo(2); + assertThat(result.get(0).getDouble(0)).isEqualTo(1.1); + assertThat(result.get(0).getInt(1)).isEqualTo(1); + assertThat(result.get(1).getDouble(0)).isEqualTo(2.2); + assertThat(result.get(1).getInt(1)).isEqualTo(2); + } + + @Test + public void testSelection() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "selection.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List expected = new ArrayList<>(); + for (int i = 0; i < 500; i++) { + expected.add(GenericRow.of(i)); + } + + writeRows(format, rowType, path, expected); + + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(10); + selection.add(100); + selection.add(499); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + 
format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 10, 100, 499); + } + + @Test + public void testSelectionSkipsEntireBlocks() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(100))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "selection_skip.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString("val" + i))); + } + + writeRows(format, rowType, path, rows); + + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(999); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 999); + } + + @Test + public void testLargeVariableLengthData() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(10000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "large_strings.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Random random = new Random(42); + List expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + StringBuilder sb = new 
StringBuilder(); + int len = random.nextInt(5000) + 100; + for (int j = 0; j < len; j++) { + sb.append((char) ('a' + random.nextInt(26))); + } + expected.add(GenericRow.of(i, BinaryString.fromString(sb.toString()))); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(expected.size()); + for (int i = 0; i < expected.size(); i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(expected.get(i).getInt(0)); + assertThat(result.get(i).getString(1)).isEqualTo(expected.get(i).getString(1)); + } + } + + @Test + public void testEmptyFile() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "empty.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + writeRows(format, rowType, path, new ArrayList<>()); + List result = readAllRows(format, rowType, path); + + assertThat(result).isEmpty(); + } + + @Test + public void testManyColumns() throws IOException { + int numCols = 100; + List fields = new ArrayList<>(); + for (int i = 0; i < numCols; i++) { + fields.add(new DataField(i, "f" + i, new IntType(true))); + } + RowType rowType = new RowType(fields); + + Path path = new Path(tempDir.toUri().toString(), "many_cols.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + for (int r = 0; r < 50; r++) { + Object[] values = new Object[numCols]; + for (int c = 0; c < numCols; c++) { + values[c] = (r + c) % 3 == 0 ? 
null : r * numCols + c; + } + expected.add(GenericRow.of(values)); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(50); + for (int r = 0; r < 50; r++) { + for (int c = 0; c < numCols; c++) { + if ((r + c) % 3 == 0) { + assertThat(result.get(r).isNullAt(c)).isTrue(); + } else { + assertThat(result.get(r).getInt(c)).isEqualTo(r * numCols + c); + } + } + } + } + + @Test + public void testRandomizedRoundTrip() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_int", new IntType(true)), + new DataField(1, "f_long", new BigIntType(true)), + new DataField(2, "f_str", new VarCharType(true, 500)), + new DataField(3, "f_double", new DoubleType(true)), + new DataField(4, "f_bool", new BooleanType(true)), + new DataField(5, "f_bytes", new VarBinaryType(true, 500)))); + + Path path = new Path(tempDir.toUri().toString(), "random.row"); + Options options = new Options(); + options.setString("file.block-size", "2kb"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + Random random = new Random(12345); + int numRows = 5000; + List expected = new ArrayList<>(numRows); + for (int i = 0; i < numRows; i++) { + Object[] values = new Object[6]; + values[0] = random.nextBoolean() ? null : random.nextInt(); + values[1] = random.nextBoolean() ? null : random.nextLong(); + values[2] = + random.nextBoolean() + ? null + : BinaryString.fromString(randomString(random, random.nextInt(200))); + values[3] = random.nextBoolean() ? null : random.nextDouble(); + values[4] = random.nextBoolean() ? null : random.nextBoolean(); + values[5] = random.nextBoolean() ? 
null : randomBytes(random, random.nextInt(100)); + expected.add(GenericRow.of(values)); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(numRows); + for (int i = 0; i < numRows; i++) { + InternalRow exp = expected.get(i); + InternalRow act = result.get(i); + for (int c = 0; c < 6; c++) { + assertThat(act.isNullAt(c)) + .as("row %d col %d null mismatch", i, c) + .isEqualTo(exp.isNullAt(c)); + } + if (!act.isNullAt(0)) { + assertThat(act.getInt(0)).isEqualTo(exp.getInt(0)); + } + if (!act.isNullAt(1)) { + assertThat(act.getLong(1)).isEqualTo(exp.getLong(1)); + } + if (!act.isNullAt(2)) { + assertThat(act.getString(2)).isEqualTo(exp.getString(2)); + } + if (!act.isNullAt(3)) { + assertThat(act.getDouble(3)).isEqualTo(exp.getDouble(3)); + } + if (!act.isNullAt(4)) { + assertThat(act.getBoolean(4)).isEqualTo(exp.getBoolean(4)); + } + if (!act.isNullAt(5)) { + assertThat(act.getBinary(5)).isEqualTo(exp.getBinary(5)); + } + } + } + + // ======================== Helpers ======================== + + private void writeRows(FileFormat format, RowType rowType, Path path, List rows) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + for (InternalRow row : rows) { + writer.addElement(row); + } + writer.close(); + } + + private List readAllRows(FileFormat format, RowType rowType, Path path) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(copyRow(row, rowType))); + reader.close(); + return result; + } + + 
private GenericRow copyRow(InternalRow row, RowType rowType) { + int arity = rowType.getFieldCount(); + GenericRow copy = new GenericRow(arity); + for (int i = 0; i < arity; i++) { + if (row.isNullAt(i)) { + copy.setField(i, null); + } else { + copy.setField(i, copyField(row, i, rowType.getTypeAt(i))); + } + } + return copy; + } + + private Object copyField(InternalRow row, int i, org.apache.paimon.types.DataType type) { + switch (type.getTypeRoot()) { + case BOOLEAN: + return row.getBoolean(i); + case TINYINT: + return row.getByte(i); + case SMALLINT: + return row.getShort(i); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return row.getInt(i); + case BIGINT: + return row.getLong(i); + case FLOAT: + return row.getFloat(i); + case DOUBLE: + return row.getDouble(i); + case CHAR: + case VARCHAR: + return row.getString(i); + case BINARY: + case VARBINARY: + return row.getBinary(i); + case DECIMAL: + { + int p = ((org.apache.paimon.types.DecimalType) type).getPrecision(); + int s = ((org.apache.paimon.types.DecimalType) type).getScale(); + return row.getDecimal(i, p, s); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + { + int p = ((org.apache.paimon.types.TimestampType) type).getPrecision(); + return row.getTimestamp(i, p); + } + case VARIANT: + return row.getVariant(i); + case ARRAY: + return row.getArray(i); + case MAP: + return row.getMap(i); + case MULTISET: + return row.getMap(i); + case ROW: + return row.getRow(i, ((RowType) type).getFieldCount()); + default: + throw new UnsupportedOperationException("Unsupported: " + type); + } + } + + private static String randomString(Random random, int length) { + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append((char) ('a' + random.nextInt(26))); + } + return sb.toString(); + } + + private static byte[] randomBytes(Random random, int length) { + byte[] bytes = new byte[length]; + random.nextBytes(bytes); + return bytes; + } +} From 
b28e3fa03f5242c353d559256a0d626ed86df74d Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 22 May 2026 14:48:45 +0800 Subject: [PATCH 2/6] fix --- .../paimon/format/row/BlockPrefetcher.java | 210 +++--------------- .../paimon/format/row/InputStreamPool.java | 76 ------- .../paimon/format/row/ReadStrategy.java | 37 +++ .../paimon/format/row/RowBlockIndex.java | 3 + .../paimon/format/row/RowFileFooter.java | 15 +- .../paimon/format/row/RowFormatReader.java | 5 +- .../format/row/RowFormatReaderFactory.java | 34 ++- .../format/row/SequentialReadStrategy.java | 117 ++++++++++ .../format/row/VectoredReadStrategy.java | 201 +++++++++++++++++ .../format/row/BlockPrefetcherTest.java | 126 ++--------- 10 files changed, 443 insertions(+), 381 deletions(-) delete mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java create mode 100644 paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java index 5ceff2529f4f..e50210dc4cf6 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java @@ -18,208 +18,68 @@ package org.apache.paimon.format.row; -import org.apache.paimon.compression.ZstdBlockDecompressor; import org.apache.paimon.fs.SeekableInputStream; -import org.apache.paimon.utils.IOUtils; +import org.apache.paimon.fs.VectoredReadable; import java.io.Closeable; import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.ArrayDeque; -import java.util.ArrayList; -import java.util.List; -import java.util.Queue; import 
java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; /** - * Prefetches and decompresses blocks with IO coalescing and concurrent reading. + * Prefetches and decompresses blocks ahead of consumption. * - *

<p>Adjacent blocks are merged into larger IO requests to reduce QPS. Multiple merged ranges are - * prefetched concurrently using separate InputStreams. + *

<p>Two IO strategies depending on whether the stream supports positional reads: + * + *

<ul> + *
<li>{@link VectoredReadable}: adjacent blocks are coalesced into merged ranges and prefetched + * concurrently using thread-safe {@code preadFully}. + *
<li>Otherwise: blocks are read sequentially in a single background thread (async serial + * read-ahead). + * </ul>
*/ class BlockPrefetcher implements Closeable { - private static final int HOLE_SIZE_LIMIT = 256 * 1024; - private static final int RANGE_SIZE_LIMIT = 2 * 1024 * 1024; - private static final int PREFETCH_COUNT = 4; - - private static final ExecutorService IO_POOL = Executors.newCachedThreadPool(); - - private final InputStreamPool streamPool; - private final RowBlockIndex blockIndex; - private final ZstdBlockDecompressor decompressor; - - private final List mergedRanges; - private final Queue> prefetchQueue; - private int nextRangeToSubmit; - private int currentRangeIdx; - private byte[] currentRangeData; - private int currentBlockInRange; - - BlockPrefetcher(InputStreamPool streamPool, RowBlockIndex blockIndex, int[] blocksToRead) { - this.streamPool = streamPool; - this.blockIndex = blockIndex; - this.decompressor = new ZstdBlockDecompressor(); - this.mergedRanges = coalesceRanges(blocksToRead, blockIndex); - this.prefetchQueue = new ArrayDeque<>(PREFETCH_COUNT); - this.nextRangeToSubmit = 0; - this.currentRangeIdx = -1; - this.currentBlockInRange = 0; + private final SeekableInputStream inputStream; + private final ReadStrategy strategy; - fillPrefetch(); + BlockPrefetcher(SeekableInputStream inputStream, RowBlockIndex blockIndex, int[] blocksToRead) { + this.inputStream = inputStream; + if (inputStream instanceof VectoredReadable) { + this.strategy = + new VectoredReadStrategy( + (VectoredReadable) inputStream, blockIndex, blocksToRead); + } else { + this.strategy = new SequentialReadStrategy(inputStream, blockIndex, blocksToRead); + } } byte[] nextBlock() throws IOException { - if (currentRangeIdx < 0 - || currentBlockInRange >= mergedRanges.get(currentRangeIdx).blockIndices.length) { - advanceToNextRange(); - } - if (currentRangeIdx >= mergedRanges.size()) { - return null; - } - - MergedRange range = mergedRanges.get(currentRangeIdx); - int blockIdx = range.blockIndices[currentBlockInRange]; - int offsetInBuf = (int) (blockIndex.blockOffset(blockIdx) - 
range.offset); - int compressedSize = (int) blockIndex.blockCompressedSize(blockIdx); - int uncompressedSize = (int) blockIndex.blockUncompressedSize(blockIdx); - - byte[] decompressed = new byte[uncompressedSize]; - decompressor.decompress(currentRangeData, offsetInBuf, compressedSize, decompressed, 0); - - currentBlockInRange++; - return decompressed; + return strategy.nextBlock(); } int currentBlockIdx() { - if (currentRangeIdx < 0 || currentRangeIdx >= mergedRanges.size()) { - return -1; - } - MergedRange range = mergedRanges.get(currentRangeIdx); - return range.blockIndices[currentBlockInRange - 1]; + return strategy.currentBlockIdx(); } @Override public void close() throws IOException { - for (CompletableFuture f : prefetchQueue) { - f.cancel(true); - } - prefetchQueue.clear(); - streamPool.close(); - } - - private void advanceToNextRange() throws IOException { - currentRangeIdx++; - currentBlockInRange = 0; - - if (currentRangeIdx >= mergedRanges.size()) { - currentRangeData = null; - return; - } - - CompletableFuture future = prefetchQueue.poll(); - if (future == null) { - currentRangeData = readRange(mergedRanges.get(currentRangeIdx)); - } else { - try { - currentRangeData = future.get(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("Interrupted while waiting for prefetch", e); - } catch (ExecutionException e) { - Throwable cause = e.getCause(); - if (cause instanceof IOException) { - throw (IOException) cause; - } - throw new IOException("Prefetch failed", cause); - } - } - fillPrefetch(); - } - - private void fillPrefetch() { - while (prefetchQueue.size() < PREFETCH_COUNT && nextRangeToSubmit < mergedRanges.size()) { - int rangeIdx = nextRangeToSubmit++; - MergedRange range = mergedRanges.get(rangeIdx); - prefetchQueue.add( - CompletableFuture.supplyAsync( - () -> { - try { - return readRange(range); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - }, - IO_POOL)); - } + 
strategy.close(); + inputStream.close(); } - private byte[] readRange(MergedRange range) throws IOException { - byte[] buf = new byte[range.length]; - SeekableInputStream in = streamPool.borrow(); + static byte[] awaitFuture(CompletableFuture future) throws IOException { try { - in.seek(range.offset); - IOUtils.readFully(in, buf); - } finally { - streamPool.returnStream(in); - } - return buf; - } - - // ======================== Range Coalescing ======================== - - static List coalesceRanges(int[] blocksToRead, RowBlockIndex blockIndex) { - List result = new ArrayList<>(); - if (blocksToRead.length == 0) { - return result; - } - - int rangeStart = 0; - long rangeOffset = blockIndex.blockOffset(blocksToRead[0]); - long rangeEnd = rangeOffset + blockIndex.blockCompressedSize(blocksToRead[0]); - - for (int i = 1; i < blocksToRead.length; i++) { - int blockIdx = blocksToRead[i]; - long blockOffset = blockIndex.blockOffset(blockIdx); - long blockEnd = blockOffset + blockIndex.blockCompressedSize(blockIdx); - long gap = blockOffset - rangeEnd; - long newLength = blockEnd - rangeOffset; - - if (gap < HOLE_SIZE_LIMIT && newLength <= RANGE_SIZE_LIMIT) { - rangeEnd = blockEnd; - } else { - result.add(buildRange(blocksToRead, rangeStart, i, rangeOffset, rangeEnd)); - rangeStart = i; - rangeOffset = blockOffset; - rangeEnd = blockEnd; + return future.get(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted while waiting for prefetch", e); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof IOException) { + throw (IOException) cause; } - } - result.add( - buildRange(blocksToRead, rangeStart, blocksToRead.length, rangeOffset, rangeEnd)); - return result; - } - - private static MergedRange buildRange( - int[] blocksToRead, int from, int to, long rangeOffset, long rangeEnd) { - int[] indices = new int[to - from]; - System.arraycopy(blocksToRead, from, indices, 0, 
indices.length); - return new MergedRange(rangeOffset, (int) (rangeEnd - rangeOffset), indices); - } - - // ======================== MergedRange ======================== - - static class MergedRange { - final long offset; - final int length; - final int[] blockIndices; - - MergedRange(long offset, int length, int[] blockIndices) { - this.offset = offset; - this.length = length; - this.blockIndices = blockIndices; + throw new IOException("Prefetch failed", cause); } } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java b/paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java deleted file mode 100644 index 7ef811505b5c..000000000000 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/InputStreamPool.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.paimon.format.row; - -import org.apache.paimon.fs.FileIO; -import org.apache.paimon.fs.Path; -import org.apache.paimon.fs.SeekableInputStream; - -import java.io.Closeable; -import java.io.IOException; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.atomic.AtomicInteger; - -/** A lazy pool of {@link SeekableInputStream} instances that opens streams on demand. */ -class InputStreamPool implements Closeable { - - private final FileIO fileIO; - private final Path path; - private final int maxSize; - private final AtomicInteger created; - private final LinkedBlockingQueue available; - - InputStreamPool(FileIO fileIO, Path path, int maxSize, SeekableInputStream initialStream) { - this.fileIO = fileIO; - this.path = path; - this.maxSize = maxSize; - this.created = new AtomicInteger(1); - this.available = new LinkedBlockingQueue<>(); - this.available.add(initialStream); - } - - SeekableInputStream borrow() throws IOException { - SeekableInputStream in = available.poll(); - if (in != null) { - return in; - } - if (created.getAndIncrement() < maxSize) { - return fileIO.newInputStream(path); - } - created.decrementAndGet(); - try { - return available.take(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new IOException("Interrupted while waiting for stream", e); - } - } - - void returnStream(SeekableInputStream in) { - available.add(in); - } - - @Override - public void close() throws IOException { - SeekableInputStream in; - while ((in = available.poll()) != null) { - in.close(); - } - } -} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java b/paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java new file mode 100644 index 000000000000..2a5a07167903 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + 
* or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import static org.apache.paimon.utils.ThreadUtils.newDaemonThreadFactory; + +/** Strategy for reading and prefetching compressed blocks. 
*/ +interface ReadStrategy extends Closeable { + + ExecutorService IO_POOL = + Executors.newCachedThreadPool(newDaemonThreadFactory("ROW-FORMAT-IO")); + + byte[] nextBlock() throws IOException; + + int currentBlockIdx(); +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java index 294eac1bcee4..2987d28bf9c5 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java @@ -73,7 +73,10 @@ static RowBlockIndex readFrom(SeekableInputStream in, long indexOffset, int inde in.seek(indexOffset); byte[] indexData = new byte[indexLength]; IOUtils.readFully(in, indexData); + return readFrom(indexData); + } + static RowBlockIndex readFrom(byte[] indexData) { int pos = 0; int len1 = decodeVarInt(indexData, pos); pos += varIntSize(len1); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java index adc3edd0ce5f..c6d0026d11f0 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java @@ -58,23 +58,26 @@ static RowFileFooter readFrom(SeekableInputStream in, long fileSize) throws IOEx in.seek(fileSize - FOOTER_SIZE); byte[] buf = new byte[FOOTER_SIZE]; readFully(in, buf); + return readFrom(buf, 0); + } - int magic = readIntLE(buf, 28); + static RowFileFooter readFrom(byte[] buf, int offset) throws IOException { + int magic = readIntLE(buf, offset + 28); if (magic != MAGIC) { throw new IOException( String.format( "Invalid row file magic: expected 0x%08X, got 0x%08X", MAGIC, magic)); } - byte version = buf[24]; + byte version = buf[offset + 24]; if (version != VERSION) { throw new IOException("Unsupported row file version: " + version); } - long 
totalRowCount = readLongLE(buf, 0); - int blockCount = readIntLE(buf, 8); - long indexOffset = readLongLE(buf, 12); - int indexLength = readIntLE(buf, 20); + long totalRowCount = readLongLE(buf, offset); + int blockCount = readIntLE(buf, offset + 8); + long indexOffset = readLongLE(buf, offset + 12); + int indexLength = readIntLE(buf, offset + 20); return new RowFileFooter(totalRowCount, blockCount, indexOffset, indexLength); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java index 0a1904f8e4cf..7d288754ba4e 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; import org.apache.paimon.reader.FileRecordIterator; import org.apache.paimon.reader.FileRecordReader; import org.apache.paimon.types.RowType; @@ -41,7 +42,7 @@ public class RowFormatReader implements FileRecordReader { private final BlockPrefetcher prefetcher; RowFormatReader( - InputStreamPool streamPool, + SeekableInputStream inputStream, Path filePath, RowFileFooter footer, RowBlockIndex blockIndex, @@ -56,7 +57,7 @@ public class RowFormatReader implements FileRecordReader { this.selection = selection; this.prefetcher = new BlockPrefetcher( - streamPool, + inputStream, blockIndex, computeBlocksToRead(blockIndex, footer.totalRowCount, selection)); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java index 08ec2ee9702d..c0bd11a5d72d 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java +++ 
b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java @@ -25,6 +25,7 @@ import org.apache.paimon.fs.SeekableInputStream; import org.apache.paimon.reader.FileRecordReader; import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.IOUtils; import javax.annotation.Nullable; @@ -33,6 +34,8 @@ /** Factory for creating {@link RowFormatReader}. */ public class RowFormatReaderFactory implements FormatReaderFactory { + private static final int TAIL_PREFETCH_SIZE = 64 * 1024; + private final RowType rowType; @Nullable private final int[] projectionMapping; @@ -45,21 +48,30 @@ public RowFormatReaderFactory(RowType rowType, @Nullable int[] projectionMapping public FileRecordReader createReader(Context context) throws IOException { FileIO fileIO = context.fileIO(); Path path = context.filePath(); + long fileSize = context.fileSize(); SeekableInputStream in = fileIO.newInputStream(path); - RowFileFooter footer = RowFileFooter.readFrom(in, context.fileSize()); - RowBlockIndex blockIndex = - RowBlockIndex.readFrom(in, footer.indexOffset, footer.indexLength); - InputStreamPool streamPool = new InputStreamPool(fileIO, path, 4, in); + int tailSize = (int) Math.min(TAIL_PREFETCH_SIZE, fileSize); + long tailOffset = fileSize - tailSize; + in.seek(tailOffset); + byte[] tailBuf = new byte[tailSize]; + IOUtils.readFully(in, tailBuf); + + RowFileFooter footer = + RowFileFooter.readFrom(tailBuf, tailSize - RowFileFooter.FOOTER_SIZE); + + RowBlockIndex blockIndex; + if (footer.indexOffset >= tailOffset) { + int indexOffsetInBuf = (int) (footer.indexOffset - tailOffset); + byte[] indexData = new byte[footer.indexLength]; + System.arraycopy(tailBuf, indexOffsetInBuf, indexData, 0, footer.indexLength); + blockIndex = RowBlockIndex.readFrom(indexData); + } else { + blockIndex = RowBlockIndex.readFrom(in, footer.indexOffset, footer.indexLength); + } return new RowFormatReader( - streamPool, - path, - footer, - blockIndex, - rowType, - 
projectionMapping, - context.selection()); + in, path, footer, blockIndex, rowType, projectionMapping, context.selection()); } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java b/paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java new file mode 100644 index 000000000000..fbd2fb397a4c --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockDecompressor; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.utils.IOUtils; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.concurrent.CompletableFuture; + +/** + * Reads blocks sequentially in a background thread. + * + *

One block is prefetched ahead so that IO and decompression can overlap. + */ +class SequentialReadStrategy implements ReadStrategy { + + private final SeekableInputStream inputStream; + private final RowBlockIndex blockIndex; + private final ZstdBlockDecompressor decompressor; + private final int[] blocksToRead; + private CompletableFuture nextFuture; + private int nextSubmit; + private int nextConsume; + + SequentialReadStrategy( + SeekableInputStream inputStream, RowBlockIndex blockIndex, int[] blocksToRead) { + this.inputStream = inputStream; + this.blockIndex = blockIndex; + this.decompressor = new ZstdBlockDecompressor(); + this.blocksToRead = blocksToRead; + this.nextSubmit = 0; + this.nextConsume = 0; + submitNext(); + } + + @Override + public byte[] nextBlock() throws IOException { + if (nextConsume >= blocksToRead.length) { + return null; + } + + byte[] compressed; + if (nextFuture != null) { + compressed = BlockPrefetcher.awaitFuture(nextFuture); + nextFuture = null; + } else { + compressed = readBlock(blocksToRead[nextConsume]); + } + nextConsume++; + submitNext(); + + int blockIdx = blocksToRead[nextConsume - 1]; + int uncompressedSize = (int) blockIndex.blockUncompressedSize(blockIdx); + byte[] decompressed = new byte[uncompressedSize]; + decompressor.decompress(compressed, 0, compressed.length, decompressed, 0); + return decompressed; + } + + @Override + public int currentBlockIdx() { + if (nextConsume <= 0 || nextConsume > blocksToRead.length) { + return -1; + } + return blocksToRead[nextConsume - 1]; + } + + @Override + public void close() { + if (nextFuture != null) { + nextFuture.cancel(true); + nextFuture = null; + } + } + + private void submitNext() { + if (nextSubmit < blocksToRead.length) { + int blockIdx = blocksToRead[nextSubmit++]; + nextFuture = + CompletableFuture.supplyAsync( + () -> { + try { + return readBlock(blockIdx); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }, + IO_POOL); + } + } + + private byte[] 
readBlock(int blockIdx) throws IOException { + int compressedSize = (int) blockIndex.blockCompressedSize(blockIdx); + byte[] buf = new byte[compressedSize]; + inputStream.seek(blockIndex.blockOffset(blockIdx)); + IOUtils.readFully(inputStream, buf); + return buf; + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java b/paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java new file mode 100644 index 000000000000..a1305bdf5adf --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockDecompressor; +import org.apache.paimon.fs.VectoredReadable; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; + +/** + * Reads blocks using positional reads ({@code preadFully}) with IO coalescing and concurrent + * prefetch. + * + *
<p>
Adjacent blocks are merged into larger IO requests to reduce QPS. Multiple merged ranges are + * prefetched concurrently since {@code preadFully} is thread-safe. + */ +class VectoredReadStrategy implements ReadStrategy { + + static final int HOLE_SIZE_LIMIT = 256 * 1024; + static final int RANGE_SIZE_LIMIT = 2 * 1024 * 1024; + private static final int PREFETCH_COUNT = 4; + + private final VectoredReadable readable; + private final RowBlockIndex blockIndex; + private final ZstdBlockDecompressor decompressor; + private final List<MergedRange> mergedRanges; + private final Queue<CompletableFuture<byte[]>> prefetchQueue; + private int nextRangeToSubmit; + private int currentRangeIdx; + private byte[] currentRangeData; + private int currentBlockInRange; + + VectoredReadStrategy(VectoredReadable readable, RowBlockIndex blockIndex, int[] blocksToRead) { + this.readable = readable; + this.blockIndex = blockIndex; + this.decompressor = new ZstdBlockDecompressor(); + this.mergedRanges = coalesceRanges(blocksToRead, blockIndex); + this.prefetchQueue = new ArrayDeque<>(PREFETCH_COUNT); + this.nextRangeToSubmit = 0; + this.currentRangeIdx = -1; + this.currentBlockInRange = 0; + fillPrefetch(); + } + + /** + * Returns the next decompressed block in {@code blocksToRead} order, or {@code null} once all + * blocks have been returned; further calls keep returning {@code null}. + */ + @Override + public byte[] nextBlock() throws IOException { + // Exhaustion is checked before touching mergedRanges: currentRangeIdx equals + // mergedRanges.size() after the last range, and get(size) would throw. + if (currentRangeIdx >= mergedRanges.size()) { + return null; + } + if (currentRangeIdx < 0 + || currentBlockInRange >= mergedRanges.get(currentRangeIdx).blockIndices.length) { + advanceToNextRange(); + if (currentRangeIdx >= mergedRanges.size()) { + return null; + } + } + + MergedRange range = mergedRanges.get(currentRangeIdx); + int blockIdx = range.blockIndices[currentBlockInRange]; + // Block bytes are addressed relative to the start of the coalesced range buffer. + int offsetInBuf = (int) (blockIndex.blockOffset(blockIdx) - range.offset); + int compressedSize = (int) blockIndex.blockCompressedSize(blockIdx); + int uncompressedSize = (int) blockIndex.blockUncompressedSize(blockIdx); + + byte[] decompressed = new byte[uncompressedSize]; + decompressor.decompress(currentRangeData, offsetInBuf, compressedSize, decompressed, 0); + + currentBlockInRange++; + return decompressed; + } + + @Override + public int 
currentBlockIdx() { + // Returns -1 unless a block has already been handed out from a valid range; this also + // covers a nextBlock() call that failed right after switching to a new range, where + // currentBlockInRange is still 0 and indexing with -1 would throw. + if (currentRangeIdx < 0 + || currentRangeIdx >= mergedRanges.size() + || currentBlockInRange <= 0) { + return -1; + } + MergedRange range = mergedRanges.get(currentRangeIdx); + return range.blockIndices[currentBlockInRange - 1]; + } + + @Override + public void close() { + for (CompletableFuture<byte[]> f : prefetchQueue) { + f.cancel(true); + } + prefetchQueue.clear(); + } + + private void advanceToNextRange() throws IOException { + currentRangeIdx++; + currentBlockInRange = 0; + + if (currentRangeIdx >= mergedRanges.size()) { + currentRangeData = null; + return; + } + + CompletableFuture<byte[]> future = prefetchQueue.poll(); + if (future != null) { + currentRangeData = BlockPrefetcher.awaitFuture(future); + } else { + currentRangeData = readRange(mergedRanges.get(currentRangeIdx)); + } + fillPrefetch(); + } + + private void fillPrefetch() { + while (prefetchQueue.size() < PREFETCH_COUNT && nextRangeToSubmit < mergedRanges.size()) { + int rangeIdx = nextRangeToSubmit++; + MergedRange range = mergedRanges.get(rangeIdx); + prefetchQueue.add( + CompletableFuture.supplyAsync( + () -> { + try { + return readRange(range); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }, + IO_POOL)); + } + } + + private byte[] readRange(MergedRange range) throws IOException { + byte[] buf = new byte[range.length]; + readable.preadFully(range.offset, buf, 0, range.length); + return buf; + } + + // ======================== Range Coalescing ======================== + + static List<MergedRange> coalesceRanges(int[] blocksToRead, RowBlockIndex blockIndex) { + List<MergedRange> result = new ArrayList<>(); + if (blocksToRead.length == 0) { + return result; + } + + int rangeStart = 0; + long rangeOffset = blockIndex.blockOffset(blocksToRead[0]); + long rangeEnd = rangeOffset + blockIndex.blockCompressedSize(blocksToRead[0]); + + for (int i = 1; i < blocksToRead.length; i++) { + int blockIdx = blocksToRead[i]; + long blockOffset = blockIndex.blockOffset(blockIdx); + long blockEnd = blockOffset + 
blockIndex.blockCompressedSize(blockIdx); + long gap = blockOffset - rangeEnd; + long newLength = blockEnd - rangeOffset; + + if (gap < HOLE_SIZE_LIMIT && newLength <= RANGE_SIZE_LIMIT) { + rangeEnd = blockEnd; + } else { + result.add(buildRange(blocksToRead, rangeStart, i, rangeOffset, rangeEnd)); + rangeStart = i; + rangeOffset = blockOffset; + rangeEnd = blockEnd; + } + } + result.add( + buildRange(blocksToRead, rangeStart, blocksToRead.length, rangeOffset, rangeEnd)); + return result; + } + + private static MergedRange buildRange( + int[] blocksToRead, int from, int to, long rangeOffset, long rangeEnd) { + int[] indices = new int[to - from]; + System.arraycopy(blocksToRead, from, indices, 0, indices.length); + return new MergedRange(rangeOffset, (int) (rangeEnd - rangeOffset), indices); + } + + // ======================== MergedRange ======================== + + static class MergedRange { + final long offset; + final int length; + final int[] blockIndices; + + MergedRange(long offset, int length, int[] blockIndices) { + this.offset = offset; + this.length = length; + this.blockIndices = blockIndices; + } + } +} diff --git a/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java b/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java index 007187c5b54c..080ce6759e68 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java @@ -27,7 +27,6 @@ import org.apache.paimon.format.FormatWriter; import org.apache.paimon.fs.Path; import org.apache.paimon.fs.PositionOutputStream; -import org.apache.paimon.fs.SeekableInputStream; import org.apache.paimon.fs.local.LocalFileIO; import org.apache.paimon.options.Options; import org.apache.paimon.reader.FileRecordReader; @@ -43,10 +42,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import 
java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.atomic.AtomicInteger; import static org.assertj.core.api.Assertions.assertThat; @@ -63,8 +58,8 @@ public void testCoalesceAdjacentBlocks() { RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); int[] blocksToRead = {0, 1, 2, 3, 4}; - List ranges = - BlockPrefetcher.coalesceRanges(blocksToRead, index); + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); assertThat(ranges).hasSize(1); assertThat(ranges.get(0).offset).isEqualTo(0); @@ -80,8 +75,8 @@ public void testCoalesceWithLargeGap() { RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); int[] blocksToRead = {0, 2}; - List ranges = - BlockPrefetcher.coalesceRanges(blocksToRead, index); + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); // gap between block 0 end (100) and block 2 start (200) is 100, within HOLE_SIZE_LIMIT assertThat(ranges).hasSize(1); @@ -118,8 +113,8 @@ public void testCoalesceSplitsByHoleSize() { // blocksToRead = [0, 2]: gap between block 0 end and block 2 start // block 0 end = 300*1024, block 2 start = 600*1024, gap = 300*1024 > 256KB int[] blocksToRead = {0, 2}; - List ranges = - BlockPrefetcher.coalesceRanges(blocksToRead, index); + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); assertThat(ranges).hasSize(2); assertThat(ranges.get(0).blockIndices).containsExactly(0); @@ -146,12 +141,12 @@ public void testCoalesceSplitsByRangeSize() { blocksToRead[i] = i; } - List ranges = - BlockPrefetcher.coalesceRanges(blocksToRead, index); + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); // 2MB / 100KB = 20 blocks per range, so 30 blocks should split into 2 ranges assertThat(ranges.size()).isGreaterThan(1); - for (BlockPrefetcher.MergedRange range : ranges) { + for 
(VectoredReadStrategy.MergedRange range : ranges) { assertThat(range.length).isLessThanOrEqualTo(2 * 1024 * 1024); } } @@ -163,8 +158,8 @@ public void testCoalesceEmptyInput() { long[] rowStarts = {0}; RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); - List ranges = - BlockPrefetcher.coalesceRanges(new int[0], index); + List ranges = + VectoredReadStrategy.coalesceRanges(new int[0], index); assertThat(ranges).isEmpty(); } @@ -177,8 +172,8 @@ public void testCoalesceSingleBlock() { RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); int[] blocksToRead = {0}; - List ranges = - BlockPrefetcher.coalesceRanges(blocksToRead, index); + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); assertThat(ranges).hasSize(1); assertThat(ranges.get(0).offset).isEqualTo(0); @@ -194,8 +189,8 @@ public void testCoalesceNonContiguousBlocks() { RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); int[] blocksToRead = {0, 2, 4}; - List ranges = - BlockPrefetcher.coalesceRanges(blocksToRead, index); + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); // All gaps are small (100 bytes), so everything merges into one range assertThat(ranges).hasSize(1); @@ -366,97 +361,6 @@ public void testPrefetcherEmptyBlocksToRead() throws IOException { assertThat(result).isEmpty(); } - @Test - public void testInputStreamPoolLazyCreation() throws IOException { - RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); - - Path path = new Path(tempDir.toUri().toString(), "pool_lazy.row"); - FileFormat format = FileFormat.fromIdentifier("row", new Options()); - - List rows = new ArrayList<>(); - for (int i = 0; i < 10; i++) { - rows.add(GenericRow.of(i)); - } - writeRows(format, rowType, path, rows); - - LocalFileIO fileIO = new LocalFileIO(); - SeekableInputStream initialStream = fileIO.newInputStream(path); - - // Pool with max 
4, but only initialStream is opened eagerly - InputStreamPool pool = new InputStreamPool(fileIO, path, 4, initialStream); - - // First borrow returns the initial stream (no new stream opened) - SeekableInputStream s1 = pool.borrow(); - assertThat(s1).isSameAs(initialStream); - - // Second borrow creates a new stream lazily - SeekableInputStream s2 = pool.borrow(); - assertThat(s2).isNotSameAs(initialStream); - - // Third borrow creates another - SeekableInputStream s3 = pool.borrow(); - assertThat(s3).isNotSameAs(s1); - assertThat(s3).isNotSameAs(s2); - - // Return all - pool.returnStream(s1); - pool.returnStream(s2); - pool.returnStream(s3); - - // Re-borrow should reuse returned streams - SeekableInputStream s4 = pool.borrow(); - assertThat(s4 == s1 || s4 == s2 || s4 == s3).isTrue(); - - pool.returnStream(s4); - pool.close(); - } - - @Test - public void testInputStreamPoolConcurrentAccess() throws Exception { - RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); - - Path path = new Path(tempDir.toUri().toString(), "pool_concurrent.row"); - FileFormat format = FileFormat.fromIdentifier("row", new Options()); - - List rows = new ArrayList<>(); - for (int i = 0; i < 100; i++) { - rows.add(GenericRow.of(i)); - } - writeRows(format, rowType, path, rows); - - LocalFileIO fileIO = new LocalFileIO(); - SeekableInputStream initialStream = fileIO.newInputStream(path); - InputStreamPool pool = new InputStreamPool(fileIO, path, 4, initialStream); - - int numThreads = 8; - int opsPerThread = 50; - ExecutorService executor = Executors.newFixedThreadPool(numThreads); - CountDownLatch latch = new CountDownLatch(numThreads); - AtomicInteger errors = new AtomicInteger(0); - - for (int t = 0; t < numThreads; t++) { - executor.submit( - () -> { - try { - for (int i = 0; i < opsPerThread; i++) { - SeekableInputStream in = pool.borrow(); - in.seek(0); - pool.returnStream(in); - } - } catch (Exception e) { - errors.incrementAndGet(); - } finally { - 
latch.countDown(); - } - }); - } - - latch.await(); - executor.shutdown(); - assertThat(errors.get()).isEqualTo(0); - pool.close(); - } - @Test public void testPrefetchSlidingWindow() throws IOException { // Many ranges exceeding PREFETCH_COUNT to verify sliding window works From 4b47e794b5d7592895ecd8eb4f631e995db40acc Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 22 May 2026 15:41:34 +0800 Subject: [PATCH 3/6] fix --- .../paimon/utils/NestedProjectedRow.java | 222 +++++++++++ .../paimon/utils/NestedProjectedRowTest.java | 348 ++++++++++++++++++ .../paimon/format/row/RowBlockReader.java | 96 +++++ .../paimon/format/row/RowBlockWriter.java | 32 ++ .../paimon/format/row/RowFileFormat.java | 37 +- .../format/row/RowFileRecordIterator.java | 17 +- .../paimon/format/row/RowFormatReader.java | 12 +- .../format/row/RowFormatReaderFactory.java | 9 +- .../format/row/RowFormatReadWriteTest.java | 325 ++++++++++++++++ 9 files changed, 1047 insertions(+), 51 deletions(-) create mode 100644 paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java create mode 100644 paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java new file mode 100644 index 000000000000..7a456c2761d1 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.utils; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalMap; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalVector; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.data.variant.Variant; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.RowKind; +import org.apache.paimon.types.RowType; + +import javax.annotation.Nullable; + +import java.util.List; + +/** + * A projected view of {@link InternalRow} that supports nested ROW field pruning. + * + *
<p>
Unlike {@link ProjectedRow} which only handles top-level projection, this class recursively + * projects nested ROW fields. It maps each projected field to the corresponding position in the + * data schema by name, and for ROW-typed fields, recursively applies sub-projections. + */ +public class NestedProjectedRow implements InternalRow { + + private final int[] indexMapping; + @Nullable private final NestedProjectedRow[] nestedProjections; + @Nullable private final int[] nestedArity; + private InternalRow row; + + private NestedProjectedRow( + int[] indexMapping, + @Nullable NestedProjectedRow[] nestedProjections, + @Nullable int[] nestedArity) { + this.indexMapping = indexMapping; + this.nestedProjections = nestedProjections; + this.nestedArity = nestedArity; + } + + public NestedProjectedRow replaceRow(InternalRow row) { + this.row = row; + return this; + } + + /** + * Creates a {@link NestedProjectedRow} from the data schema and projected schema. Returns null + * if the two schemas are identical (no projection needed). 
+ */ + @Nullable + public static NestedProjectedRow create(RowType dataSchema, RowType projectedSchema) { + if (dataSchema.equals(projectedSchema)) { + return null; + } + + List<DataField> dataFields = dataSchema.getFields(); + List<DataField> projectedFields = projectedSchema.getFields(); + List<String> dataFieldNames = dataSchema.getFieldNames(); + + int projectedSize = projectedFields.size(); + int[] indexMapping = new int[projectedSize]; + // Allocated lazily; both stay null when no nested ROW field needs its own projection. + NestedProjectedRow[] nestedProjections = null; + int[] nestedArity = null; + + for (int i = 0; i < projectedSize; i++) { + DataField projected = projectedFields.get(i); + int dataIdx = dataFieldNames.indexOf(projected.name()); + if (dataIdx < 0) { + // Fail fast: a -1 mapping would only surface later as an obscure index error. + throw new IllegalArgumentException( + "Projected field '" + + projected.name() + + "' does not exist in data schema: " + + dataSchema); + } + indexMapping[i] = dataIdx; + + if (projected.type().getTypeRoot() == DataTypeRoot.ROW) { + DataField dataField = dataFields.get(dataIdx); + if (dataField.type().getTypeRoot() != DataTypeRoot.ROW) { + throw new IllegalArgumentException( + "Projected field '" + + projected.name() + + "' is a ROW but the data field has type: " + + dataField.type()); + } + RowType dataNestedType = (RowType) dataField.type(); + RowType projectedNestedType = (RowType) projected.type(); + // A null sub-projection means the nested schemas are identical. + NestedProjectedRow sub = create(dataNestedType, projectedNestedType); + if (sub != null) { + if (nestedProjections == null) { + nestedProjections = new NestedProjectedRow[projectedSize]; + nestedArity = new int[projectedSize]; + } + nestedProjections[i] = sub; + // Arity of the stored nested row, needed to read it before projecting. + nestedArity[i] = dataNestedType.getFieldCount(); + } + } + } + + return new NestedProjectedRow(indexMapping, nestedProjections, nestedArity); + } + + @Override + public int getFieldCount() { + return indexMapping.length; + } + + @Override + public RowKind getRowKind() { + return row.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + row.setRowKind(kind); + } + + @Override + public boolean isNullAt(int pos) { + return row.isNullAt(indexMapping[pos]); + } + + @Override + public boolean getBoolean(int pos) { + return row.getBoolean(indexMapping[pos]); + } + + @Override + public byte getByte(int pos) { + return row.getByte(indexMapping[pos]); + } + + @Override + public short getShort(int pos) { + return 
row.getShort(indexMapping[pos]); + } + + @Override + public int getInt(int pos) { + return row.getInt(indexMapping[pos]); + } + + @Override + public long getLong(int pos) { + return row.getLong(indexMapping[pos]); + } + + @Override + public float getFloat(int pos) { + return row.getFloat(indexMapping[pos]); + } + + @Override + public double getDouble(int pos) { + return row.getDouble(indexMapping[pos]); + } + + @Override + public BinaryString getString(int pos) { + return row.getString(indexMapping[pos]); + } + + @Override + public Decimal getDecimal(int pos, int precision, int scale) { + return row.getDecimal(indexMapping[pos], precision, scale); + } + + @Override + public Timestamp getTimestamp(int pos, int precision) { + return row.getTimestamp(indexMapping[pos], precision); + } + + @Override + public byte[] getBinary(int pos) { + return row.getBinary(indexMapping[pos]); + } + + @Override + public Variant getVariant(int pos) { + return row.getVariant(indexMapping[pos]); + } + + @Override + public Blob getBlob(int pos) { + return row.getBlob(indexMapping[pos]); + } + + @Override + public InternalArray getArray(int pos) { + return row.getArray(indexMapping[pos]); + } + + @Override + public InternalVector getVector(int pos) { + return row.getVector(indexMapping[pos]); + } + + @Override + public InternalMap getMap(int pos) { + return row.getMap(indexMapping[pos]); + } + + /** + * Returns the nested row at {@code pos}, pruned by the nested projection when one exists. + * + * <p>The pruned view is a single instance reused across calls for the same position, so a + * previously returned view is re-pointed by the next call. + */ + @Override + public InternalRow getRow(int pos, int numFields) { + if (nestedProjections != null && nestedProjections[pos] != null) { + // Read with the arity of the stored row, not the projected arity passed by the caller. + InternalRow inner = row.getRow(indexMapping[pos], nestedArity[pos]); + if (inner == null) { + // Pass a null nested row through, as the non-projected branch below does. + return null; + } + return nestedProjections[pos].replaceRow(inner); + } + return row.getRow(indexMapping[pos], numFields); + } +} diff --git a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java new file mode 100644 index 000000000000..39ef485eb0f2 --- /dev/null +++ 
b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java @@ -0,0 +1,348 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.utils; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Test for {@link NestedProjectedRow}. 
*/ +public class NestedProjectedRowTest { + + @Test + void testReturnNullWhenSchemasAreEqual() { + RowType schema = RowType.of(DataTypes.INT(), DataTypes.STRING()); + assertThat(NestedProjectedRow.create(schema, schema)).isNull(); + } + + @Test + void testTopLevelProjection() { + // data: ROW + RowType dataSchema = + RowType.builder() + .field("a", DataTypes.INT()) + .field("b", DataTypes.STRING()) + .field("c", DataTypes.BIGINT()) + .build(); + + // projected: ROW + RowType projectedSchema = + RowType.builder() + .field("c", DataTypes.BIGINT()) + .field("a", DataTypes.INT()) + .build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = GenericRow.of(42, BinaryString.fromString("hello"), 100L); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(2); + assertThat(projected.getLong(0)).isEqualTo(100L); + assertThat(projected.getInt(1)).isEqualTo(42); + } + + @Test + void testTopLevelFieldSubset() { + // data: ROW + RowType dataSchema = + RowType.builder() + .field("a", DataTypes.INT()) + .field("b", DataTypes.STRING()) + .field("c", DataTypes.DOUBLE()) + .build(); + + // projected: ROW + RowType projectedSchema = RowType.builder().field("b", DataTypes.STRING()).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = GenericRow.of(1, BinaryString.fromString("world"), 3.14); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + assertThat(projected.getString(0)).isEqualTo(BinaryString.fromString("world")); + } + + @Test + void testNestedRowProjection() { + // data: ROW> + RowType nestedType = + RowType.builder() + .field("x", DataTypes.INT()) + .field("y", DataTypes.INT()) + .field("z", DataTypes.INT()) + .build(); + RowType dataSchema = + 
RowType.builder().field("id", DataTypes.INT()).field("r", nestedType).build(); + + // projected: ROW> + RowType projectedNestedType = RowType.builder().field("z", DataTypes.INT()).build(); + RowType projectedSchema = RowType.builder().field("r", projectedNestedType).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow innerRow = GenericRow.of(10, 20, 30); + GenericRow row = GenericRow.of(1, innerRow); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + InternalRow projectedInner = projected.getRow(0, 1); + assertThat(projectedInner.getFieldCount()).isEqualTo(1); + assertThat(projectedInner.getInt(0)).isEqualTo(30); + } + + @Test + void testNestedRowProjectionMultipleFields() { + // data: ROW> + RowType nestedType = + RowType.builder() + .field("a", DataTypes.INT()) + .field("b", DataTypes.INT()) + .field("c", DataTypes.INT()) + .build(); + RowType dataSchema = + RowType.builder().field("id", DataTypes.INT()).field("r", nestedType).build(); + + // projected: ROW> + RowType projectedNestedType = + RowType.builder().field("c", DataTypes.INT()).field("a", DataTypes.INT()).build(); + RowType projectedSchema = + RowType.builder() + .field("id", DataTypes.INT()) + .field("r", projectedNestedType) + .build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow innerRow = GenericRow.of(10, 20, 30); + GenericRow row = GenericRow.of(1, innerRow); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(2); + assertThat(projected.getInt(0)).isEqualTo(1); + InternalRow projectedInner = projected.getRow(1, 2); + assertThat(projectedInner.getInt(0)).isEqualTo(30); + assertThat(projectedInner.getInt(1)).isEqualTo(10); + } + + @Test + void testDeeplyNestedProjection() { + // data: 
ROW>> + RowType level2 = + RowType.builder() + .field("x", DataTypes.INT()) + .field("y", DataTypes.INT()) + .field("z", DataTypes.INT()) + .build(); + RowType level1 = RowType.builder().field("b", level2).build(); + RowType dataSchema = RowType.builder().field("a", level1).build(); + + // projected: ROW>> + RowType projLevel2 = RowType.builder().field("y", DataTypes.INT()).build(); + RowType projLevel1 = RowType.builder().field("b", projLevel2).build(); + RowType projectedSchema = RowType.builder().field("a", projLevel1).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow l2Row = GenericRow.of(10, 20, 30); + GenericRow l1Row = GenericRow.of(l2Row); + GenericRow row = GenericRow.of(l1Row); + InternalRow projected = projection.replaceRow(row); + + InternalRow projL1 = projected.getRow(0, 1); + InternalRow projL2 = projL1.getRow(0, 1); + assertThat(projL2.getInt(0)).isEqualTo(20); + } + + @Test + void testNestedRowWithoutProjection() { + // data: ROW> + RowType nestedType = + RowType.builder().field("x", DataTypes.INT()).field("y", DataTypes.INT()).build(); + RowType dataSchema = + RowType.builder().field("id", DataTypes.INT()).field("r", nestedType).build(); + + // projected: ROW> (nested is unchanged, only top-level pruned) + RowType projectedSchema = RowType.builder().field("r", nestedType).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow innerRow = GenericRow.of(10, 20); + GenericRow row = GenericRow.of(1, innerRow); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + InternalRow projectedInner = projected.getRow(0, 2); + assertThat(projectedInner.getInt(0)).isEqualTo(10); + assertThat(projectedInner.getInt(1)).isEqualTo(20); + } + + @Test + void testNullHandling() { + // data: ROW> + RowType 
nestedType = + RowType.builder().field("x", DataTypes.INT()).field("y", DataTypes.INT()).build(); + RowType dataSchema = + RowType.builder().field("a", DataTypes.INT()).field("r", nestedType).build(); + + // projected: ROW> + RowType projectedNestedType = RowType.builder().field("y", DataTypes.INT()).build(); + RowType projectedSchema = RowType.builder().field("r", projectedNestedType).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // null nested row + GenericRow row = GenericRow.of(1, null); + InternalRow projected = projection.replaceRow(row); + assertThat(projected.isNullAt(0)).isTrue(); + + // non-null nested row with null field + GenericRow innerRow = GenericRow.of(10, null); + GenericRow row2 = GenericRow.of(1, innerRow); + InternalRow projected2 = projection.replaceRow(row2); + assertThat(projected2.isNullAt(0)).isFalse(); + InternalRow projectedInner = projected2.getRow(0, 1); + assertThat(projectedInner.isNullAt(0)).isTrue(); + } + + @Test + void testReplaceRowIsReusable() { + RowType dataSchema = + RowType.builder().field("a", DataTypes.INT()).field("b", DataTypes.INT()).build(); + RowType projectedSchema = RowType.builder().field("b", DataTypes.INT()).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row1 = GenericRow.of(1, 10); + assertThat(projection.replaceRow(row1).getInt(0)).isEqualTo(10); + + GenericRow row2 = GenericRow.of(2, 20); + assertThat(projection.replaceRow(row2).getInt(0)).isEqualTo(20); + } + + @Test + void testRowKindPreserved() { + RowType dataSchema = + RowType.builder().field("a", DataTypes.INT()).field("b", DataTypes.INT()).build(); + RowType projectedSchema = RowType.builder().field("b", DataTypes.INT()).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + 
assertThat(projection).isNotNull(); + + GenericRow row = GenericRow.of(1, 10); + row.setRowKind(org.apache.paimon.types.RowKind.DELETE); + InternalRow projected = projection.replaceRow(row); + assertThat(projected.getRowKind()).isEqualTo(org.apache.paimon.types.RowKind.DELETE); + } + + @Test + void testMultipleNestedRows() { + // data: ROW, r2 ROW> + RowType nested1 = + RowType.builder().field("a", DataTypes.INT()).field("b", DataTypes.INT()).build(); + RowType nested2 = + RowType.builder() + .field("x", DataTypes.STRING()) + .field("y", DataTypes.STRING()) + .build(); + RowType dataSchema = RowType.builder().field("r1", nested1).field("r2", nested2).build(); + + // projected: ROW, r2 ROW> + RowType projNested1 = RowType.builder().field("b", DataTypes.INT()).build(); + RowType projNested2 = RowType.builder().field("y", DataTypes.STRING()).build(); + RowType projectedSchema = + RowType.builder().field("r1", projNested1).field("r2", projNested2).build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow inner1 = GenericRow.of(10, 20); + GenericRow inner2 = + GenericRow.of(BinaryString.fromString("hello"), BinaryString.fromString("world")); + GenericRow row = GenericRow.of(inner1, inner2); + InternalRow projected = projection.replaceRow(row); + + InternalRow projInner1 = projected.getRow(0, 1); + assertThat(projInner1.getInt(0)).isEqualTo(20); + + InternalRow projInner2 = projected.getRow(1, 1); + assertThat(projInner2.getString(0)).isEqualTo(BinaryString.fromString("world")); + } + + @Test + void testAllDataTypes() { + RowType dataSchema = + RowType.builder() + .field("f_bool", DataTypes.BOOLEAN()) + .field("f_byte", DataTypes.TINYINT()) + .field("f_short", DataTypes.SMALLINT()) + .field("f_int", DataTypes.INT()) + .field("f_long", DataTypes.BIGINT()) + .field("f_float", DataTypes.FLOAT()) + .field("f_double", DataTypes.DOUBLE()) + .field("f_string", DataTypes.STRING()) + 
.field("f_binary", DataTypes.BYTES()) + .build(); + + // project a subset in different order + RowType projectedSchema = + RowType.builder() + .field("f_double", DataTypes.DOUBLE()) + .field("f_bool", DataTypes.BOOLEAN()) + .field("f_string", DataTypes.STRING()) + .field("f_byte", DataTypes.TINYINT()) + .build(); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = + GenericRow.of( + true, + (byte) 7, + (short) 16, + 42, + 100L, + 1.5f, + 3.14, + BinaryString.fromString("test"), + new byte[] {1, 2, 3}); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getDouble(0)).isEqualTo(3.14); + assertThat(projected.getBoolean(1)).isTrue(); + assertThat(projected.getString(2)).isEqualTo(BinaryString.fromString("test")); + assertThat(projected.getByte(3)).isEqualTo((byte) 7); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java index 2a87d9ba1e00..22027fc3cf50 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java @@ -18,6 +18,8 @@ package org.apache.paimon.format.row; +import org.apache.paimon.data.BinaryVector; +import org.apache.paimon.data.Blob; import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericMap; import org.apache.paimon.data.GenericRow; @@ -31,6 +33,7 @@ import org.apache.paimon.types.MapType; import org.apache.paimon.types.MultisetType; import org.apache.paimon.types.RowType; +import org.apache.paimon.types.VectorType; import java.util.HashMap; import java.util.Map; @@ -120,6 +123,13 @@ private FieldReader createFieldReader(DataType type) { return new TimestampFieldReader(getPrecision(type)); case VARIANT: return new VariantFieldReader(); + case BLOB: + return new 
BlobFieldReader(); + case VECTOR: + { + VectorType vectorType = (VectorType) type; + return new VectorFieldReader(vectorType.getElementType()); + } case ARRAY: { DataType elementType = ((ArrayType) type).getElementType(); @@ -289,6 +299,92 @@ public Object read() { } } + private class BlobFieldReader implements FieldReader { + @Override + public Object read() { + return Blob.fromData(buf.readBytes()); + } + } + + private class VectorFieldReader implements FieldReader { + private final DataType elementType; + + VectorFieldReader(DataType elementType) { + this.elementType = elementType; + } + + @Override + public Object read() { + int size = buf.readVarInt(); + InternalArray array = readVectorElements(size); + return BinaryVector.fromInternalArray(array, elementType); + } + + private InternalArray readVectorElements(int size) { + switch (elementType.getTypeRoot()) { + case BOOLEAN: + { + boolean[] arr = new boolean[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readBoolean(); + } + return new GenericArray(arr); + } + case TINYINT: + { + byte[] arr = new byte[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readByte(); + } + return new GenericArray(arr); + } + case SMALLINT: + { + short[] arr = new short[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readShort(); + } + return new GenericArray(arr); + } + case INTEGER: + { + int[] arr = new int[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readInt(); + } + return new GenericArray(arr); + } + case BIGINT: + { + long[] arr = new long[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readLong(); + } + return new GenericArray(arr); + } + case FLOAT: + { + float[] arr = new float[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readFloat(); + } + return new GenericArray(arr); + } + case DOUBLE: + { + double[] arr = new double[size]; + for (int i = 0; i < size; i++) { + arr[i] = buf.readDouble(); + } + return new GenericArray(arr); + } + default: + throw new 
UnsupportedOperationException( + "Unsupported vector element type: " + elementType); + } + } + } + private class ArrayFieldReader implements FieldReader { private final FieldReader elementReader; diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java index a88c1f05994d..4a7f7be0fed5 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java @@ -22,6 +22,7 @@ import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalVector; import org.apache.paimon.data.variant.Variant; import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.DataType; @@ -29,6 +30,7 @@ import org.apache.paimon.types.MapType; import org.apache.paimon.types.MultisetType; import org.apache.paimon.types.RowType; +import org.apache.paimon.types.VectorType; import org.apache.paimon.utils.IntArrayList; import static org.apache.paimon.types.DataTypeChecks.getPrecision; @@ -145,6 +147,11 @@ private FieldWriter createFieldWriter(DataType type) { return new TimestampFieldWriter(getPrecision(type)); case VARIANT: return new VariantFieldWriter(); + case BLOB: + return new BlobFieldWriter(); + case VECTOR: + return new VectorFieldWriter( + createFieldWriter(((VectorType) type).getElementType())); case ARRAY: return new ArrayFieldWriter(createFieldWriter(((ArrayType) type).getElementType())); case MULTISET: @@ -300,6 +307,31 @@ public void write(DataGetters data, int i) { } } + private class BlobFieldWriter implements FieldWriter { + @Override + public void write(DataGetters data, int i) { + buf.writeBytes(data.getBlob(i).toData()); + } + } + + private class VectorFieldWriter implements FieldWriter { + private final FieldWriter elemWriter; + + 
VectorFieldWriter(FieldWriter elemWriter) { + this.elemWriter = elemWriter; + } + + @Override + public void write(DataGetters data, int i) { + InternalVector vector = data.getVector(i); + int size = vector.size(); + buf.writeVarInt(size); + for (int j = 0; j < size; j++) { + elemWriter.write(vector, j); + } + } + } + private class ArrayFieldWriter implements FieldWriter { private final FieldWriter elemWriter; diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java index 38518804c0da..8683dd4fde28 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java @@ -24,12 +24,11 @@ import org.apache.paimon.format.FormatWriterFactory; import org.apache.paimon.options.MemorySize; import org.apache.paimon.predicate.Predicate; -import org.apache.paimon.types.DataField; import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.NestedProjectedRow; import javax.annotation.Nullable; -import java.util.ArrayList; import java.util.List; /** Row-store file format with block-level ZSTD compression and O(1) row-number lookup. 
*/ @@ -52,8 +51,9 @@ public FormatReaderFactory createReaderFactory( RowType dataSchemaRowType, RowType projectedRowType, @Nullable List filters) { - int[] projectionMapping = computeProjectionMapping(dataSchemaRowType, projectedRowType); - return new RowFormatReaderFactory(dataSchemaRowType, projectionMapping); + NestedProjectedRow projection = + NestedProjectedRow.create(dataSchemaRowType, projectedRowType); + return new RowFormatReaderFactory(dataSchemaRowType, projection); } @Override @@ -63,33 +63,6 @@ public FormatWriterFactory createWriterFactory(RowType type) { @Override public void validateDataFields(RowType rowType) { - // RowCompactedSerializer supports all Paimon data types - } - - @Nullable - private static int[] computeProjectionMapping( - RowType dataSchemaRowType, RowType projectedRowType) { - if (dataSchemaRowType.equals(projectedRowType)) { - return null; - } - - List dataFields = dataSchemaRowType.getFields(); - List projectedFields = projectedRowType.getFields(); - - List mapping = new ArrayList<>(); - for (DataField projected : projectedFields) { - for (int i = 0; i < dataFields.size(); i++) { - if (dataFields.get(i).id() == projected.id()) { - mapping.add(i); - break; - } - } - } - - int[] result = new int[mapping.size()]; - for (int i = 0; i < mapping.size(); i++) { - result[i] = mapping.get(i); - } - return result; + // Row format supports all Paimon data types } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java index 650d61ac43b9..4746d728e751 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java @@ -21,7 +21,7 @@ import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.Path; import org.apache.paimon.reader.FileRecordIterator; -import 
org.apache.paimon.utils.ProjectedRow; +import org.apache.paimon.utils.NestedProjectedRow; import javax.annotation.Nullable; @@ -32,7 +32,7 @@ class RowFileRecordIterator implements FileRecordIterator { private final Path filePath; private final RowBlockReader blockReader; - @Nullable private final ProjectedRow projectedRow; + @Nullable private final NestedProjectedRow projection; private final long blockStartRow; @Nullable private final int[] selectedLocalIndices; @@ -42,20 +42,20 @@ class RowFileRecordIterator implements FileRecordIterator { RowFileRecordIterator( Path filePath, RowBlockReader blockReader, - @Nullable int[] projectionMapping, + @Nullable NestedProjectedRow projection, long blockStartRow) { - this(filePath, blockReader, projectionMapping, blockStartRow, null); + this(filePath, blockReader, projection, blockStartRow, null); } RowFileRecordIterator( Path filePath, RowBlockReader blockReader, - @Nullable int[] projectionMapping, + @Nullable NestedProjectedRow projection, long blockStartRow, @Nullable int[] selectedLocalIndices) { this.filePath = filePath; this.blockReader = blockReader; - this.projectedRow = projectionMapping != null ? 
ProjectedRow.from(projectionMapping) : null; + this.projection = projection; this.blockStartRow = blockStartRow; this.selectedLocalIndices = selectedLocalIndices; this.cursor = 0; @@ -106,9 +106,8 @@ private InternalRow nextSelected(int[] selectedLocalIndices) { } private InternalRow applyProjection(InternalRow row) { - if (projectedRow != null) { - projectedRow.replaceRow(row); - return projectedRow; + if (projection != null) { + return projection.replaceRow(row); } return row; } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java index 7d288754ba4e..7310b3dfab9b 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java @@ -24,6 +24,7 @@ import org.apache.paimon.reader.FileRecordIterator; import org.apache.paimon.reader.FileRecordReader; import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.NestedProjectedRow; import org.apache.paimon.utils.RoaringBitmap32; import javax.annotation.Nullable; @@ -37,7 +38,7 @@ public class RowFormatReader implements FileRecordReader { private final RowFileFooter footer; private final RowBlockIndex blockIndex; private final RowType rowType; - @Nullable private final int[] projectionMapping; + @Nullable private final NestedProjectedRow projection; @Nullable private final RoaringBitmap32 selection; private final BlockPrefetcher prefetcher; @@ -47,13 +48,13 @@ public class RowFormatReader implements FileRecordReader { RowFileFooter footer, RowBlockIndex blockIndex, RowType rowType, - @Nullable int[] projectionMapping, + @Nullable NestedProjectedRow projection, @Nullable RoaringBitmap32 selection) { this.filePath = filePath; this.footer = footer; this.blockIndex = blockIndex; this.rowType = rowType; - this.projectionMapping = projectionMapping; + this.projection = projection; 
this.selection = selection; this.prefetcher = new BlockPrefetcher( @@ -78,10 +79,9 @@ public FileRecordIterator readBatch() throws IOException { long blockEndRow = blockEndRow(blockIdx); int[] localIndices = computeSelectedLocalIndices(selection, blockStartRow, blockEndRow); return new RowFileRecordIterator( - filePath, blockReader, projectionMapping, blockStartRow, localIndices); + filePath, blockReader, projection, blockStartRow, localIndices); } else { - return new RowFileRecordIterator( - filePath, blockReader, projectionMapping, blockStartRow); + return new RowFileRecordIterator(filePath, blockReader, projection, blockStartRow); } } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java index c0bd11a5d72d..1cab7626bef5 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java @@ -26,6 +26,7 @@ import org.apache.paimon.reader.FileRecordReader; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; +import org.apache.paimon.utils.NestedProjectedRow; import javax.annotation.Nullable; @@ -37,11 +38,11 @@ public class RowFormatReaderFactory implements FormatReaderFactory { private static final int TAIL_PREFETCH_SIZE = 64 * 1024; private final RowType rowType; - @Nullable private final int[] projectionMapping; + @Nullable private final NestedProjectedRow projection; - public RowFormatReaderFactory(RowType rowType, @Nullable int[] projectionMapping) { + public RowFormatReaderFactory(RowType rowType, @Nullable NestedProjectedRow projection) { this.rowType = rowType; - this.projectionMapping = projectionMapping; + this.projection = projection; } @Override @@ -72,6 +73,6 @@ public FileRecordReader createReader(Context context) throws IOExce } return new RowFormatReader( - in, path, footer, 
blockIndex, rowType, projectionMapping, context.selection()); + in, path, footer, blockIndex, rowType, projection, context.selection()); } } diff --git a/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java index 0780f32921df..dc634457ebe3 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java @@ -863,6 +863,331 @@ public void testRandomizedRoundTrip() throws IOException { } } + @Test + public void testBlobAndVectorTypes() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "data", new org.apache.paimon.types.BlobType()), + new DataField( + 2, + "embedding", + new org.apache.paimon.types.VectorType( + 4, new FloatType())))); + + Path path = new Path(tempDir.toUri().toString(), "blob_vector.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + LocalFileIO fileIO = new LocalFileIO(); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + writer.addElement( + GenericRow.of( + 1, + org.apache.paimon.data.Blob.fromData(new byte[] {10, 20, 30}), + org.apache.paimon.data.BinaryVector.fromPrimitiveArray( + new float[] {1.0f, 2.0f, 3.0f, 4.0f}))); + writer.addElement(GenericRow.of(2, null, null)); + writer.close(); + + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + GenericRow copy = new GenericRow(3); + copy.setField(0, row.getInt(0)); + copy.setField(1, row.isNullAt(1) ? 
null : row.getBlob(1)); + copy.setField(2, row.isNullAt(2) ? null : row.getVector(2)); + result.add(copy); + }); + reader.close(); + + assertThat(result.size()).isEqualTo(2); + InternalRow row1 = result.get(0); + assertThat(row1.getInt(0)).isEqualTo(1); + assertThat(row1.getBlob(1).toData()).isEqualTo(new byte[] {10, 20, 30}); + org.apache.paimon.data.InternalVector vec = row1.getVector(2); + assertThat(vec.size()).isEqualTo(4); + assertThat(vec.getFloat(0)).isEqualTo(1.0f); + assertThat(vec.getFloat(1)).isEqualTo(2.0f); + assertThat(vec.getFloat(2)).isEqualTo(3.0f); + assertThat(vec.getFloat(3)).isEqualTo(4.0f); + + InternalRow row2 = result.get(1); + assertThat(row2.getInt(0)).isEqualTo(2); + assertThat(row2.isNullAt(1)).isTrue(); + assertThat(row2.isNullAt(2)).isTrue(); + } + + @Test + public void testNestedRowProjection() throws IOException { + RowType innerType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, GenericRow.of(10, 100))); + expected.add(GenericRow.of(2, GenericRow.of(20, 200))); + writeRows(format, dataSchema, path, expected); + + // Read with projected type: only top-level 'r', nested only 'b' + RowType projectedInner = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", projectedInner))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, 
fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + // projectedSchema is ROW> + // row.getRow(0, 1) should return the projected nested row with only 'b' + InternalRow nested = row.getRow(0, 1); + result.add(GenericRow.of(nested.getInt(0))); + }); + reader.close(); + + // nested.getInt(0) should be 'b' value (100, 200), not 'a' value (10, 20) + assertThat(result.size()).isEqualTo(2); + assertThat(result.get(0).getInt(0)).isEqualTo(100); + assertThat(result.get(1).getInt(0)).isEqualTo(200); + } + + @Test + public void testDeeplyNestedProjection() throws IOException { + // data: ROW>> + RowType level2 = + new RowType( + Arrays.asList( + new DataField(20, "a", new IntType()), + new DataField(21, "b", new IntType()), + new DataField(22, "c", new IntType()))); + RowType level1 = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "l2", level2))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "l1", level1))); + + Path path = new Path(tempDir.toUri().toString(), "deep_nested_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, GenericRow.of(10, GenericRow.of(100, 200, 300)))); + rows.add(GenericRow.of(2, GenericRow.of(20, GenericRow.of(400, 500, 600)))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW>> + RowType projLevel2 = new RowType(Arrays.asList(new DataField(22, "c", new IntType()))); + RowType projLevel1 = new RowType(Arrays.asList(new DataField(11, "l2", projLevel2))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "l1", projLevel1))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new 
FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + InternalRow l1 = row.getRow(0, 1); + InternalRow l2 = l1.getRow(0, 1); + results.add(l2.getInt(0)); + }); + reader.close(); + + assertThat(results).containsExactly(300, 600); + } + + @Test + public void testNestedProjectionWithNullRows() throws IOException { + // data: ROW> + RowType innerType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested_null_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, GenericRow.of(10, 100))); + rows.add(GenericRow.of(2, null)); + rows.add(GenericRow.of(3, GenericRow.of(30, 300))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW> + RowType projectedInner = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", projectedInner))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List nullFlags = new ArrayList<>(); + List values = new ArrayList<>(); + reader.forEachRemaining( + row -> { + boolean isNull = row.isNullAt(0); + nullFlags.add(new boolean[] {isNull}); + if (!isNull) { + values.add(row.getRow(0, 1).getInt(0)); + } + }); + reader.close(); + + assertThat(nullFlags.size()).isEqualTo(3); + assertThat(nullFlags.get(0)[0]).isFalse(); + assertThat(nullFlags.get(1)[0]).isTrue(); + 
assertThat(nullFlags.get(2)[0]).isFalse(); + assertThat(values).containsExactly(100, 300); + } + + @Test + public void testMultipleNestedRowsProjection() throws IOException { + // data: ROW, r2 ROW, id INT> + RowType nested1 = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType nested2 = + new RowType( + Arrays.asList( + new DataField(20, "x", new IntType()), + new DataField(21, "y", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", nested1), + new DataField(1, "r2", nested2), + new DataField(2, "id", new IntType()))); + + Path path = new Path(tempDir.toUri().toString(), "multi_nested_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(GenericRow.of(1, 2), GenericRow.of(3, 4), 100)); + rows.add(GenericRow.of(GenericRow.of(5, 6), GenericRow.of(7, 8), 200)); + writeRows(format, dataSchema, path, rows); + + // projected: ROW, r2 ROW> + RowType projNested1 = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projNested2 = new RowType(Arrays.asList(new DataField(20, "x", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", projNested1), + new DataField(1, "r2", projNested2))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + int b = row.getRow(0, 1).getInt(0); + int x = row.getRow(1, 1).getInt(0); + results.add(new int[] {b, x}); + }); + reader.close(); + + assertThat(results.size()).isEqualTo(2); + assertThat(results.get(0)).isEqualTo(new int[] {2, 3}); + 
assertThat(results.get(1)).isEqualTo(new int[] {6, 7}); + } + + @Test + public void testNestedProjectionWithFieldReordering() throws IOException { + // data: ROW> + RowType innerType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()), + new DataField(12, "c", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested_reorder_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, GenericRow.of(10, 20, 30))); + rows.add(GenericRow.of(2, GenericRow.of(40, 50, 60))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW> (reversed order, skip 'b') + RowType projectedInner = + new RowType( + Arrays.asList( + new DataField(12, "c", new IntType()), + new DataField(10, "a", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", projectedInner))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + InternalRow nested = row.getRow(0, 2); + results.add(new int[] {nested.getInt(0), nested.getInt(1)}); + }); + reader.close(); + + // c, a order + assertThat(results.size()).isEqualTo(2); + assertThat(results.get(0)).isEqualTo(new int[] {30, 10}); + assertThat(results.get(1)).isEqualTo(new int[] {60, 40}); + } + // ======================== Helpers ======================== private void writeRows(FileFormat format, RowType rowType, Path path, List rows) From 0ca020c63c0545613376c9cb6e90f04e6e306679 Mon Sep 17 00:00:00 2001 
From: JingsongLi Date: Fri, 22 May 2026 16:07:05 +0800 Subject: [PATCH 4/6] fix --- .../paimon/utils/NestedProjectedRow.java | 17 +- .../paimon/utils/NestedProjectedRowTest.java | 306 ++++++++++++------ 2 files changed, 217 insertions(+), 106 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java index 7a456c2761d1..7394fae13d4c 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java @@ -65,8 +65,11 @@ public NestedProjectedRow replaceRow(InternalRow row) { } /** - * Creates a {@link NestedProjectedRow} from the data schema and projected schema. Returns null - * if the two schemas are identical (no projection needed). + * Creates a {@link NestedProjectedRow} from the data schema and projected schema using field + * IDs to match fields. Returns null if the two schemas are identical (no projection needed). 
+ * + * @param dataSchema the full schema of the underlying row data + * @param projectedSchema the projected schema to read */ @Nullable public static NestedProjectedRow create(RowType dataSchema, RowType projectedSchema) { @@ -76,7 +79,6 @@ public static NestedProjectedRow create(RowType dataSchema, RowType projectedSch List dataFields = dataSchema.getFields(); List projectedFields = projectedSchema.getFields(); - List dataFieldNames = dataSchema.getFieldNames(); int projectedSize = projectedFields.size(); int[] indexMapping = new int[projectedSize]; @@ -86,7 +88,14 @@ public static NestedProjectedRow create(RowType dataSchema, RowType projectedSch for (int i = 0; i < projectedSize; i++) { DataField projected = projectedFields.get(i); - int dataIdx = dataFieldNames.indexOf(projected.name()); + int dataIdx = dataSchema.getFieldIndexByFieldId(projected.id()); + DataField dataField = dataFields.get(dataIdx); + Preconditions.checkArgument( + dataField.name().equals(projected.name()), + "Field name mismatch for field id %s: data schema has '%s' but projected schema has '%s'", + projected.id(), + dataField.name(), + projected.name()); indexMapping[i] = dataIdx; if (projected.type().getTypeRoot() == DataTypeRoot.ROW) { diff --git a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java index 39ef485eb0f2..701ee85b4f96 100644 --- a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java @@ -21,38 +21,54 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; -import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.BooleanType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DoubleType; +import 
org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; import org.apache.paimon.types.RowType; +import org.apache.paimon.types.SmallIntType; +import org.apache.paimon.types.TinyIntType; +import org.apache.paimon.types.VarBinaryType; +import org.apache.paimon.types.VarCharType; import org.junit.jupiter.api.Test; +import java.util.Arrays; + import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; /** Test for {@link NestedProjectedRow}. */ public class NestedProjectedRowTest { @Test void testReturnNullWhenSchemasAreEqual() { - RowType schema = RowType.of(DataTypes.INT(), DataTypes.STRING()); + RowType schema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType()))); assertThat(NestedProjectedRow.create(schema, schema)).isNull(); } @Test void testTopLevelProjection() { - // data: ROW + // data: ROW RowType dataSchema = - RowType.builder() - .field("a", DataTypes.INT()) - .field("b", DataTypes.STRING()) - .field("c", DataTypes.BIGINT()) - .build(); + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType()), + new DataField(2, "c", new BigIntType()))); - // projected: ROW + // projected: ROW RowType projectedSchema = - RowType.builder() - .field("c", DataTypes.BIGINT()) - .field("a", DataTypes.INT()) - .build(); + new RowType( + Arrays.asList( + new DataField(2, "c", new BigIntType()), + new DataField(0, "a", new IntType()))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -67,16 +83,17 @@ void testTopLevelProjection() { @Test void testTopLevelFieldSubset() { - // data: ROW + // data: ROW RowType dataSchema = - RowType.builder() - .field("a", DataTypes.INT()) - .field("b", DataTypes.STRING()) - .field("c", DataTypes.DOUBLE()) - .build(); + new RowType( + Arrays.asList( + new DataField(0, 
"a", new IntType()), + new DataField(1, "b", new VarCharType()), + new DataField(2, "c", new DoubleType()))); - // projected: ROW - RowType projectedSchema = RowType.builder().field("b", DataTypes.STRING()).build(); + // projected: ROW + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "b", new VarCharType()))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -90,19 +107,24 @@ void testTopLevelFieldSubset() { @Test void testNestedRowProjection() { - // data: ROW> + // data: ROW(1)> RowType nestedType = - RowType.builder() - .field("x", DataTypes.INT()) - .field("y", DataTypes.INT()) - .field("z", DataTypes.INT()) - .build(); + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()), + new DataField(12, "z", new IntType()))); RowType dataSchema = - RowType.builder().field("id", DataTypes.INT()).field("r", nestedType).build(); + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", nestedType))); - // projected: ROW> - RowType projectedNestedType = RowType.builder().field("z", DataTypes.INT()).build(); - RowType projectedSchema = RowType.builder().field("r", projectedNestedType).build(); + // projected: ROW(1)> + RowType projectedNestedType = + new RowType(Arrays.asList(new DataField(12, "z", new IntType()))); + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "r", projectedNestedType))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -119,24 +141,30 @@ void testNestedRowProjection() { @Test void testNestedRowProjectionMultipleFields() { - // data: ROW> + // data: ROW(1)> RowType nestedType = - RowType.builder() - .field("a", DataTypes.INT()) - .field("b", DataTypes.INT()) - .field("c", DataTypes.INT()) - .build(); + new RowType( + Arrays.asList( + new DataField(10, "a", 
new IntType()), + new DataField(11, "b", new IntType()), + new DataField(12, "c", new IntType()))); RowType dataSchema = - RowType.builder().field("id", DataTypes.INT()).field("r", nestedType).build(); + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", nestedType))); - // projected: ROW> + // projected: ROW(1)> RowType projectedNestedType = - RowType.builder().field("c", DataTypes.INT()).field("a", DataTypes.INT()).build(); + new RowType( + Arrays.asList( + new DataField(12, "c", new IntType()), + new DataField(10, "a", new IntType()))); RowType projectedSchema = - RowType.builder() - .field("id", DataTypes.INT()) - .field("r", projectedNestedType) - .build(); + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", projectedNestedType))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -154,20 +182,20 @@ void testNestedRowProjectionMultipleFields() { @Test void testDeeplyNestedProjection() { - // data: ROW>> + // data: ROW>> RowType level2 = - RowType.builder() - .field("x", DataTypes.INT()) - .field("y", DataTypes.INT()) - .field("z", DataTypes.INT()) - .build(); - RowType level1 = RowType.builder().field("b", level2).build(); - RowType dataSchema = RowType.builder().field("a", level1).build(); - - // projected: ROW>> - RowType projLevel2 = RowType.builder().field("y", DataTypes.INT()).build(); - RowType projLevel1 = RowType.builder().field("b", projLevel2).build(); - RowType projectedSchema = RowType.builder().field("a", projLevel1).build(); + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()), + new DataField(12, "z", new IntType()))); + RowType level1 = new RowType(Arrays.asList(new DataField(5, "b", level2))); + RowType dataSchema = new RowType(Arrays.asList(new DataField(0, "a", level1))); + + // projected: ROW>> + RowType projLevel2 = 
new RowType(Arrays.asList(new DataField(11, "y", new IntType()))); + RowType projLevel1 = new RowType(Arrays.asList(new DataField(5, "b", projLevel2))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(0, "a", projLevel1))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -183,15 +211,22 @@ void testDeeplyNestedProjection() { } @Test - void testNestedRowWithoutProjection() { - // data: ROW> + void testNestedRowWithoutInnerProjection() { + // data: ROW(1)> RowType nestedType = - RowType.builder().field("x", DataTypes.INT()).field("y", DataTypes.INT()).build(); + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()))); RowType dataSchema = - RowType.builder().field("id", DataTypes.INT()).field("r", nestedType).build(); + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", nestedType))); - // projected: ROW> (nested is unchanged, only top-level pruned) - RowType projectedSchema = RowType.builder().field("r", nestedType).build(); + // projected: ROW(1)> (nested is unchanged, only top-level + // pruned) + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", nestedType))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -208,15 +243,23 @@ void testNestedRowWithoutProjection() { @Test void testNullHandling() { - // data: ROW> + // data: ROW(1)> RowType nestedType = - RowType.builder().field("x", DataTypes.INT()).field("y", DataTypes.INT()).build(); + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()))); RowType dataSchema = - RowType.builder().field("a", DataTypes.INT()).field("r", nestedType).build(); + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "r", nestedType))); - // 
projected: ROW> - RowType projectedNestedType = RowType.builder().field("y", DataTypes.INT()).build(); - RowType projectedSchema = RowType.builder().field("r", projectedNestedType).build(); + // projected: ROW(1)> + RowType projectedNestedType = + new RowType(Arrays.asList(new DataField(11, "y", new IntType()))); + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "r", projectedNestedType))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -238,8 +281,11 @@ void testNullHandling() { @Test void testReplaceRowIsReusable() { RowType dataSchema = - RowType.builder().field("a", DataTypes.INT()).field("b", DataTypes.INT()).build(); - RowType projectedSchema = RowType.builder().field("b", DataTypes.INT()).build(); + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "b", new IntType()))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -254,8 +300,11 @@ void testReplaceRowIsReusable() { @Test void testRowKindPreserved() { RowType dataSchema = - RowType.builder().field("a", DataTypes.INT()).field("b", DataTypes.INT()).build(); - RowType projectedSchema = RowType.builder().field("b", DataTypes.INT()).build(); + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "b", new IntType()))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -268,21 +317,30 @@ void testRowKindPreserved() { @Test void testMultipleNestedRows() { - // data: ROW, r2 ROW> + // data: ROW(0), r2 ROW(1)> RowType nested1 = - RowType.builder().field("a", DataTypes.INT()).field("b", 
DataTypes.INT()).build(); + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); RowType nested2 = - RowType.builder() - .field("x", DataTypes.STRING()) - .field("y", DataTypes.STRING()) - .build(); - RowType dataSchema = RowType.builder().field("r1", nested1).field("r2", nested2).build(); - - // projected: ROW, r2 ROW> - RowType projNested1 = RowType.builder().field("b", DataTypes.INT()).build(); - RowType projNested2 = RowType.builder().field("y", DataTypes.STRING()).build(); + new RowType( + Arrays.asList( + new DataField(20, "x", new VarCharType()), + new DataField(21, "y", new VarCharType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", nested1), new DataField(1, "r2", nested2))); + + // projected: ROW(0), r2 ROW(1)> + RowType projNested1 = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projNested2 = new RowType(Arrays.asList(new DataField(21, "y", new VarCharType()))); RowType projectedSchema = - RowType.builder().field("r1", projNested1).field("r2", projNested2).build(); + new RowType( + Arrays.asList( + new DataField(0, "r1", projNested1), + new DataField(1, "r2", projNested2))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -302,27 +360,28 @@ void testMultipleNestedRows() { @Test void testAllDataTypes() { + // data with various types, each with unique field ID RowType dataSchema = - RowType.builder() - .field("f_bool", DataTypes.BOOLEAN()) - .field("f_byte", DataTypes.TINYINT()) - .field("f_short", DataTypes.SMALLINT()) - .field("f_int", DataTypes.INT()) - .field("f_long", DataTypes.BIGINT()) - .field("f_float", DataTypes.FLOAT()) - .field("f_double", DataTypes.DOUBLE()) - .field("f_string", DataTypes.STRING()) - .field("f_binary", DataTypes.BYTES()) - .build(); + new RowType( + Arrays.asList( + new DataField(0, "f_bool", new BooleanType()), + new 
DataField(1, "f_byte", new TinyIntType()), + new DataField(2, "f_short", new SmallIntType()), + new DataField(3, "f_int", new IntType()), + new DataField(4, "f_long", new BigIntType()), + new DataField(5, "f_float", new FloatType()), + new DataField(6, "f_double", new DoubleType()), + new DataField(7, "f_string", new VarCharType()), + new DataField(8, "f_binary", new VarBinaryType()))); // project a subset in different order RowType projectedSchema = - RowType.builder() - .field("f_double", DataTypes.DOUBLE()) - .field("f_bool", DataTypes.BOOLEAN()) - .field("f_string", DataTypes.STRING()) - .field("f_byte", DataTypes.TINYINT()) - .build(); + new RowType( + Arrays.asList( + new DataField(6, "f_double", new DoubleType()), + new DataField(0, "f_bool", new BooleanType()), + new DataField(7, "f_string", new VarCharType()), + new DataField(1, "f_byte", new TinyIntType()))); NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); assertThat(projection).isNotNull(); @@ -345,4 +404,47 @@ void testAllDataTypes() { assertThat(projected.getString(2)).isEqualTo(BinaryString.fromString("test")); assertThat(projected.getByte(3)).isEqualTo((byte) 7); } + + @Test + void testFieldNameMismatchThrows() { + // data: ROW + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new IntType()))); + + // projected: same field id=1 but wrong name "wrong" + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "wrong", new IntType()))); + + assertThatThrownBy(() -> NestedProjectedRow.create(dataSchema, projectedSchema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Field name mismatch") + .hasMessageContaining("'b'") + .hasMessageContaining("'wrong'"); + } + + @Test + void testNestedFieldNameMismatchThrows() { + // data: ROW(0)> + RowType nestedType = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new 
IntType()))); + RowType dataSchema = new RowType(Arrays.asList(new DataField(0, "r", nestedType))); + + // projected: nested field id=11 but wrong name "wrong_name" + RowType projectedNestedType = + new RowType(Arrays.asList(new DataField(11, "wrong_name", new IntType()))); + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(0, "r", projectedNestedType))); + + assertThatThrownBy(() -> NestedProjectedRow.create(dataSchema, projectedSchema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Field name mismatch") + .hasMessageContaining("'y'") + .hasMessageContaining("'wrong_name'"); + } } From 2c50672397a5186218b767ef58bdcbe196823c55 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 22 May 2026 16:45:30 +0800 Subject: [PATCH 5/6] fix --- .../paimon/utils/NestedProjectedRow.java | 315 +++++++++++++++++- .../paimon/utils/NestedProjectedRowTest.java | 154 +++++++++ .../format/row/RowFormatReadWriteTest.java | 108 ++++++ 3 files changed, 565 insertions(+), 12 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java index 7394fae13d4c..ee09b31bcf64 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java @@ -27,8 +27,11 @@ import org.apache.paimon.data.InternalVector; import org.apache.paimon.data.Timestamp; import org.apache.paimon.data.variant.Variant; +import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.MapType; import org.apache.paimon.types.RowKind; import org.apache.paimon.types.RowType; @@ -41,22 +44,41 @@ * *

Unlike {@link ProjectedRow} which only handles top-level projection, this class recursively * projects nested ROW fields. It maps each projected field to the corresponding position in the - * data schema by name, and for ROW-typed fields, recursively applies sub-projections. + * data schema by field ID, and for ROW-typed fields, recursively applies sub-projections. It also + * handles projection through ARRAY and MAP types whose elements contain ROW fields. */ public class NestedProjectedRow implements InternalRow { private final int[] indexMapping; @Nullable private final NestedProjectedRow[] nestedProjections; @Nullable private final int[] nestedArity; + @Nullable private final NestedProjectedRow[] arrayElementProjections; + @Nullable private final int[] arrayElementArity; + @Nullable private final NestedProjectedRow[] mapKeyProjections; + @Nullable private final int[] mapKeyArity; + @Nullable private final NestedProjectedRow[] mapValueProjections; + @Nullable private final int[] mapValueArity; private InternalRow row; private NestedProjectedRow( int[] indexMapping, @Nullable NestedProjectedRow[] nestedProjections, - @Nullable int[] nestedArity) { + @Nullable int[] nestedArity, + @Nullable NestedProjectedRow[] arrayElementProjections, + @Nullable int[] arrayElementArity, + @Nullable NestedProjectedRow[] mapKeyProjections, + @Nullable int[] mapKeyArity, + @Nullable NestedProjectedRow[] mapValueProjections, + @Nullable int[] mapValueArity) { this.indexMapping = indexMapping; this.nestedProjections = nestedProjections; this.nestedArity = nestedArity; + this.arrayElementProjections = arrayElementProjections; + this.arrayElementArity = arrayElementArity; + this.mapKeyProjections = mapKeyProjections; + this.mapKeyArity = mapKeyArity; + this.mapValueProjections = mapValueProjections; + this.mapValueArity = mapValueArity; } public NestedProjectedRow replaceRow(InternalRow row) { @@ -84,7 +106,12 @@ public static NestedProjectedRow create(RowType dataSchema, RowType 
projectedSch int[] indexMapping = new int[projectedSize]; NestedProjectedRow[] nestedProjections = null; int[] nestedArity = null; - boolean hasNested = false; + NestedProjectedRow[] arrayElementProjections = null; + int[] arrayElementArity = null; + NestedProjectedRow[] mapKeyProjections = null; + int[] mapKeyArity = null; + NestedProjectedRow[] mapValueProjections = null; + int[] mapValueArity = null; for (int i = 0; i < projectedSize; i++) { DataField projected = projectedFields.get(i); @@ -98,8 +125,9 @@ public static NestedProjectedRow create(RowType dataSchema, RowType projectedSch projected.name()); indexMapping[i] = dataIdx; - if (projected.type().getTypeRoot() == DataTypeRoot.ROW) { - RowType dataNestedType = (RowType) dataFields.get(dataIdx).type(); + DataTypeRoot typeRoot = projected.type().getTypeRoot(); + if (typeRoot == DataTypeRoot.ROW) { + RowType dataNestedType = (RowType) dataField.type(); RowType projectedNestedType = (RowType) projected.type(); NestedProjectedRow sub = create(dataNestedType, projectedNestedType); if (sub != null) { @@ -109,15 +137,69 @@ public static NestedProjectedRow create(RowType dataSchema, RowType projectedSch } nestedProjections[i] = sub; nestedArity[i] = dataNestedType.getFieldCount(); - hasNested = true; + } + } else if (typeRoot == DataTypeRoot.ARRAY) { + DataType projectedElement = ((ArrayType) projected.type()).getElementType(); + DataType dataElement = ((ArrayType) dataField.type()).getElementType(); + if (projectedElement.getTypeRoot() == DataTypeRoot.ROW) { + RowType dataElementRow = (RowType) dataElement; + RowType projectedElementRow = (RowType) projectedElement; + NestedProjectedRow sub = create(dataElementRow, projectedElementRow); + if (sub != null) { + if (arrayElementProjections == null) { + arrayElementProjections = new NestedProjectedRow[projectedSize]; + arrayElementArity = new int[projectedSize]; + } + arrayElementProjections[i] = sub; + arrayElementArity[i] = dataElementRow.getFieldCount(); + } + } + } 
else if (typeRoot == DataTypeRoot.MAP || typeRoot == DataTypeRoot.MULTISET) { + MapType projectedMapType = (MapType) projected.type(); + MapType dataMapType = (MapType) dataField.type(); + DataType projectedKey = projectedMapType.getKeyType(); + DataType dataKey = dataMapType.getKeyType(); + if (projectedKey.getTypeRoot() == DataTypeRoot.ROW) { + RowType dataKeyRow = (RowType) dataKey; + RowType projectedKeyRow = (RowType) projectedKey; + NestedProjectedRow sub = create(dataKeyRow, projectedKeyRow); + if (sub != null) { + if (mapKeyProjections == null) { + mapKeyProjections = new NestedProjectedRow[projectedSize]; + mapKeyArity = new int[projectedSize]; + } + mapKeyProjections[i] = sub; + mapKeyArity[i] = dataKeyRow.getFieldCount(); + } + } + DataType projectedValue = projectedMapType.getValueType(); + DataType dataValue = dataMapType.getValueType(); + if (projectedValue.getTypeRoot() == DataTypeRoot.ROW) { + RowType dataValueRow = (RowType) dataValue; + RowType projectedValueRow = (RowType) projectedValue; + NestedProjectedRow sub = create(dataValueRow, projectedValueRow); + if (sub != null) { + if (mapValueProjections == null) { + mapValueProjections = new NestedProjectedRow[projectedSize]; + mapValueArity = new int[projectedSize]; + } + mapValueProjections[i] = sub; + mapValueArity[i] = dataValueRow.getFieldCount(); + } } } } - if (!hasNested) { - return new NestedProjectedRow(indexMapping, null, null); - } - return new NestedProjectedRow(indexMapping, nestedProjections, nestedArity); + return new NestedProjectedRow( + indexMapping, + nestedProjections, + nestedArity, + arrayElementProjections, + arrayElementArity, + mapKeyProjections, + mapKeyArity, + mapValueProjections, + mapValueArity); } @Override @@ -207,7 +289,12 @@ public Blob getBlob(int pos) { @Override public InternalArray getArray(int pos) { - return row.getArray(indexMapping[pos]); + InternalArray array = row.getArray(indexMapping[pos]); + if (arrayElementProjections != null && 
arrayElementProjections[pos] != null) { + return new ProjectedInternalArray( + array, arrayElementProjections[pos], arrayElementArity[pos]); + } + return array; } @Override @@ -217,7 +304,17 @@ public InternalVector getVector(int pos) { @Override public InternalMap getMap(int pos) { - return row.getMap(indexMapping[pos]); + InternalMap map = row.getMap(indexMapping[pos]); + if ((mapKeyProjections != null && mapKeyProjections[pos] != null) + || (mapValueProjections != null && mapValueProjections[pos] != null)) { + NestedProjectedRow keyProj = mapKeyProjections != null ? mapKeyProjections[pos] : null; + int keyAr = mapKeyArity != null ? mapKeyArity[pos] : 0; + NestedProjectedRow valueProj = + mapValueProjections != null ? mapValueProjections[pos] : null; + int valueAr = mapValueArity != null ? mapValueArity[pos] : 0; + return new ProjectedInternalMap(map, keyProj, keyAr, valueProj, valueAr); + } + return map; } @Override @@ -228,4 +325,198 @@ public InternalRow getRow(int pos, int numFields) { } return row.getRow(indexMapping[pos], numFields); } + + // ======================== ProjectedInternalArray ======================== + + private static class ProjectedInternalArray implements InternalArray { + + private final InternalArray array; + private final NestedProjectedRow elementProjection; + private final int elementArity; + + ProjectedInternalArray( + InternalArray array, NestedProjectedRow elementProjection, int elementArity) { + this.array = array; + this.elementProjection = elementProjection; + this.elementArity = elementArity; + } + + @Override + public int size() { + return array.size(); + } + + @Override + public boolean isNullAt(int pos) { + return array.isNullAt(pos); + } + + @Override + public InternalRow getRow(int pos, int numFields) { + InternalRow inner = array.getRow(pos, elementArity); + return elementProjection.replaceRow(inner); + } + + @Override + public boolean getBoolean(int pos) { + return array.getBoolean(pos); + } + + @Override + public byte 
getByte(int pos) { + return array.getByte(pos); + } + + @Override + public short getShort(int pos) { + return array.getShort(pos); + } + + @Override + public int getInt(int pos) { + return array.getInt(pos); + } + + @Override + public long getLong(int pos) { + return array.getLong(pos); + } + + @Override + public float getFloat(int pos) { + return array.getFloat(pos); + } + + @Override + public double getDouble(int pos) { + return array.getDouble(pos); + } + + @Override + public BinaryString getString(int pos) { + return array.getString(pos); + } + + @Override + public Decimal getDecimal(int pos, int precision, int scale) { + return array.getDecimal(pos, precision, scale); + } + + @Override + public Timestamp getTimestamp(int pos, int precision) { + return array.getTimestamp(pos, precision); + } + + @Override + public byte[] getBinary(int pos) { + return array.getBinary(pos); + } + + @Override + public Variant getVariant(int pos) { + return array.getVariant(pos); + } + + @Override + public Blob getBlob(int pos) { + return array.getBlob(pos); + } + + @Override + public InternalArray getArray(int pos) { + return array.getArray(pos); + } + + @Override + public InternalVector getVector(int pos) { + return array.getVector(pos); + } + + @Override + public InternalMap getMap(int pos) { + return array.getMap(pos); + } + + @Override + public boolean[] toBooleanArray() { + return array.toBooleanArray(); + } + + @Override + public byte[] toByteArray() { + return array.toByteArray(); + } + + @Override + public short[] toShortArray() { + return array.toShortArray(); + } + + @Override + public int[] toIntArray() { + return array.toIntArray(); + } + + @Override + public long[] toLongArray() { + return array.toLongArray(); + } + + @Override + public float[] toFloatArray() { + return array.toFloatArray(); + } + + @Override + public double[] toDoubleArray() { + return array.toDoubleArray(); + } + } + + // ======================== ProjectedInternalMap ======================== + + 
private static class ProjectedInternalMap implements InternalMap { + + private final InternalMap map; + @Nullable private final NestedProjectedRow keyProjection; + private final int keyArity; + @Nullable private final NestedProjectedRow valueProjection; + private final int valueArity; + + ProjectedInternalMap( + InternalMap map, + @Nullable NestedProjectedRow keyProjection, + int keyArity, + @Nullable NestedProjectedRow valueProjection, + int valueArity) { + this.map = map; + this.keyProjection = keyProjection; + this.keyArity = keyArity; + this.valueProjection = valueProjection; + this.valueArity = valueArity; + } + + @Override + public int size() { + return map.size(); + } + + @Override + public InternalArray keyArray() { + InternalArray keys = map.keyArray(); + if (keyProjection != null) { + return new ProjectedInternalArray(keys, keyProjection, keyArity); + } + return keys; + } + + @Override + public InternalArray valueArray() { + InternalArray values = map.valueArray(); + if (valueProjection != null) { + return new ProjectedInternalArray(values, valueProjection, valueArity); + } + return values; + } + } } diff --git a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java index 701ee85b4f96..570365391cca 100644 --- a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java @@ -19,14 +19,20 @@ package org.apache.paimon.utils; import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericArray; +import org.apache.paimon.data.GenericMap; import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalMap; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import 
org.apache.paimon.types.BooleanType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.DoubleType; import org.apache.paimon.types.FloatType; import org.apache.paimon.types.IntType; +import org.apache.paimon.types.MapType; import org.apache.paimon.types.RowType; import org.apache.paimon.types.SmallIntType; import org.apache.paimon.types.TinyIntType; @@ -36,6 +42,8 @@ import org.junit.jupiter.api.Test; import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -447,4 +455,150 @@ void testNestedFieldNameMismatchThrows() { .hasMessageContaining("'y'") .hasMessageContaining("'wrong_name'"); } + + @Test + void testArrayElementProjection() { + // data: ROW>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + // projected: ROW>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(projectedElementType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // arr = [ROW, ROW] + GenericArray array = + new GenericArray(new Object[] {GenericRow.of(1, 100), GenericRow.of(2, 200)}); + GenericRow row = GenericRow.of(array); + InternalRow projected = projection.replaceRow(row); + + InternalArray projectedArray = projected.getArray(0); + assertThat(projectedArray.size()).isEqualTo(2); + assertThat(projectedArray.getRow(0, 1).getInt(0)).isEqualTo(100); + assertThat(projectedArray.getRow(1, 1).getInt(0)).isEqualTo(200); + } + + @Test + void testArrayElementProjectionWithNull() { + // data: 
ROW>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + // projected: ROW>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(projectedElementType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // arr = [ROW, null] + GenericArray array = new GenericArray(new Object[] {GenericRow.of(1, 100), null}); + GenericRow row = GenericRow.of(array); + InternalRow projected = projection.replaceRow(row); + + InternalArray projectedArray = projected.getArray(0); + assertThat(projectedArray.size()).isEqualTo(2); + assertThat(projectedArray.getRow(0, 1).getInt(0)).isEqualTo(100); + assertThat(projectedArray.isNullAt(1)).isTrue(); + } + + @Test + void testMapValueProjection() { + // data: ROW>(0)> + RowType valueType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "m", new MapType(new IntType(), valueType)))); + + // projected: ROW>(0)> + RowType projectedValueType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, "m", new MapType(new IntType(), projectedValueType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // m = {1 -> ROW, 2 -> ROW} + Map mapData = new HashMap<>(); + mapData.put(1, GenericRow.of(10, 100)); + mapData.put(2, GenericRow.of(20, 200)); + GenericRow row = GenericRow.of(new 
GenericMap(mapData)); + InternalRow projected = projection.replaceRow(row); + + InternalMap projectedMap = projected.getMap(0); + assertThat(projectedMap.size()).isEqualTo(2); + InternalArray values = projectedMap.valueArray(); + InternalArray keys = projectedMap.keyArray(); + for (int i = 0; i < 2; i++) { + int key = keys.getInt(i); + int b = values.getRow(i, 1).getInt(0); + if (key == 1) { + assertThat(b).isEqualTo(100); + } else { + assertThat(b).isEqualTo(200); + } + } + } + + @Test + void testArrayWithNoProjectionNeeded() { + // data: ROW>(0), id INT(1)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(elementType)), + new DataField(1, "id", new IntType()))); + + // projected: ROW>(0)> - full element, just drop id + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericArray array = new GenericArray(new Object[] {GenericRow.of(1, 2)}); + GenericRow row = GenericRow.of(array, 99); + InternalRow projected = projection.replaceRow(row); + + InternalArray projectedArray = projected.getArray(0); + assertThat(projectedArray.size()).isEqualTo(1); + InternalRow element = projectedArray.getRow(0, 2); + assertThat(element.getInt(0)).isEqualTo(1); + assertThat(element.getInt(1)).isEqualTo(2); + } } diff --git a/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java index dc634457ebe3..1e2478bb43ab 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java 
@@ -1188,6 +1188,114 @@ public void testNestedProjectionWithFieldReordering() throws IOException { assertThat(results.get(1)).isEqualTo(new int[] {60, 40}); } + @Test + public void testArrayElementProjection() throws IOException { + // data: ROW>> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + Path path = new Path(tempDir.toUri().toString(), "array_elem_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add( + GenericRow.of( + new GenericArray( + new Object[] {GenericRow.of(1, 100), GenericRow.of(2, 200)}))); + rows.add(GenericRow.of(new GenericArray(new Object[] {GenericRow.of(3, 300)}))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW>> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(projectedElementType)))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + InternalRow.FieldGetter arrayGetter = + InternalRow.createFieldGetter(new ArrayType(projectedElementType), 0); + org.apache.paimon.data.InternalArray arr = row.getArray(0); + for (int i = 0; i < arr.size(); i++) { + results.add(arr.getRow(i, 1).getInt(0)); + } + }); + reader.close(); + + // Should get 'b' values (100, 200, 300), not 'a' values (1, 2, 3) + assertThat(results).containsExactly(100, 200, 300); + } + + @Test + public void testMapValueProjection() 
throws IOException { + // data: ROW>> + RowType valueType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "m", new MapType(new IntType(), valueType)))); + + Path path = new Path(tempDir.toUri().toString(), "map_value_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Map mapData = new java.util.HashMap<>(); + mapData.put(1, GenericRow.of(10, 100)); + mapData.put(2, GenericRow.of(20, 200)); + List rows = new ArrayList<>(); + rows.add(GenericRow.of(new GenericMap(mapData))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW>> + RowType projectedValueType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, "m", new MapType(new IntType(), projectedValueType)))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + org.apache.paimon.data.InternalMap m = row.getMap(0); + org.apache.paimon.data.InternalArray keys = m.keyArray(); + org.apache.paimon.data.InternalArray values = m.valueArray(); + for (int i = 0; i < m.size(); i++) { + results.add(values.getRow(i, 1).getInt(0)); + } + }); + reader.close(); + + // Should get 'b' values (100, 200), not 'a' values (10, 20) + assertThat(results).containsExactlyInAnyOrder(100, 200); + } + // ======================== Helpers ======================== private void writeRows(FileFormat format, RowType rowType, Path path, List rows) From 06276b0a79bf9f3c64239ccc0963a6cde4ca5901 Mon Sep 17 00:00:00 2001 From: JingsongLi Date: Fri, 22 May 
2026 17:11:40 +0800 Subject: [PATCH 6/6] fix --- .../paimon/utils/NestedProjectedRow.java | 309 ++++++++++-------- .../paimon/utils/NestedProjectedRowTest.java | 116 +++++++ 2 files changed, 287 insertions(+), 138 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java index ee09b31bcf64..add679e76ad6 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java @@ -32,6 +32,7 @@ import org.apache.paimon.types.DataType; import org.apache.paimon.types.DataTypeRoot; import org.apache.paimon.types.MapType; +import org.apache.paimon.types.MultisetType; import org.apache.paimon.types.RowKind; import org.apache.paimon.types.RowType; @@ -45,40 +46,25 @@ *

Unlike {@link ProjectedRow} which only handles top-level projection, this class recursively * projects nested ROW fields. It maps each projected field to the corresponding position in the * data schema by field ID, and for ROW-typed fields, recursively applies sub-projections. It also - * handles projection through ARRAY and MAP types whose elements contain ROW fields. + * handles projection through ARRAY, MAP, and MULTISET types at arbitrary nesting depth. */ public class NestedProjectedRow implements InternalRow { private final int[] indexMapping; @Nullable private final NestedProjectedRow[] nestedProjections; @Nullable private final int[] nestedArity; - @Nullable private final NestedProjectedRow[] arrayElementProjections; - @Nullable private final int[] arrayElementArity; - @Nullable private final NestedProjectedRow[] mapKeyProjections; - @Nullable private final int[] mapKeyArity; - @Nullable private final NestedProjectedRow[] mapValueProjections; - @Nullable private final int[] mapValueArity; + @Nullable private final ElementProjection[] elementProjections; private InternalRow row; private NestedProjectedRow( int[] indexMapping, @Nullable NestedProjectedRow[] nestedProjections, @Nullable int[] nestedArity, - @Nullable NestedProjectedRow[] arrayElementProjections, - @Nullable int[] arrayElementArity, - @Nullable NestedProjectedRow[] mapKeyProjections, - @Nullable int[] mapKeyArity, - @Nullable NestedProjectedRow[] mapValueProjections, - @Nullable int[] mapValueArity) { + @Nullable ElementProjection[] elementProjections) { this.indexMapping = indexMapping; this.nestedProjections = nestedProjections; this.nestedArity = nestedArity; - this.arrayElementProjections = arrayElementProjections; - this.arrayElementArity = arrayElementArity; - this.mapKeyProjections = mapKeyProjections; - this.mapKeyArity = mapKeyArity; - this.mapValueProjections = mapValueProjections; - this.mapValueArity = mapValueArity; + this.elementProjections = elementProjections; } public 
NestedProjectedRow replaceRow(InternalRow row) { @@ -106,12 +92,7 @@ public static NestedProjectedRow create(RowType dataSchema, RowType projectedSch int[] indexMapping = new int[projectedSize]; NestedProjectedRow[] nestedProjections = null; int[] nestedArity = null; - NestedProjectedRow[] arrayElementProjections = null; - int[] arrayElementArity = null; - NestedProjectedRow[] mapKeyProjections = null; - int[] mapKeyArity = null; - NestedProjectedRow[] mapValueProjections = null; - int[] mapValueArity = null; + ElementProjection[] elementProjections = null; for (int i = 0; i < projectedSize; i++) { DataField projected = projectedFields.get(i); @@ -138,68 +119,93 @@ public static NestedProjectedRow create(RowType dataSchema, RowType projectedSch nestedProjections[i] = sub; nestedArity[i] = dataNestedType.getFieldCount(); } - } else if (typeRoot == DataTypeRoot.ARRAY) { - DataType projectedElement = ((ArrayType) projected.type()).getElementType(); - DataType dataElement = ((ArrayType) dataField.type()).getElementType(); - if (projectedElement.getTypeRoot() == DataTypeRoot.ROW) { - RowType dataElementRow = (RowType) dataElement; - RowType projectedElementRow = (RowType) projectedElement; - NestedProjectedRow sub = create(dataElementRow, projectedElementRow); - if (sub != null) { - if (arrayElementProjections == null) { - arrayElementProjections = new NestedProjectedRow[projectedSize]; - arrayElementArity = new int[projectedSize]; - } - arrayElementProjections[i] = sub; - arrayElementArity[i] = dataElementRow.getFieldCount(); - } - } - } else if (typeRoot == DataTypeRoot.MAP || typeRoot == DataTypeRoot.MULTISET) { - MapType projectedMapType = (MapType) projected.type(); - MapType dataMapType = (MapType) dataField.type(); - DataType projectedKey = projectedMapType.getKeyType(); - DataType dataKey = dataMapType.getKeyType(); - if (projectedKey.getTypeRoot() == DataTypeRoot.ROW) { - RowType dataKeyRow = (RowType) dataKey; - RowType projectedKeyRow = (RowType) 
projectedKey; - NestedProjectedRow sub = create(dataKeyRow, projectedKeyRow); - if (sub != null) { - if (mapKeyProjections == null) { - mapKeyProjections = new NestedProjectedRow[projectedSize]; - mapKeyArity = new int[projectedSize]; - } - mapKeyProjections[i] = sub; - mapKeyArity[i] = dataKeyRow.getFieldCount(); - } - } - DataType projectedValue = projectedMapType.getValueType(); - DataType dataValue = dataMapType.getValueType(); - if (projectedValue.getTypeRoot() == DataTypeRoot.ROW) { - RowType dataValueRow = (RowType) dataValue; - RowType projectedValueRow = (RowType) projectedValue; - NestedProjectedRow sub = create(dataValueRow, projectedValueRow); - if (sub != null) { - if (mapValueProjections == null) { - mapValueProjections = new NestedProjectedRow[projectedSize]; - mapValueArity = new int[projectedSize]; - } - mapValueProjections[i] = sub; - mapValueArity[i] = dataValueRow.getFieldCount(); + } else { + ElementProjection ep = createElementProjection(dataField.type(), projected.type()); + if (ep != null) { + if (elementProjections == null) { + elementProjections = new ElementProjection[projectedSize]; } + elementProjections[i] = ep; } } } return new NestedProjectedRow( - indexMapping, - nestedProjections, - nestedArity, - arrayElementProjections, - arrayElementArity, - mapKeyProjections, - mapKeyArity, - mapValueProjections, - mapValueArity); + indexMapping, nestedProjections, nestedArity, elementProjections); + } + + @Nullable + private static ElementProjection createElementProjection( + DataType dataType, DataType projectedType) { + if (dataType.equals(projectedType)) { + return null; + } + DataTypeRoot typeRoot = projectedType.getTypeRoot(); + switch (typeRoot) { + case ARRAY: + DataType dataElement = ((ArrayType) dataType).getElementType(); + DataType projectedElement = ((ArrayType) projectedType).getElementType(); + return createCollectionElementProjection(dataElement, projectedElement); + case MAP: + return createMapProjection( + ((MapType) 
dataType).getKeyType(), + ((MapType) projectedType).getKeyType(), + ((MapType) dataType).getValueType(), + ((MapType) projectedType).getValueType()); + case MULTISET: + DataType dataMultisetElement = ((MultisetType) dataType).getElementType(); + DataType projectedMultisetElement = ((MultisetType) projectedType).getElementType(); + return createMapProjection( + dataMultisetElement, projectedMultisetElement, null, null); + default: + return null; + } + } + + @Nullable + private static ElementProjection createMapProjection( + DataType dataKey, + DataType projectedKey, + @Nullable DataType dataValue, + @Nullable DataType projectedValue) { + ElementProjection keyProj = null; + ElementProjection valueProj = null; + + if (dataKey != null && projectedKey != null && !dataKey.equals(projectedKey)) { + keyProj = createCollectionElementProjection(dataKey, projectedKey); + } + + if (dataValue != null && projectedValue != null && !dataValue.equals(projectedValue)) { + valueProj = createCollectionElementProjection(dataValue, projectedValue); + } + + if (keyProj == null && valueProj == null) { + return null; + } + return new ElementProjection(null, 0, keyProj, valueProj); + } + + @Nullable + private static ElementProjection createCollectionElementProjection( + DataType dataType, DataType projectedType) { + if (dataType.equals(projectedType)) { + return null; + } + if (projectedType.getTypeRoot() == DataTypeRoot.ROW) { + RowType dataRow = (RowType) dataType; + RowType projRow = (RowType) projectedType; + NestedProjectedRow sub = create(dataRow, projRow); + if (sub != null) { + return new ElementProjection(sub, dataRow.getFieldCount(), null, null); + } + return null; + } + // Element is a collection type (ARRAY, MAP, MULTISET) — wrap one level deeper + ElementProjection inner = createElementProjection(dataType, projectedType); + if (inner != null) { + return new ElementProjection(null, 0, inner, null); + } + return null; } @Override @@ -290,9 +296,8 @@ public Blob getBlob(int pos) { 
@Override public InternalArray getArray(int pos) { InternalArray array = row.getArray(indexMapping[pos]); - if (arrayElementProjections != null && arrayElementProjections[pos] != null) { - return new ProjectedInternalArray( - array, arrayElementProjections[pos], arrayElementArity[pos]); + if (elementProjections != null && elementProjections[pos] != null) { + return elementProjections[pos].projectArray(array); } return array; } @@ -305,14 +310,8 @@ public InternalVector getVector(int pos) { @Override public InternalMap getMap(int pos) { InternalMap map = row.getMap(indexMapping[pos]); - if ((mapKeyProjections != null && mapKeyProjections[pos] != null) - || (mapValueProjections != null && mapValueProjections[pos] != null)) { - NestedProjectedRow keyProj = mapKeyProjections != null ? mapKeyProjections[pos] : null; - int keyAr = mapKeyArity != null ? mapKeyArity[pos] : 0; - NestedProjectedRow valueProj = - mapValueProjections != null ? mapValueProjections[pos] : null; - int valueAr = mapValueArity != null ? mapValueArity[pos] : 0; - return new ProjectedInternalMap(map, keyProj, keyAr, valueProj, valueAr); + if (elementProjections != null && elementProjections[pos] != null) { + return elementProjections[pos].projectMap(map); } return map; } @@ -326,19 +325,59 @@ public InternalRow getRow(int pos, int numFields) { return row.getRow(indexMapping[pos], numFields); } + // ======================== ElementProjection ======================== + + /** + * Describes how to project elements within a collection type (ARRAY, MAP, MULTISET). + * Recursively handles nested collections. 
+ */ + static class ElementProjection { + @Nullable final NestedProjectedRow rowProjection; + final int rowArity; + @Nullable final ElementProjection keyOrElementProjection; + @Nullable final ElementProjection valueProjection; + + ElementProjection( + @Nullable NestedProjectedRow rowProjection, + int rowArity, + @Nullable ElementProjection keyOrElementProjection, + @Nullable ElementProjection valueProjection) { + this.rowProjection = rowProjection; + this.rowArity = rowArity; + this.keyOrElementProjection = keyOrElementProjection; + this.valueProjection = valueProjection; + } + + InternalArray projectArray(InternalArray array) { + return new ProjectedInternalArray(array, this); + } + + InternalMap projectMap(InternalMap map) { + InternalArray keys = map.keyArray(); + InternalArray values = map.valueArray(); + InternalArray projectedKeys = + keyOrElementProjection != null + ? keyOrElementProjection.projectArray(keys) + : keys; + InternalArray projectedValues = + valueProjection != null ? 
valueProjection.projectArray(values) : values; + if (projectedKeys == keys && projectedValues == values) { + return map; + } + return new ProjectedInternalMap(map.size(), projectedKeys, projectedValues); + } + } + // ======================== ProjectedInternalArray ======================== private static class ProjectedInternalArray implements InternalArray { private final InternalArray array; - private final NestedProjectedRow elementProjection; - private final int elementArity; + private final ElementProjection projection; - ProjectedInternalArray( - InternalArray array, NestedProjectedRow elementProjection, int elementArity) { + ProjectedInternalArray(InternalArray array, ElementProjection projection) { this.array = array; - this.elementProjection = elementProjection; - this.elementArity = elementArity; + this.projection = projection; } @Override @@ -353,8 +392,29 @@ public boolean isNullAt(int pos) { @Override public InternalRow getRow(int pos, int numFields) { - InternalRow inner = array.getRow(pos, elementArity); - return elementProjection.replaceRow(inner); + if (projection.rowProjection != null) { + InternalRow inner = array.getRow(pos, projection.rowArity); + return projection.rowProjection.replaceRow(inner); + } + return array.getRow(pos, numFields); + } + + @Override + public InternalArray getArray(int pos) { + InternalArray inner = array.getArray(pos); + if (projection.keyOrElementProjection != null) { + return projection.keyOrElementProjection.projectArray(inner); + } + return inner; + } + + @Override + public InternalMap getMap(int pos) { + InternalMap inner = array.getMap(pos); + if (projection.keyOrElementProjection != null) { + return projection.keyOrElementProjection.projectMap(inner); + } + return inner; } @Override @@ -422,21 +482,11 @@ public Blob getBlob(int pos) { return array.getBlob(pos); } - @Override - public InternalArray getArray(int pos) { - return array.getArray(pos); - } - @Override public InternalVector getVector(int pos) { return 
array.getVector(pos); } - @Override - public InternalMap getMap(int pos) { - return array.getMap(pos); - } - @Override public boolean[] toBooleanArray() { return array.toBooleanArray(); @@ -477,46 +527,29 @@ public double[] toDoubleArray() { private static class ProjectedInternalMap implements InternalMap { - private final InternalMap map; - @Nullable private final NestedProjectedRow keyProjection; - private final int keyArity; - @Nullable private final NestedProjectedRow valueProjection; - private final int valueArity; - - ProjectedInternalMap( - InternalMap map, - @Nullable NestedProjectedRow keyProjection, - int keyArity, - @Nullable NestedProjectedRow valueProjection, - int valueArity) { - this.map = map; - this.keyProjection = keyProjection; - this.keyArity = keyArity; - this.valueProjection = valueProjection; - this.valueArity = valueArity; + private final int size; + private final InternalArray keyArray; + private final InternalArray valueArray; + + ProjectedInternalMap(int size, InternalArray keyArray, InternalArray valueArray) { + this.size = size; + this.keyArray = keyArray; + this.valueArray = valueArray; } @Override public int size() { - return map.size(); + return size; } @Override public InternalArray keyArray() { - InternalArray keys = map.keyArray(); - if (keyProjection != null) { - return new ProjectedInternalArray(keys, keyProjection, keyArity); - } - return keys; + return keyArray; } @Override public InternalArray valueArray() { - InternalArray values = map.valueArray(); - if (valueProjection != null) { - return new ProjectedInternalArray(values, valueProjection, valueArity); - } - return values; + return valueArray; } } } diff --git a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java index 570365391cca..4a12756bb937 100644 --- a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java +++ 
b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java @@ -33,6 +33,7 @@ import org.apache.paimon.types.FloatType; import org.apache.paimon.types.IntType; import org.apache.paimon.types.MapType; +import org.apache.paimon.types.MultisetType; import org.apache.paimon.types.RowType; import org.apache.paimon.types.SmallIntType; import org.apache.paimon.types.TinyIntType; @@ -601,4 +602,119 @@ void testArrayWithNoProjectionNeeded() { assertThat(element.getInt(0)).isEqualTo(1); assertThat(element.getInt(1)).isEqualTo(2); } + + @Test + void testNestedArrayProjection() { + // data: ROW>>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField( + 0, "arr", new ArrayType(new ArrayType(elementType))))); + + // projected: ROW>>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, + "arr", + new ArrayType(new ArrayType(projectedElementType))))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // arr = [[ROW, ROW]] + GenericArray innerArray = + new GenericArray(new Object[] {GenericRow.of(1, 100), GenericRow.of(2, 200)}); + GenericArray outerArray = new GenericArray(new Object[] {innerArray}); + GenericRow row = GenericRow.of(outerArray); + InternalRow projected = projection.replaceRow(row); + + InternalArray projOuter = projected.getArray(0); + assertThat(projOuter.size()).isEqualTo(1); + InternalArray projInner = projOuter.getArray(0); + assertThat(projInner.size()).isEqualTo(2); + assertThat(projInner.getRow(0, 1).getInt(0)).isEqualTo(100); + assertThat(projInner.getRow(1, 1).getInt(0)).isEqualTo(200); + } + + @Test + void testMapWithArrayValueProjection() { + // data: 
ROW>>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField( + 0, + "m", + new MapType(new IntType(), new ArrayType(elementType))))); + + // projected: ROW>>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, + "m", + new MapType( + new IntType(), + new ArrayType(projectedElementType))))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // m = {1 -> [ROW]} + Map mapData = new HashMap<>(); + mapData.put(1, new GenericArray(new Object[] {GenericRow.of(10, 100)})); + GenericRow row = GenericRow.of(new GenericMap(mapData)); + InternalRow projected = projection.replaceRow(row); + + InternalMap projectedMap = projected.getMap(0); + InternalArray values = projectedMap.valueArray(); + InternalArray valueArr = values.getArray(0); + assertThat(valueArr.getRow(0, 1).getInt(0)).isEqualTo(100); + } + + @Test + void testMultisetDoesNotThrow() { + // data: ROW(1)> + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "ms", new MultisetType(new VarCharType())))); + + // projected: ROW(1)> + RowType projectedSchema = + new RowType( + Arrays.asList(new DataField(1, "ms", new MultisetType(new VarCharType())))); + + // Should not throw ClassCastException + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + Map msData = new HashMap<>(); + msData.put(BinaryString.fromString("hello"), 2); + GenericRow row = GenericRow.of(42, new GenericMap(msData)); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + InternalMap ms 
= projected.getMap(0); + assertThat(ms.size()).isEqualTo(1); + } }