diff --git a/docs/docs/concepts/spec/fileformat.md b/docs/docs/concepts/spec/fileformat.md index baac8f2b0536..a38a667cbb83 100644 --- a/docs/docs/concepts/spec/fileformat.md +++ b/docs/docs/concepts/spec/fileformat.md @@ -24,9 +24,10 @@ under the License. # File Format -Currently, supports Parquet, Avro, ORC, CSV, JSON, and Lance file formats. +Currently, Paimon supports Parquet, Avro, ORC, CSV, JSON, Lance, and Row file formats. - Recommended column format is Parquet, which has a high compression rate and fast column projection queries. -- Recommended row based format is Avro, which has good performance n reading and writing full row (all columns). +- Recommended row-based format is Avro, which has good performance when reading and writing full rows (all columns). +- Recommended format for row-number based O(1) lookups is Row, which stores data in row-oriented blocks with ZSTD compression and supports fast random access by row number. - Recommended testing format is CSV, which has better readability but the worst read-write performance. - Recommended format for ML workloads is Lance, which is optimized for vector search and machine learning use cases. @@ -754,6 +755,20 @@ Limitations: 1. Lance file format does not support `MAP` type. 2. Lance file format does not support `TIMESTAMP_LOCAL_ZONE` type. +## ROW + +The Row format is a row-oriented storage format designed for O(1) random access by row number. Data is organized in blocks with ZSTD Level 1 compression. Each block contains complete rows serialized in a compact binary format with an offset array for direct row positioning. 
+ +Key features: +- **O(1) Row Lookup**: Block index + in-block offset array enables direct access to any row by its global row number +- **Block-level ZSTD Compression**: Each block is independently compressed for good compression ratio with fast decompression +- **Compact Serialization**: Rows are serialized with a null bitmap followed by field values in sequence, minimizing overhead +- **Selection Pushdown**: Supports RoaringBitmap-based row selection, skipping entire blocks that contain no selected rows + +The Row format supports all Paimon data types: BOOLEAN, TINYINT, SMALLINT, INT, BIGINT, FLOAT, DOUBLE, CHAR, VARCHAR, BINARY, VARBINARY, DECIMAL, DATE, TIME, TIMESTAMP, TIMESTAMP_LOCAL_ZONE, VARIANT, ARRAY, MAP, ROW. + +For detailed file layout and binary format specification, see [Row Format](./rowformat). + ## BLOB The BLOB format is a specialized format for storing large binary objects such as images, videos, and other multimodal data. Unlike other formats that store data inline, BLOB format stores large binary data in separate files with an optimized layout for random access. diff --git a/docs/docs/concepts/spec/rowformat.md b/docs/docs/concepts/spec/rowformat.md new file mode 100644 index 000000000000..cddbbda22c7f --- /dev/null +++ b/docs/docs/concepts/spec/rowformat.md @@ -0,0 +1,213 @@ +--- +title: "Row Format" +sidebar_position: 8 +--- + + + +# Row Format Specification + +The Row format (`.row`) is a row-oriented file format optimized for O(1) random access by row number. It is designed for scenarios where fast point lookups by row position are critical, such as deletion vector applications and changelog materialization. + +## File Layout + +A `.row` file consists of three sections: + +``` ++====================================================================+ +| ROW FILE (.row) | ++====================================================================+ +| Data Block 0 (ZSTD compressed) | +| Data Block 1 (ZSTD compressed) | +| ... 
| +| Data Block K (ZSTD compressed) | ++--------------------------------------------------------------------+ +| Block Index (Delta+ZigZag+Varint encoded) | ++--------------------------------------------------------------------+ +| Footer (fixed 32 bytes) | ++====================================================================+ +``` + +## Data Block + +Each data block is independently ZSTD Level 1 compressed. The uncompressed content has the following layout: + +``` ++-----------------------------------------------------------+ +| row_0_bytes | row_1_bytes | ... | row_N_bytes | ++-----------------------------------------------------------+ +| offset[0] (int32 LE) | offset[1] | ... | offset[N] | ++-----------------------------------------------------------+ +| row_count (int32 LE) | ++-----------------------------------------------------------+ +``` + +- **Row data region**: Each row is serialized sequentially using the compact row format (see below). +- **Offset array**: An array of int32 little-endian values, one per row, storing the byte offset of each row within the uncompressed block. +- **Row count**: A single int32 little-endian value at the very end of the block, storing the number of rows in this block. + +A new block is flushed when the estimated uncompressed size reaches the configured block size threshold (default 64 KB, configurable via `file.block-size`). + +### Row Serialization Format + +Each row is serialized as: + +``` ++-----------------------------------------------+ +| null_bitmap | field_0 | field_1 | ... | field_N | ++-----------------------------------------------+ +``` + +**Null bitmap**: `ceil(arity / 8)` bytes. Bit `i` is set (1) if field `i` is null. The bit position is `byte[i/8] & (1 << (i%8))`. Non-null fields are serialized in order; null fields occupy no space beyond the bitmap bit. + +### Primitive Type Encoding + +All multi-byte primitives use **little-endian** byte order. 
+ +| Paimon Type | Encoding | +|---|---| +| BOOLEAN | 1 byte: 0 = false, 1 = true | +| TINYINT | 1 byte signed | +| SMALLINT | 2 bytes int16 LE | +| INT / DATE / TIME | 4 bytes int32 LE | +| BIGINT | 8 bytes int64 LE | +| FLOAT | 4 bytes IEEE 754 LE | +| DOUBLE | 8 bytes IEEE 754 LE | +| CHAR / VARCHAR | varint(length) + UTF-8 bytes | +| BINARY / VARBINARY | varint(length) + raw bytes | +| DECIMAL(P, S) where P <= 18 | 8 bytes int64 LE (unscaled long) | +| DECIMAL(P, S) where P > 18 | varint(length) + unscaled bytes (big-endian two's complement) | +| TIMESTAMP(P) where P <= 3 | 8 bytes int64 LE (epoch millis) | +| TIMESTAMP(P) where P > 3 | 8 bytes int64 LE (epoch millis) + varint(nanoOfMillisecond) | +| VARIANT | varint(len1) + value bytes + varint(len2) + metadata bytes | + +### Varint Encoding + +Variable-length integer encoding (unsigned LEB128): +- Each byte uses 7 bits for data and 1 bit (MSB) as continuation flag. +- If MSB = 1, more bytes follow. If MSB = 0, this is the last byte. +- Maximum 5 bytes for int32 values. + +### Complex Type Encoding + +**ARRAY**: + +``` +varint(size) | null_bitmap[ceil(size/8) bytes] | element_0 | element_1 | ... | element_N +``` + +Null bitmap uses the same bit layout as row nulls. Non-null elements are serialized in order using the element type's encoding. + +**MAP**: + +A map is serialized as two arrays (keys array followed by values array): + +``` +[keys array] [values array] +``` + +Each array follows the ARRAY encoding above (varint size + null bitmap + elements). Both keys and values support null entries. + +**ROW (nested)**: + +Nested rows use the same format as top-level rows: + +``` +null_bitmap[ceil(arity/8) bytes] | field_0 | field_1 | ... | field_N +``` + +## Block Index + +The block index stores metadata for all blocks, enabling binary search to locate the block containing a given row number. 
+ +``` ++--------------------------------------------------------------------+ +| varint(len_0) | encoded_block_compressed_sizes | +| varint(len_1) | encoded_block_uncompressed_sizes | +| varint(len_2) | encoded_block_row_starts | ++--------------------------------------------------------------------+ +``` + +Each of the three arrays is encoded using **Delta + ZigZag + Varint** compression: +1. Compute deltas between consecutive values +2. ZigZag encode each delta (maps signed to unsigned) +3. Varint encode each ZigZag value + +This is highly efficient for monotonically increasing sequences (row starts) and similar-valued sequences (sizes). + +The arrays are: +- **blockCompressedSizes**: Compressed size of each block. Block offsets are derived by prefix sum (first block starts at file position 0). +- **blockUncompressedSizes**: Uncompressed size of each block (needed to allocate decompression buffer) +- **blockRowStarts**: Cumulative row count at the start of each block (for binary search) + +## Footer + +The footer is a fixed 32-byte structure at the end of the file: + +``` ++-----------------------------------------------+ +| totalRowCount | int64 | 8 bytes | LE | +| blockCount | int32 | 4 bytes | LE | +| indexOffset | int64 | 8 bytes | LE | +| indexLength | int32 | 4 bytes | LE | +| version | int8 | 1 byte | | +| reserved | | 3 bytes | | +| magic | int32 | 4 bytes | LE | ++-----------------------------------------------+ +``` + +- **totalRowCount**: Total number of rows in the file. +- **blockCount**: Number of data blocks. +- **indexOffset**: Byte offset in the file where the block index starts. +- **indexLength**: Length in bytes of the block index section. +- **version**: Format version, currently `1`. +- **reserved**: 3 bytes reserved for future use (must be 0). +- **magic**: `0x524F5753` (ASCII "ROWS"), used for format validation. + +## Row Number Lookup Algorithm + +To read a specific row by its global row number: + +1. 
**Read Footer**: Seek to file end - 32 bytes, read the 32-byte footer. Validate magic number. +2. **Read Block Index**: Seek to `indexOffset`, read `indexLength` bytes, decode the three arrays. Compute block offsets by prefix sum of `blockCompressedSizes[]`. +3. **Binary Search**: Search `blockRowStarts[]` to find block `b` where `blockRowStarts[b] <= rowNum < blockRowStarts[b+1]` (for the last block, use `totalRowCount` from the footer as the upper bound). +4. **Read Block**: Seek to `blockOffset(b)`, read `blockCompressedSizes[b]` bytes. +5. **Decompress**: ZSTD decompress into a buffer of size `blockUncompressedSizes[b]`. +6. **Locate Row**: Compute `localIdx = rowNum - blockRowStarts[b]`. Read the trailing `row_count` (the last 4 bytes of the decompressed block) to find the offset array, which occupies the `4 * row_count` bytes immediately before it, then read `offsets[localIdx]`. +7. **Deserialize**: Read the row starting at the computed offset using the row serialization format. + +## Projection + +Column projection is applied after full row deserialization. Since the compact row format serializes fields sequentially without per-field offset metadata, individual fields cannot be skipped during deserialization. After the complete row is deserialized, a projection mapping selects the requested columns. + +## Selection (Deletion Vectors) + +Row selection via `RoaringBitmap32` enables efficient filtering: + +1. For each block, check if the selection bitmap intersects with `[blockRowStart, blockRowEnd)`. +2. If no intersection, skip the entire block (no I/O or decompression). +3. If there is an intersection, decompress the block and only deserialize the selected rows using their local indices. + +## Configuration + +| Option | Default | Description | +|---|---|---| +| `file.block-size` | 64 KB | Uncompressed block size threshold. Larger blocks improve compression ratio but increase read amplification for point lookups. | +| ZSTD Level | 1 | Fixed at level 1 for fast compression with reasonable ratio. 
| diff --git a/docs/sidebars.js b/docs/sidebars.js index 0c959ab96325..634d0830e970 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -49,6 +49,7 @@ const sidebars = { "concepts/spec/manifest", "concepts/spec/datafile", "concepts/spec/fileformat", + "concepts/spec/rowformat", "concepts/spec/tableindex", "concepts/spec/fileindex" ] diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java new file mode 100644 index 000000000000..add679e76ad6 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/utils/NestedProjectedRow.java @@ -0,0 +1,555 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.utils; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalMap; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalVector; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.data.variant.Variant; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.DataTypeRoot; +import org.apache.paimon.types.MapType; +import org.apache.paimon.types.MultisetType; +import org.apache.paimon.types.RowKind; +import org.apache.paimon.types.RowType; + +import javax.annotation.Nullable; + +import java.util.List; + +/** + * A projected view of {@link InternalRow} that supports nested ROW field pruning. + * + *

Unlike {@link ProjectedRow} which only handles top-level projection, this class recursively + * projects nested ROW fields. It maps each projected field to the corresponding position in the + * data schema by field ID, and for ROW-typed fields, recursively applies sub-projections. It also + * handles projection through ARRAY, MAP, and MULTISET types at arbitrary nesting depth. + */ +public class NestedProjectedRow implements InternalRow { + + private final int[] indexMapping; + @Nullable private final NestedProjectedRow[] nestedProjections; + @Nullable private final int[] nestedArity; + @Nullable private final ElementProjection[] elementProjections; + private InternalRow row; + + private NestedProjectedRow( + int[] indexMapping, + @Nullable NestedProjectedRow[] nestedProjections, + @Nullable int[] nestedArity, + @Nullable ElementProjection[] elementProjections) { + this.indexMapping = indexMapping; + this.nestedProjections = nestedProjections; + this.nestedArity = nestedArity; + this.elementProjections = elementProjections; + } + + public NestedProjectedRow replaceRow(InternalRow row) { + this.row = row; + return this; + } + + /** + * Creates a {@link NestedProjectedRow} from the data schema and projected schema using field + * IDs to match fields. Returns null if the two schemas are identical (no projection needed). 
+ * + * @param dataSchema the full schema of the underlying row data + * @param projectedSchema the projected schema to read + */ + @Nullable + public static NestedProjectedRow create(RowType dataSchema, RowType projectedSchema) { + if (dataSchema.equals(projectedSchema)) { + return null; + } + + List dataFields = dataSchema.getFields(); + List projectedFields = projectedSchema.getFields(); + + int projectedSize = projectedFields.size(); + int[] indexMapping = new int[projectedSize]; + NestedProjectedRow[] nestedProjections = null; + int[] nestedArity = null; + ElementProjection[] elementProjections = null; + + for (int i = 0; i < projectedSize; i++) { + DataField projected = projectedFields.get(i); + int dataIdx = dataSchema.getFieldIndexByFieldId(projected.id()); + DataField dataField = dataFields.get(dataIdx); + Preconditions.checkArgument( + dataField.name().equals(projected.name()), + "Field name mismatch for field id %s: data schema has '%s' but projected schema has '%s'", + projected.id(), + dataField.name(), + projected.name()); + indexMapping[i] = dataIdx; + + DataTypeRoot typeRoot = projected.type().getTypeRoot(); + if (typeRoot == DataTypeRoot.ROW) { + RowType dataNestedType = (RowType) dataField.type(); + RowType projectedNestedType = (RowType) projected.type(); + NestedProjectedRow sub = create(dataNestedType, projectedNestedType); + if (sub != null) { + if (nestedProjections == null) { + nestedProjections = new NestedProjectedRow[projectedSize]; + nestedArity = new int[projectedSize]; + } + nestedProjections[i] = sub; + nestedArity[i] = dataNestedType.getFieldCount(); + } + } else { + ElementProjection ep = createElementProjection(dataField.type(), projected.type()); + if (ep != null) { + if (elementProjections == null) { + elementProjections = new ElementProjection[projectedSize]; + } + elementProjections[i] = ep; + } + } + } + + return new NestedProjectedRow( + indexMapping, nestedProjections, nestedArity, elementProjections); + } + + @Nullable + 
private static ElementProjection createElementProjection( + DataType dataType, DataType projectedType) { + if (dataType.equals(projectedType)) { + return null; + } + DataTypeRoot typeRoot = projectedType.getTypeRoot(); + switch (typeRoot) { + case ARRAY: + DataType dataElement = ((ArrayType) dataType).getElementType(); + DataType projectedElement = ((ArrayType) projectedType).getElementType(); + return createCollectionElementProjection(dataElement, projectedElement); + case MAP: + return createMapProjection( + ((MapType) dataType).getKeyType(), + ((MapType) projectedType).getKeyType(), + ((MapType) dataType).getValueType(), + ((MapType) projectedType).getValueType()); + case MULTISET: + DataType dataMultisetElement = ((MultisetType) dataType).getElementType(); + DataType projectedMultisetElement = ((MultisetType) projectedType).getElementType(); + return createMapProjection( + dataMultisetElement, projectedMultisetElement, null, null); + default: + return null; + } + } + + @Nullable + private static ElementProjection createMapProjection( + DataType dataKey, + DataType projectedKey, + @Nullable DataType dataValue, + @Nullable DataType projectedValue) { + ElementProjection keyProj = null; + ElementProjection valueProj = null; + + if (dataKey != null && projectedKey != null && !dataKey.equals(projectedKey)) { + keyProj = createCollectionElementProjection(dataKey, projectedKey); + } + + if (dataValue != null && projectedValue != null && !dataValue.equals(projectedValue)) { + valueProj = createCollectionElementProjection(dataValue, projectedValue); + } + + if (keyProj == null && valueProj == null) { + return null; + } + return new ElementProjection(null, 0, keyProj, valueProj); + } + + @Nullable + private static ElementProjection createCollectionElementProjection( + DataType dataType, DataType projectedType) { + if (dataType.equals(projectedType)) { + return null; + } + if (projectedType.getTypeRoot() == DataTypeRoot.ROW) { + RowType dataRow = (RowType) dataType; + 
RowType projRow = (RowType) projectedType; + NestedProjectedRow sub = create(dataRow, projRow); + if (sub != null) { + return new ElementProjection(sub, dataRow.getFieldCount(), null, null); + } + return null; + } + // Element is a collection type (ARRAY, MAP, MULTISET) — wrap one level deeper + ElementProjection inner = createElementProjection(dataType, projectedType); + if (inner != null) { + return new ElementProjection(null, 0, inner, null); + } + return null; + } + + @Override + public int getFieldCount() { + return indexMapping.length; + } + + @Override + public RowKind getRowKind() { + return row.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + row.setRowKind(kind); + } + + @Override + public boolean isNullAt(int pos) { + return row.isNullAt(indexMapping[pos]); + } + + @Override + public boolean getBoolean(int pos) { + return row.getBoolean(indexMapping[pos]); + } + + @Override + public byte getByte(int pos) { + return row.getByte(indexMapping[pos]); + } + + @Override + public short getShort(int pos) { + return row.getShort(indexMapping[pos]); + } + + @Override + public int getInt(int pos) { + return row.getInt(indexMapping[pos]); + } + + @Override + public long getLong(int pos) { + return row.getLong(indexMapping[pos]); + } + + @Override + public float getFloat(int pos) { + return row.getFloat(indexMapping[pos]); + } + + @Override + public double getDouble(int pos) { + return row.getDouble(indexMapping[pos]); + } + + @Override + public BinaryString getString(int pos) { + return row.getString(indexMapping[pos]); + } + + @Override + public Decimal getDecimal(int pos, int precision, int scale) { + return row.getDecimal(indexMapping[pos], precision, scale); + } + + @Override + public Timestamp getTimestamp(int pos, int precision) { + return row.getTimestamp(indexMapping[pos], precision); + } + + @Override + public byte[] getBinary(int pos) { + return row.getBinary(indexMapping[pos]); + } + + @Override + public Variant getVariant(int 
pos) { + return row.getVariant(indexMapping[pos]); + } + + @Override + public Blob getBlob(int pos) { + return row.getBlob(indexMapping[pos]); + } + + @Override + public InternalArray getArray(int pos) { + InternalArray array = row.getArray(indexMapping[pos]); + if (elementProjections != null && elementProjections[pos] != null) { + return elementProjections[pos].projectArray(array); + } + return array; + } + + @Override + public InternalVector getVector(int pos) { + return row.getVector(indexMapping[pos]); + } + + @Override + public InternalMap getMap(int pos) { + InternalMap map = row.getMap(indexMapping[pos]); + if (elementProjections != null && elementProjections[pos] != null) { + return elementProjections[pos].projectMap(map); + } + return map; + } + + @Override + public InternalRow getRow(int pos, int numFields) { + if (nestedProjections != null && nestedProjections[pos] != null) { + InternalRow inner = row.getRow(indexMapping[pos], nestedArity[pos]); + return nestedProjections[pos].replaceRow(inner); + } + return row.getRow(indexMapping[pos], numFields); + } + + // ======================== ElementProjection ======================== + + /** + * Describes how to project elements within a collection type (ARRAY, MAP, MULTISET). + * Recursively handles nested collections. 
+ */ + static class ElementProjection { + @Nullable final NestedProjectedRow rowProjection; + final int rowArity; + @Nullable final ElementProjection keyOrElementProjection; + @Nullable final ElementProjection valueProjection; + + ElementProjection( + @Nullable NestedProjectedRow rowProjection, + int rowArity, + @Nullable ElementProjection keyOrElementProjection, + @Nullable ElementProjection valueProjection) { + this.rowProjection = rowProjection; + this.rowArity = rowArity; + this.keyOrElementProjection = keyOrElementProjection; + this.valueProjection = valueProjection; + } + + InternalArray projectArray(InternalArray array) { + return new ProjectedInternalArray(array, this); + } + + InternalMap projectMap(InternalMap map) { + InternalArray keys = map.keyArray(); + InternalArray values = map.valueArray(); + InternalArray projectedKeys = + keyOrElementProjection != null + ? keyOrElementProjection.projectArray(keys) + : keys; + InternalArray projectedValues = + valueProjection != null ? 
valueProjection.projectArray(values) : values; + if (projectedKeys == keys && projectedValues == values) { + return map; + } + return new ProjectedInternalMap(map.size(), projectedKeys, projectedValues); + } + } + + // ======================== ProjectedInternalArray ======================== + + private static class ProjectedInternalArray implements InternalArray { + + private final InternalArray array; + private final ElementProjection projection; + + ProjectedInternalArray(InternalArray array, ElementProjection projection) { + this.array = array; + this.projection = projection; + } + + @Override + public int size() { + return array.size(); + } + + @Override + public boolean isNullAt(int pos) { + return array.isNullAt(pos); + } + + @Override + public InternalRow getRow(int pos, int numFields) { + if (projection.rowProjection != null) { + InternalRow inner = array.getRow(pos, projection.rowArity); + return projection.rowProjection.replaceRow(inner); + } + return array.getRow(pos, numFields); + } + + @Override + public InternalArray getArray(int pos) { + InternalArray inner = array.getArray(pos); + if (projection.keyOrElementProjection != null) { + return projection.keyOrElementProjection.projectArray(inner); + } + return inner; + } + + @Override + public InternalMap getMap(int pos) { + InternalMap inner = array.getMap(pos); + if (projection.keyOrElementProjection != null) { + return projection.keyOrElementProjection.projectMap(inner); + } + return inner; + } + + @Override + public boolean getBoolean(int pos) { + return array.getBoolean(pos); + } + + @Override + public byte getByte(int pos) { + return array.getByte(pos); + } + + @Override + public short getShort(int pos) { + return array.getShort(pos); + } + + @Override + public int getInt(int pos) { + return array.getInt(pos); + } + + @Override + public long getLong(int pos) { + return array.getLong(pos); + } + + @Override + public float getFloat(int pos) { + return array.getFloat(pos); + } + + @Override + public 
double getDouble(int pos) { + return array.getDouble(pos); + } + + @Override + public BinaryString getString(int pos) { + return array.getString(pos); + } + + @Override + public Decimal getDecimal(int pos, int precision, int scale) { + return array.getDecimal(pos, precision, scale); + } + + @Override + public Timestamp getTimestamp(int pos, int precision) { + return array.getTimestamp(pos, precision); + } + + @Override + public byte[] getBinary(int pos) { + return array.getBinary(pos); + } + + @Override + public Variant getVariant(int pos) { + return array.getVariant(pos); + } + + @Override + public Blob getBlob(int pos) { + return array.getBlob(pos); + } + + @Override + public InternalVector getVector(int pos) { + return array.getVector(pos); + } + + @Override + public boolean[] toBooleanArray() { + return array.toBooleanArray(); + } + + @Override + public byte[] toByteArray() { + return array.toByteArray(); + } + + @Override + public short[] toShortArray() { + return array.toShortArray(); + } + + @Override + public int[] toIntArray() { + return array.toIntArray(); + } + + @Override + public long[] toLongArray() { + return array.toLongArray(); + } + + @Override + public float[] toFloatArray() { + return array.toFloatArray(); + } + + @Override + public double[] toDoubleArray() { + return array.toDoubleArray(); + } + } + + // ======================== ProjectedInternalMap ======================== + + private static class ProjectedInternalMap implements InternalMap { + + private final int size; + private final InternalArray keyArray; + private final InternalArray valueArray; + + ProjectedInternalMap(int size, InternalArray keyArray, InternalArray valueArray) { + this.size = size; + this.keyArray = keyArray; + this.valueArray = valueArray; + } + + @Override + public int size() { + return size; + } + + @Override + public InternalArray keyArray() { + return keyArray; + } + + @Override + public InternalArray valueArray() { + return valueArray; + } + } +} diff --git 
a/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java new file mode 100644 index 000000000000..4a12756bb937 --- /dev/null +++ b/paimon-common/src/test/java/org/apache/paimon/utils/NestedProjectedRowTest.java @@ -0,0 +1,720 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.utils; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericArray; +import org.apache.paimon.data.GenericMap; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalMap; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.BooleanType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.MapType; +import org.apache.paimon.types.MultisetType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.SmallIntType; +import org.apache.paimon.types.TinyIntType; +import org.apache.paimon.types.VarBinaryType; +import org.apache.paimon.types.VarCharType; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Test for {@link NestedProjectedRow}. 
*/ +public class NestedProjectedRowTest { + + @Test + void testReturnNullWhenSchemasAreEqual() { + RowType schema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType()))); + assertThat(NestedProjectedRow.create(schema, schema)).isNull(); + } + + @Test + void testTopLevelProjection() { + // data: ROW + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType()), + new DataField(2, "c", new BigIntType()))); + + // projected: ROW + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(2, "c", new BigIntType()), + new DataField(0, "a", new IntType()))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = GenericRow.of(42, BinaryString.fromString("hello"), 100L); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(2); + assertThat(projected.getLong(0)).isEqualTo(100L); + assertThat(projected.getInt(1)).isEqualTo(42); + } + + @Test + void testTopLevelFieldSubset() { + // data: ROW + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType()), + new DataField(2, "c", new DoubleType()))); + + // projected: ROW + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "b", new VarCharType()))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = GenericRow.of(1, BinaryString.fromString("world"), 3.14); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + assertThat(projected.getString(0)).isEqualTo(BinaryString.fromString("world")); + } + + @Test + void testNestedRowProjection() { + // data: ROW(1)> + RowType nestedType = + 
new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()), + new DataField(12, "z", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", nestedType))); + + // projected: ROW(1)> + RowType projectedNestedType = + new RowType(Arrays.asList(new DataField(12, "z", new IntType()))); + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "r", projectedNestedType))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow innerRow = GenericRow.of(10, 20, 30); + GenericRow row = GenericRow.of(1, innerRow); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + InternalRow projectedInner = projected.getRow(0, 1); + assertThat(projectedInner.getFieldCount()).isEqualTo(1); + assertThat(projectedInner.getInt(0)).isEqualTo(30); + } + + @Test + void testNestedRowProjectionMultipleFields() { + // data: ROW(1)> + RowType nestedType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()), + new DataField(12, "c", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", nestedType))); + + // projected: ROW(1)> + RowType projectedNestedType = + new RowType( + Arrays.asList( + new DataField(12, "c", new IntType()), + new DataField(10, "a", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", projectedNestedType))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow innerRow = GenericRow.of(10, 20, 30); + GenericRow row = GenericRow.of(1, innerRow); + InternalRow 
projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(2); + assertThat(projected.getInt(0)).isEqualTo(1); + InternalRow projectedInner = projected.getRow(1, 2); + assertThat(projectedInner.getInt(0)).isEqualTo(30); + assertThat(projectedInner.getInt(1)).isEqualTo(10); + } + + @Test + void testDeeplyNestedProjection() { + // data: ROW>> + RowType level2 = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()), + new DataField(12, "z", new IntType()))); + RowType level1 = new RowType(Arrays.asList(new DataField(5, "b", level2))); + RowType dataSchema = new RowType(Arrays.asList(new DataField(0, "a", level1))); + + // projected: ROW>> + RowType projLevel2 = new RowType(Arrays.asList(new DataField(11, "y", new IntType()))); + RowType projLevel1 = new RowType(Arrays.asList(new DataField(5, "b", projLevel2))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(0, "a", projLevel1))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow l2Row = GenericRow.of(10, 20, 30); + GenericRow l1Row = GenericRow.of(l2Row); + GenericRow row = GenericRow.of(l1Row); + InternalRow projected = projection.replaceRow(row); + + InternalRow projL1 = projected.getRow(0, 1); + InternalRow projL2 = projL1.getRow(0, 1); + assertThat(projL2.getInt(0)).isEqualTo(20); + } + + @Test + void testNestedRowWithoutInnerProjection() { + // data: ROW(1)> + RowType nestedType = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", nestedType))); + + // projected: ROW(1)> (nested is unchanged, only top-level + // pruned) + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", nestedType))); + + 
NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow innerRow = GenericRow.of(10, 20); + GenericRow row = GenericRow.of(1, innerRow); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + InternalRow projectedInner = projected.getRow(0, 2); + assertThat(projectedInner.getInt(0)).isEqualTo(10); + assertThat(projectedInner.getInt(1)).isEqualTo(20); + } + + @Test + void testNullHandling() { + // data: ROW(1)> + RowType nestedType = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "r", nestedType))); + + // projected: ROW(1)> + RowType projectedNestedType = + new RowType(Arrays.asList(new DataField(11, "y", new IntType()))); + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "r", projectedNestedType))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // null nested row + GenericRow row = GenericRow.of(1, null); + InternalRow projected = projection.replaceRow(row); + assertThat(projected.isNullAt(0)).isTrue(); + + // non-null nested row with null field + GenericRow innerRow = GenericRow.of(10, null); + GenericRow row2 = GenericRow.of(1, innerRow); + InternalRow projected2 = projection.replaceRow(row2); + assertThat(projected2.isNullAt(0)).isFalse(); + InternalRow projectedInner = projected2.getRow(0, 1); + assertThat(projectedInner.isNullAt(0)).isTrue(); + } + + @Test + void testReplaceRowIsReusable() { + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "b", new IntType()))); + + 
NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row1 = GenericRow.of(1, 10); + assertThat(projection.replaceRow(row1).getInt(0)).isEqualTo(10); + + GenericRow row2 = GenericRow.of(2, 20); + assertThat(projection.replaceRow(row2).getInt(0)).isEqualTo(20); + } + + @Test + void testRowKindPreserved() { + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "b", new IntType()))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = GenericRow.of(1, 10); + row.setRowKind(org.apache.paimon.types.RowKind.DELETE); + InternalRow projected = projection.replaceRow(row); + assertThat(projected.getRowKind()).isEqualTo(org.apache.paimon.types.RowKind.DELETE); + } + + @Test + void testMultipleNestedRows() { + // data: ROW(0), r2 ROW(1)> + RowType nested1 = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType nested2 = + new RowType( + Arrays.asList( + new DataField(20, "x", new VarCharType()), + new DataField(21, "y", new VarCharType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", nested1), new DataField(1, "r2", nested2))); + + // projected: ROW(0), r2 ROW(1)> + RowType projNested1 = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projNested2 = new RowType(Arrays.asList(new DataField(21, "y", new VarCharType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", projNested1), + new DataField(1, "r2", projNested2))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow inner1 = 
GenericRow.of(10, 20); + GenericRow inner2 = + GenericRow.of(BinaryString.fromString("hello"), BinaryString.fromString("world")); + GenericRow row = GenericRow.of(inner1, inner2); + InternalRow projected = projection.replaceRow(row); + + InternalRow projInner1 = projected.getRow(0, 1); + assertThat(projInner1.getInt(0)).isEqualTo(20); + + InternalRow projInner2 = projected.getRow(1, 1); + assertThat(projInner2.getString(0)).isEqualTo(BinaryString.fromString("world")); + } + + @Test + void testAllDataTypes() { + // data with various types, each with unique field ID + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "f_bool", new BooleanType()), + new DataField(1, "f_byte", new TinyIntType()), + new DataField(2, "f_short", new SmallIntType()), + new DataField(3, "f_int", new IntType()), + new DataField(4, "f_long", new BigIntType()), + new DataField(5, "f_float", new FloatType()), + new DataField(6, "f_double", new DoubleType()), + new DataField(7, "f_string", new VarCharType()), + new DataField(8, "f_binary", new VarBinaryType()))); + + // project a subset in different order + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(6, "f_double", new DoubleType()), + new DataField(0, "f_bool", new BooleanType()), + new DataField(7, "f_string", new VarCharType()), + new DataField(1, "f_byte", new TinyIntType()))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericRow row = + GenericRow.of( + true, + (byte) 7, + (short) 16, + 42, + 100L, + 1.5f, + 3.14, + BinaryString.fromString("test"), + new byte[] {1, 2, 3}); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getDouble(0)).isEqualTo(3.14); + assertThat(projected.getBoolean(1)).isTrue(); + assertThat(projected.getString(2)).isEqualTo(BinaryString.fromString("test")); + assertThat(projected.getByte(3)).isEqualTo((byte) 7); + } + + @Test + void 
testFieldNameMismatchThrows() { + // data: ROW + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new IntType()))); + + // projected: same field id=1 but wrong name "wrong" + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(1, "wrong", new IntType()))); + + assertThatThrownBy(() -> NestedProjectedRow.create(dataSchema, projectedSchema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Field name mismatch") + .hasMessageContaining("'b'") + .hasMessageContaining("'wrong'"); + } + + @Test + void testNestedFieldNameMismatchThrows() { + // data: ROW(0)> + RowType nestedType = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "y", new IntType()))); + RowType dataSchema = new RowType(Arrays.asList(new DataField(0, "r", nestedType))); + + // projected: nested field id=11 but wrong name "wrong_name" + RowType projectedNestedType = + new RowType(Arrays.asList(new DataField(11, "wrong_name", new IntType()))); + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(0, "r", projectedNestedType))); + + assertThatThrownBy(() -> NestedProjectedRow.create(dataSchema, projectedSchema)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Field name mismatch") + .hasMessageContaining("'y'") + .hasMessageContaining("'wrong_name'"); + } + + @Test + void testArrayElementProjection() { + // data: ROW>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + // projected: ROW>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(projectedElementType)))); + + 
NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // arr = [ROW, ROW] + GenericArray array = + new GenericArray(new Object[] {GenericRow.of(1, 100), GenericRow.of(2, 200)}); + GenericRow row = GenericRow.of(array); + InternalRow projected = projection.replaceRow(row); + + InternalArray projectedArray = projected.getArray(0); + assertThat(projectedArray.size()).isEqualTo(2); + assertThat(projectedArray.getRow(0, 1).getInt(0)).isEqualTo(100); + assertThat(projectedArray.getRow(1, 1).getInt(0)).isEqualTo(200); + } + + @Test + void testArrayElementProjectionWithNull() { + // data: ROW>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + // projected: ROW>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(projectedElementType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // arr = [ROW, null] + GenericArray array = new GenericArray(new Object[] {GenericRow.of(1, 100), null}); + GenericRow row = GenericRow.of(array); + InternalRow projected = projection.replaceRow(row); + + InternalArray projectedArray = projected.getArray(0); + assertThat(projectedArray.size()).isEqualTo(2); + assertThat(projectedArray.getRow(0, 1).getInt(0)).isEqualTo(100); + assertThat(projectedArray.isNullAt(1)).isTrue(); + } + + @Test + void testMapValueProjection() { + // data: ROW>(0)> + RowType valueType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + 
new DataField(0, "m", new MapType(new IntType(), valueType)))); + + // projected: ROW>(0)> + RowType projectedValueType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, "m", new MapType(new IntType(), projectedValueType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // m = {1 -> ROW, 2 -> ROW} + Map mapData = new HashMap<>(); + mapData.put(1, GenericRow.of(10, 100)); + mapData.put(2, GenericRow.of(20, 200)); + GenericRow row = GenericRow.of(new GenericMap(mapData)); + InternalRow projected = projection.replaceRow(row); + + InternalMap projectedMap = projected.getMap(0); + assertThat(projectedMap.size()).isEqualTo(2); + InternalArray values = projectedMap.valueArray(); + InternalArray keys = projectedMap.keyArray(); + for (int i = 0; i < 2; i++) { + int key = keys.getInt(i); + int b = values.getRow(i, 1).getInt(0); + if (key == 1) { + assertThat(b).isEqualTo(100); + } else { + assertThat(b).isEqualTo(200); + } + } + } + + @Test + void testArrayWithNoProjectionNeeded() { + // data: ROW>(0), id INT(1)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(elementType)), + new DataField(1, "id", new IntType()))); + + // projected: ROW>(0)> - full element, just drop id + RowType projectedSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + GenericArray array = new GenericArray(new Object[] {GenericRow.of(1, 2)}); + GenericRow row = GenericRow.of(array, 99); + InternalRow projected = projection.replaceRow(row); + + InternalArray 
projectedArray = projected.getArray(0); + assertThat(projectedArray.size()).isEqualTo(1); + InternalRow element = projectedArray.getRow(0, 2); + assertThat(element.getInt(0)).isEqualTo(1); + assertThat(element.getInt(1)).isEqualTo(2); + } + + @Test + void testNestedArrayProjection() { + // data: ROW>>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField( + 0, "arr", new ArrayType(new ArrayType(elementType))))); + + // projected: ROW>>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, + "arr", + new ArrayType(new ArrayType(projectedElementType))))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // arr = [[ROW, ROW]] + GenericArray innerArray = + new GenericArray(new Object[] {GenericRow.of(1, 100), GenericRow.of(2, 200)}); + GenericArray outerArray = new GenericArray(new Object[] {innerArray}); + GenericRow row = GenericRow.of(outerArray); + InternalRow projected = projection.replaceRow(row); + + InternalArray projOuter = projected.getArray(0); + assertThat(projOuter.size()).isEqualTo(1); + InternalArray projInner = projOuter.getArray(0); + assertThat(projInner.size()).isEqualTo(2); + assertThat(projInner.getRow(0, 1).getInt(0)).isEqualTo(100); + assertThat(projInner.getRow(1, 1).getInt(0)).isEqualTo(200); + } + + @Test + void testMapWithArrayValueProjection() { + // data: ROW>>(0)> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField( + 0, + "m", + new MapType(new IntType(), new ArrayType(elementType))))); + + // projected: 
ROW>>(0)> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, + "m", + new MapType( + new IntType(), + new ArrayType(projectedElementType))))); + + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + // m = {1 -> [ROW]} + Map mapData = new HashMap<>(); + mapData.put(1, new GenericArray(new Object[] {GenericRow.of(10, 100)})); + GenericRow row = GenericRow.of(new GenericMap(mapData)); + InternalRow projected = projection.replaceRow(row); + + InternalMap projectedMap = projected.getMap(0); + InternalArray values = projectedMap.valueArray(); + InternalArray valueArr = values.getArray(0); + assertThat(valueArr.getRow(0, 1).getInt(0)).isEqualTo(100); + } + + @Test + void testMultisetDoesNotThrow() { + // data: ROW(1)> + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "ms", new MultisetType(new VarCharType())))); + + // projected: ROW(1)> + RowType projectedSchema = + new RowType( + Arrays.asList(new DataField(1, "ms", new MultisetType(new VarCharType())))); + + // Should not throw ClassCastException + NestedProjectedRow projection = NestedProjectedRow.create(dataSchema, projectedSchema); + assertThat(projection).isNotNull(); + + Map msData = new HashMap<>(); + msData.put(BinaryString.fromString("hello"), 2); + GenericRow row = GenericRow.of(42, new GenericMap(msData)); + InternalRow projected = projection.replaceRow(row); + + assertThat(projected.getFieldCount()).isEqualTo(1); + InternalMap ms = projected.getMap(0); + assertThat(ms.size()).isEqualTo(1); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java new file mode 100644 index 000000000000..e169ce239875 --- /dev/null +++ 
b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockInput.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.Timestamp; + +/** A cursor over a byte array that reads primitives in little-endian order. 
*/
+class BlockInput {
+
+    // A 32-bit varint carries at most 5 groups of 7 bits, i.e. shifts 0, 7, 14, 21 and 28.
+    private static final int MAX_VAR_INT_SHIFT = 28;
+
+    final byte[] data;
+    int position;
+
+    /** Wraps {@code data} (the array is not copied) with the cursor at index 0. */
+    BlockInput(byte[] data) {
+        this.data = data;
+        this.position = 0;
+    }
+
+    /** Reads one byte and returns {@code true} for any non-zero value. */
+    boolean readBoolean() {
+        return data[position++] != 0;
+    }
+
+    /** Reads one byte. */
+    byte readByte() {
+        return data[position++];
+    }
+
+    /** Reads a 2-byte little-endian short. */
+    short readShort() {
+        short v = (short) ((data[position] & 0xFF) | ((data[position + 1] & 0xFF) << 8));
+        position += 2;
+        return v;
+    }
+
+    /** Reads a 4-byte little-endian int. */
+    int readInt() {
+        // Shares the decoding with the static helper; the cursor only moves after the read
+        // succeeded, exactly as before.
+        int v = readIntLE(data, position);
+        position += 4;
+        return v;
+    }
+
+    /** Reads an 8-byte little-endian long. */
+    long readLong() {
+        long v =
+                (data[position] & 0xFFL)
+                        | ((data[position + 1] & 0xFFL) << 8)
+                        | ((data[position + 2] & 0xFFL) << 16)
+                        | ((data[position + 3] & 0xFFL) << 24)
+                        | ((data[position + 4] & 0xFFL) << 32)
+                        | ((data[position + 5] & 0xFFL) << 40)
+                        | ((data[position + 6] & 0xFFL) << 48)
+                        | ((data[position + 7] & 0xFFL) << 56);
+        position += 8;
+        return v;
+    }
+
+    /** Reads a float stored as the 4-byte little-endian form of its int bits. */
+    float readFloat() {
+        return Float.intBitsToFloat(readInt());
+    }
+
+    /** Reads a double stored as the 8-byte little-endian form of its long bits. */
+    double readDouble() {
+        return Double.longBitsToDouble(readLong());
+    }
+
+    /**
+     * Reads a variable-length int: 7 payload bits per byte, lowest group first, with the high
+     * bit of each byte flagging that another byte follows.
+     *
+     * @throws IllegalStateException if the continuation bit is still set on the fifth byte
+     */
+    int readVarInt() {
+        int result = 0;
+        for (int shift = 0; shift <= MAX_VAR_INT_SHIFT; shift += 7) {
+            byte b = data[position++];
+            result |= (b & 0x7F) << shift;
+            if ((b & 0x80) == 0) {
+                return result;
+            }
+        }
+        // Without this bound the shift distance would wrap (Java masks it to 5 bits) and a
+        // corrupt block would be decoded into an arbitrary value instead of being rejected.
+        throw new IllegalStateException(
+                "Malformed varint: longer than 5 bytes, ending at position " + position);
+    }
+
+    /** Reads a varint length followed by that many bytes, wrapped as a {@link BinaryString}. */
+    BinaryString readString() {
+        int length = readLength();
+        BinaryString s = BinaryString.fromBytes(data, position, length);
+        position += length;
+        return s;
+    }
+
+    /** Reads a varint length followed by that many bytes, returned as a fresh array. */
+    byte[] readBytes() {
+        int length = readLength();
+        byte[] result = new byte[length];
+        System.arraycopy(data, position, result, 0, length);
+        position += length;
+        return result;
+    }
+
+    /**
+     * Reads a decimal: an 8-byte unscaled long when {@code Decimal.isCompact(precision)},
+     * otherwise a length-prefixed unscaled byte array.
+     */
+    Decimal readDecimal(int precision, int scale) {
+        if (Decimal.isCompact(precision)) {
+            return Decimal.fromUnscaledLong(readLong(), precision, scale);
+        }
+        return Decimal.fromUnscaledBytes(readBytes(), precision, scale);
+    }
+
+    /**
+     * Reads a timestamp: always an 8-byte epoch-millis value, followed by a varint that is
+     * passed as the second argument of {@code Timestamp.fromEpochMillis} unless {@code
+     * Timestamp.isCompact(precision)}.
+     */
+    Timestamp readTimestamp(int precision) {
+        long millis = readLong();
+        if (Timestamp.isCompact(precision)) {
+            return Timestamp.fromEpochMillis(millis);
+        }
+        return Timestamp.fromEpochMillis(millis, readVarInt());
+    }
+
+    /** Decodes a 4-byte little-endian int from {@code buf} at {@code offset}. */
+    static int readIntLE(byte[] buf, int offset) {
+        return (buf[offset] & 0xFF)
+                | ((buf[offset + 1] & 0xFF) << 8)
+                | ((buf[offset + 2] & 0xFF) << 16)
+                | ((buf[offset + 3] & 0xFF) << 24);
+    }
+
+    /**
+     * Reads a varint length prefix and rejects values that cannot describe a payload inside
+     * {@link #data}, so corrupt input fails here rather than through a negative array size or
+     * an oversized allocation.
+     */
+    private int readLength() {
+        int length = readVarInt();
+        if (length < 0 || length > data.length - position) {
+            throw new IllegalStateException(
+                    "Invalid length "
+                            + length
+                            + " at position "
+                            + position
+                            + " (buffer size "
+                            + data.length
+                            + ")");
+        }
+        return length;
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java new file mode 100644 index 000000000000..3e457a7832ce --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockOutput.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.Timestamp; + +/** A resizable byte buffer with little-endian primitive write operations. 
*/
+class BlockOutput {
+
+    byte[] buffer;
+    int position;
+
+    /** Allocates a buffer of {@code initialCapacity} bytes with the write cursor at index 0. */
+    BlockOutput(int initialCapacity) {
+        this.buffer = new byte[initialCapacity];
+        this.position = 0;
+    }
+
+    /** Writes one byte: 1 for {@code true}, 0 for {@code false}. */
+    void writeBoolean(boolean value) {
+        ensureCapacity(1);
+        buffer[position++] = (byte) (value ? 1 : 0);
+    }
+
+    /** Writes one byte. */
+    void writeByte(byte value) {
+        ensureCapacity(1);
+        buffer[position++] = value;
+    }
+
+    /** Writes a short as 2 bytes, least significant byte first. */
+    void writeShort(short value) {
+        ensureCapacity(2);
+        buffer[position++] = (byte) (value & 0xFF);
+        buffer[position++] = (byte) ((value >>> 8) & 0xFF);
+    }
+
+    /** Writes an int as 4 bytes, least significant byte first. */
+    void writeInt(int value) {
+        ensureCapacity(4);
+        buffer[position++] = (byte) (value & 0xFF);
+        buffer[position++] = (byte) ((value >>> 8) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 16) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 24) & 0xFF);
+    }
+
+    /** Writes a long as 8 bytes, least significant byte first. */
+    void writeLong(long value) {
+        ensureCapacity(8);
+        buffer[position++] = (byte) (value & 0xFF);
+        buffer[position++] = (byte) ((value >>> 8) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 16) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 24) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 32) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 40) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 48) & 0xFF);
+        buffer[position++] = (byte) ((value >>> 56) & 0xFF);
+    }
+
+    /** Writes the raw int bits of {@code value} (NaN payloads are kept as-is). */
+    void writeFloat(float value) {
+        writeInt(Float.floatToRawIntBits(value));
+    }
+
+    /** Writes the raw long bits of {@code value} (NaN payloads are kept as-is). */
+    void writeDouble(double value) {
+        writeLong(Double.doubleToRawLongBits(value));
+    }
+
+    /**
+     * Writes {@code value} as a variable-length int: 7 payload bits per byte, lowest group
+     * first, with the high bit set on every byte except the last. The shift is unsigned, so a
+     * negative value occupies the full 5 bytes.
+     */
+    void writeVarInt(int value) {
+        // 5 bytes is the worst case for 32 bits; reserving it up front keeps the loop free
+        // of capacity checks.
+        ensureCapacity(5);
+        while ((value & ~0x7F) != 0) {
+            buffer[position++] = (byte) ((value & 0x7F) | 0x80);
+            value >>>= 7;
+        }
+        buffer[position++] = (byte) value;
+    }
+
+    /** Writes the array length as a varint, followed by the bytes themselves. */
+    void writeBytes(byte[] value) {
+        writeVarInt(value.length);
+        ensureCapacity(value.length);
+        System.arraycopy(value, 0, buffer, position, value.length);
+        position += value.length;
+    }
+
+    /**
+     * Writes a decimal: its unscaled long (8 bytes) when {@code Decimal.isCompact(precision)},
+     * otherwise its unscaled bytes with a varint length prefix.
+     */
+    void writeDecimal(Decimal value, int precision) {
+        if (Decimal.isCompact(precision)) {
+            writeLong(value.toUnscaledLong());
+        } else {
+            writeBytes(value.toUnscaledBytes());
+        }
+    }
+
+    /**
+     * Writes a timestamp: always the millisecond value as 8 bytes, followed by {@code
+     * getNanoOfMillisecond()} as a varint unless {@code Timestamp.isCompact(precision)}.
+     */
+    void writeTimestamp(Timestamp value, int precision) {
+        writeLong(value.getMillisecond());
+        if (!Timestamp.isCompact(precision)) {
+            writeVarInt(value.getNanoOfMillisecond());
+        }
+    }
+
+    /**
+     * Makes sure {@code additional} more bytes fit after {@link #position}, growing the buffer
+     * to at least double its size when they do not.
+     *
+     * @throws IllegalStateException if the required size does not fit in an int
+     */
+    void ensureCapacity(int additional) {
+        int required = position + additional;
+        if (additional > 0 && required < 0) {
+            // position + additional wrapped around. Previously this skipped the growth
+            // silently and the caller failed later with an unrelated index error.
+            throw new IllegalStateException(
+                    "Block buffer too large: position="
+                            + position
+                            + ", additional="
+                            + additional);
+        }
+        if (required > buffer.length) {
+            // Doubling can itself wrap to a negative int for very large buffers; Math.max
+            // then falls back to the exact required size.
+            int newSize = Math.max(buffer.length * 2, required);
+            byte[] newBuffer = new byte[newSize];
+            System.arraycopy(buffer, 0, newBuffer, 0, position);
+            buffer = newBuffer;
+        }
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java new file mode 100644 index 000000000000..e50210dc4cf6 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/BlockPrefetcher.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.fs.VectoredReadable; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; + +/** + * Prefetches and decompresses blocks ahead of consumption. + * + *

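<p>Two IO strategies depending on whether the stream supports positional reads:
+ *
+ * <ul>
+ *   <li>{@code VectoredReadStrategy} when the stream implements {@code VectoredReadable};
+ *   <li>{@code SequentialReadStrategy} for every other {@code SeekableInputStream}.
+ * </ul>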
+ */
+class BlockPrefetcher implements Closeable {
+
+    private final SeekableInputStream inputStream;
+    private final ReadStrategy strategy;
+
+    /**
+     * Picks the read strategy for {@code inputStream}: a {@code VectoredReadStrategy} when the
+     * stream is a {@link VectoredReadable}, a {@code SequentialReadStrategy} otherwise.
+     *
+     * @param inputStream stream over the file; closed by {@link #close()}
+     * @param blockIndex block index handed to the strategy
+     * @param blocksToRead block indices handed to the strategy
+     */
+    BlockPrefetcher(SeekableInputStream inputStream, RowBlockIndex blockIndex, int[] blocksToRead) {
+        this.inputStream = inputStream;
+        if (inputStream instanceof VectoredReadable) {
+            this.strategy =
+                    new VectoredReadStrategy(
+                            (VectoredReadable) inputStream, blockIndex, blocksToRead);
+        } else {
+            this.strategy = new SequentialReadStrategy(inputStream, blockIndex, blocksToRead);
+        }
+    }
+
+    /** Delegates to the selected strategy's {@code nextBlock()}. */
+    byte[] nextBlock() throws IOException {
+        return strategy.nextBlock();
+    }
+
+    /** Delegates to the selected strategy's {@code currentBlockIdx()}. */
+    int currentBlockIdx() {
+        return strategy.currentBlockIdx();
+    }
+
+    /**
+     * Closes the strategy first and then the underlying stream.
+     *
+     * @throws IOException if closing the strategy or the stream fails
+     */
+    @Override
+    public void close() throws IOException {
+        // try-with-resources closes the stream even when strategy.close() throws (it used to
+        // leak in that case); a failure while closing the stream is then added as a
+        // suppressed exception instead of hiding the first one.
+        try (SeekableInputStream ignored = inputStream) {
+            strategy.close();
+        }
+    }
+
+    /**
+     * Blocks until {@code future} completes and returns its value.
+     *
+     * @throws IOException if the waiting thread is interrupted (the interrupt flag is restored
+     *     first), or if the future failed; an {@link IOException} cause is rethrown as-is and
+     *     any other cause is wrapped
+     */
+    static byte[] awaitFuture(CompletableFuture<byte[]> future) throws IOException {
+        try {
+            return future.get();
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new IOException("Interrupted while waiting for prefetch", e);
+        } catch (ExecutionException e) {
+            Throwable cause = e.getCause();
+            if (cause instanceof IOException) {
+                throw (IOException) cause;
+            }
+            throw new IOException("Prefetch failed", cause);
+        }
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java b/paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java new file mode 100644 index 000000000000..2a5a07167903 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/ReadStrategy.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import java.io.Closeable; +import java.io.IOException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import static org.apache.paimon.utils.ThreadUtils.newDaemonThreadFactory;
+
+/**
+ * Strategy for reading and prefetching compressed blocks.
+ *
+ * <p>Being {@link Closeable}, an implementation is expected to release whatever it holds in
+ * {@code close()}. NOTE(review): what exactly that is depends on the implementation, which is
+ * not visible from this interface.
+ */
+interface ReadStrategy extends Closeable {
+
+    // Interface fields are implicitly public static final, so this is one executor shared by
+    // everything that references it: a cached (unbounded, on-demand) thread pool whose threads
+    // come from newDaemonThreadFactory("ROW-FORMAT-IO"). Presumably that yields daemon threads
+    // named with this prefix; confirm against ThreadUtils. No shutdown of the pool is visible
+    // here.
+    ExecutorService IO_POOL =
+            Executors.newCachedThreadPool(newDaemonThreadFactory("ROW-FORMAT-IO"));
+
+    /**
+     * Returns the bytes of the next block.
+     *
+     * <p>NOTE(review): whether the bytes are already decompressed and how the end of the block
+     * list is signalled is decided by the implementations; confirm there.
+     *
+     * @throws IOException if reading the block fails
+     */
+    byte[] nextBlock() throws IOException;
+
+    /**
+     * Returns the strategy's current block index. NOTE(review): presumably the index of the
+     * block most recently returned by {@link #nextBlock()}; confirm with the implementations.
+     */
+    int currentBlockIdx();
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java new file mode 100644 index 000000000000..2987d28bf9c5 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockIndex.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.utils.DeltaVarintCompressor;
+import org.apache.paimon.utils.IOUtils;
+import org.apache.paimon.utils.VarLengthIntUtils;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * Block index that maps row numbers to block locations.
+ *
+ * <p>Keeps three arrays indexed by block number: compressed size, uncompressed size and row
+ * start. Block file offsets are not stored; they are derived as the running sum of the compressed
+ * sizes, which places block 0 at offset 0.
+ *
+ * <p>Serialized form: three sections in the order above, each written as a var-int byte length
+ * followed by that many bytes of {@link DeltaVarintCompressor} output.
+ */
+class RowBlockIndex {
+
+    private final long[] blockOffsets;
+    private final long[] blockCompressedSizes;
+    private final long[] blockUncompressedSizes;
+    private final long[] blockRowStarts;
+
+    /**
+     * Creates an index over the given arrays. The arrays are stored by reference, not copied, and
+     * the block offsets are computed eagerly from {@code blockCompressedSizes}.
+     */
+    RowBlockIndex(
+            long[] blockCompressedSizes, long[] blockUncompressedSizes, long[] blockRowStarts) {
+        this.blockCompressedSizes = blockCompressedSizes;
+        this.blockUncompressedSizes = blockUncompressedSizes;
+        this.blockRowStarts = blockRowStarts;
+        this.blockOffsets = computeOffsets(blockCompressedSizes);
+    }
+
+    /** Number of blocks, taken from the length of the compressed-size array. */
+    int blockCount() {
+        return blockCompressedSizes.length;
+    }
+
+    /** Offset of the block, i.e. the sum of the compressed sizes of all preceding blocks. */
+    long blockOffset(int blockIdx) {
+        return blockOffsets[blockIdx];
+    }
+
+    long blockCompressedSize(int blockIdx) {
+        return blockCompressedSizes[blockIdx];
+    }
+
+    long blockUncompressedSize(int blockIdx) {
+        return blockUncompressedSizes[blockIdx];
+    }
+
+    long blockRowStart(int blockIdx) {
+        return blockRowStarts[blockIdx];
+    }
+
+    /**
+     * Serializes the index: compressed sizes, uncompressed sizes, then row starts, each as a
+     * length-prefixed delta-var-int section.
+     *
+     * @throws IOException if writing to {@code out} fails
+     */
+    void writeTo(PositionOutputStream out) throws IOException {
+        writeArray(out, DeltaVarintCompressor.compress(blockCompressedSizes));
+        writeArray(out, DeltaVarintCompressor.compress(blockUncompressedSizes));
+        writeArray(out, DeltaVarintCompressor.compress(blockRowStarts));
+    }
+
+    /**
+     * Reads {@code indexLength} bytes starting at {@code indexOffset} and parses them.
+     *
+     * @throws IOException if {@code indexLength} is negative or the bytes cannot be read
+     * @throws IllegalArgumentException if the bytes are not a well-formed index
+     */
+    static RowBlockIndex readFrom(SeekableInputStream in, long indexOffset, int indexLength)
+            throws IOException {
+        if (indexLength < 0) {
+            // Fail with a descriptive error instead of a NegativeArraySizeException.
+            throw new IOException("Invalid block index length: " + indexLength);
+        }
+        in.seek(indexOffset);
+        byte[] indexData = new byte[indexLength];
+        IOUtils.readFully(in, indexData);
+        return readFrom(indexData);
+    }
+
+    /**
+     * Parses an index from its serialized bytes (see {@link #writeTo}).
+     *
+     * @throws IllegalArgumentException if a section is truncated or its length prefix is malformed
+     */
+    static RowBlockIndex readFrom(byte[] indexData) {
+        // Single-element array used as a mutable read position shared by the helpers.
+        int[] cursor = {0};
+        long[] blockCompressedSizes = readArray(indexData, cursor);
+        long[] blockUncompressedSizes = readArray(indexData, cursor);
+        long[] blockRowStarts = readArray(indexData, cursor);
+        return new RowBlockIndex(blockCompressedSizes, blockUncompressedSizes, blockRowStarts);
+    }
+
+    /** Running sum of the compressed sizes; entry {@code i} is where block {@code i} begins. */
+    private static long[] computeOffsets(long[] compressedSizes) {
+        long[] offsets = new long[compressedSizes.length];
+        long offset = 0;
+        for (int i = 0; i < compressedSizes.length; i++) {
+            offsets[i] = offset;
+            offset += compressedSizes[i];
+        }
+        return offsets;
+    }
+
+    /** Writes one section: var-int length of {@code encoded}, then {@code encoded} itself. */
+    private static void writeArray(PositionOutputStream out, byte[] encoded) throws IOException {
+        byte[] lenBuf = new byte[VarLengthIntUtils.MAX_VAR_INT_SIZE];
+        int lenBytes = VarLengthIntUtils.encodeInt(lenBuf, 0, encoded.length);
+        out.write(lenBuf, 0, lenBytes);
+        out.write(encoded);
+    }
+
+    /** Reads one length-prefixed section at the cursor and advances the cursor past it. */
+    private static long[] readArray(byte[] data, int[] cursor) {
+        int length = decodeVarInt(data, cursor);
+        int start = cursor[0];
+        if (length < 0 || length > data.length - start) {
+            throw new IllegalArgumentException(
+                    "Truncated block index: section of "
+                            + length
+                            + " bytes at offset "
+                            + start
+                            + " exceeds index size "
+                            + data.length);
+        }
+        cursor[0] = start + length;
+        return DeltaVarintCompressor.decompress(Arrays.copyOfRange(data, start, start + length));
+    }
+
+    /**
+     * Decodes a var-int (7 payload bits per byte, least-significant group first, high bit set on
+     * every byte except the last) and advances the cursor by the bytes actually consumed, so the
+     * position never depends on re-deriving the encoded size from the decoded value.
+     */
+    private static int decodeVarInt(byte[] data, int[] cursor) {
+        int pos = cursor[0];
+        int result = 0;
+        for (int i = 0; i < VarLengthIntUtils.MAX_VAR_INT_SIZE; i++) {
+            if (pos >= data.length) {
+                throw new IllegalArgumentException(
+                        "Truncated block index: var-int at offset "
+                                + cursor[0]
+                                + " runs past index size "
+                                + data.length);
+            }
+            byte b = data[pos++];
+            result |= (b & 0x7F) << (7 * i);
+            if ((b & 0x80) == 0) {
+                cursor[0] = pos;
+                return result;
+            }
+        }
+        throw new IllegalArgumentException(
+                "Malformed block index: var-int at offset "
+                        + cursor[0]
+                        + " is longer than "
+                        + VarLengthIntUtils.MAX_VAR_INT_SIZE
+                        + " bytes");
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java
new file mode 100644
index 000000000000..22027fc3cf50
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockReader.java
@@ -0,0 +1,434 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.data.BinaryVector;
+import org.apache.paimon.data.Blob;
+import org.apache.paimon.data.GenericArray;
+import org.apache.paimon.data.GenericMap;
+import org.apache.paimon.data.GenericRow;
+import org.apache.paimon.data.InternalArray;
+import org.apache.paimon.data.InternalMap;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.variant.GenericVariant;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.IntType;
+import org.apache.paimon.types.MapType;
+import org.apache.paimon.types.MultisetType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.types.VectorType;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.apache.paimon.types.DataTypeChecks.getPrecision;
+import static org.apache.paimon.types.DataTypeChecks.getScale;
+
+/**
+ * Reads rows from a decompressed block by local row index.
+ *
+ * <p>The block is expected to end with one int32 LE start offset per row followed by the row
+ * count as a trailing int32 LE; everything before the offset array is row data. A row is a null
+ * bitmap (one bit per field, bit {@code i % 8} of byte {@code i / 8}, set meaning null) followed
+ * by the non-null field values in field order.
+ *
+ * <p>Every read moves the position of the shared {@link BlockInput}, so concurrent calls on one
+ * instance would interfere with each other.
+ */
+class RowBlockReader {
+
+    private final BlockInput buf;
+    private final int rowCount;
+    private final int offsetArrayStart;
+    private final FieldReader[] fieldReaders;
+    private final int headerSizeInBytes;
+
+    /**
+     * Parses the block trailer and prepares one reader per top-level field of {@code rowType}.
+     *
+     * @throws IllegalArgumentException if the block is too short for its trailer or the stored
+     *     row count does not fit in the block
+     * @throws UnsupportedOperationException if {@code rowType} contains an unsupported type
+     */
+    RowBlockReader(BlockInput buf, RowType rowType) {
+        this.buf = buf;
+        int len = buf.data.length;
+        if (len < Integer.BYTES) {
+            throw new IllegalArgumentException(
+                    "Corrupt row block: " + len + " bytes cannot hold the row count");
+        }
+        this.rowCount = BlockInput.readIntLE(buf.data, len - Integer.BYTES);
+        // Computed in long so that a corrupt row count cannot overflow into a plausible offset.
+        long offsetsStart = (long) len - Integer.BYTES - (long) rowCount * Integer.BYTES;
+        if (rowCount < 0 || offsetsStart < 0) {
+            throw new IllegalArgumentException(
+                    "Corrupt row block: row count "
+                            + rowCount
+                            + " does not fit in a block of "
+                            + len
+                            + " bytes");
+        }
+        this.offsetArrayStart = (int) offsetsStart;
+        this.headerSizeInBytes = bitmapBytes(rowType.getFieldCount());
+        this.fieldReaders = createFieldReaders(rowType);
+    }
+
+    /** Number of rows stored in the block, as recorded in its trailer. */
+    int rowCount() {
+        return rowCount;
+    }
+
+    /**
+     * Deserializes the row at {@code localRowIndex} into a new {@link GenericRow}.
+     *
+     * @throws IndexOutOfBoundsException if {@code localRowIndex} is not in {@code [0, rowCount)};
+     *     without this check an out-of-range index would interpret unrelated block bytes as a row
+     *     offset and return garbage
+     */
+    InternalRow readRow(int localRowIndex) {
+        if (localRowIndex < 0 || localRowIndex >= rowCount) {
+            throw new IndexOutOfBoundsException(
+                    "Row index " + localRowIndex + " out of range, block has " + rowCount + " rows");
+        }
+        buf.position =
+                BlockInput.readIntLE(buf.data, offsetArrayStart + localRowIndex * Integer.BYTES);
+        return readRow(headerSizeInBytes, fieldReaders);
+    }
+
+    // ======================== Row Reading ========================
+
+    /** Reads a null bitmap of {@code headerSize} bytes, then each non-null field in order. */
+    private InternalRow readRow(int headerSize, FieldReader[] readers) {
+        int headerStart = buf.position;
+        buf.position += headerSize;
+
+        GenericRow row = new GenericRow(readers.length);
+        for (int i = 0; i < readers.length; i++) {
+            row.setField(i, isNullBitSet(headerStart, i) ? null : readers[i].read());
+        }
+        return row;
+    }
+
+    /** Number of bytes needed for a bitmap of {@code bitCount} bits. */
+    private static int bitmapBytes(int bitCount) {
+        return (bitCount + 7) / 8;
+    }
+
+    /** Whether bit {@code index} of the bitmap starting at {@code bitmapStart} is set. */
+    private boolean isNullBitSet(int bitmapStart, int index) {
+        return (buf.data[bitmapStart + index / 8] & (1 << (index % 8))) != 0;
+    }
+
+    // ======================== Field Reader Factory ========================
+
+    private FieldReader[] createFieldReaders(RowType rowType) {
+        int arity = rowType.getFieldCount();
+        FieldReader[] readers = new FieldReader[arity];
+        for (int i = 0; i < arity; i++) {
+            readers[i] = createFieldReader(rowType.getTypeAt(i));
+        }
+        return readers;
+    }
+
+    /**
+     * Builds the reader for one data type. Nested types are resolved once here, so reading a
+     * value never has to inspect the type again (vectors excepted, see {@link
+     * #readVectorElements}).
+     */
+    private FieldReader createFieldReader(DataType type) {
+        switch (type.getTypeRoot()) {
+            case CHAR:
+            case VARCHAR:
+                return () -> buf.readString();
+            case BOOLEAN:
+                return () -> buf.readBoolean();
+            case BINARY:
+            case VARBINARY:
+                return () -> buf.readBytes();
+            case DECIMAL:
+                {
+                    int precision = getPrecision(type);
+                    int scale = getScale(type);
+                    return () -> buf.readDecimal(precision, scale);
+                }
+            case TINYINT:
+                return () -> buf.readByte();
+            case SMALLINT:
+                return () -> buf.readShort();
+            case INTEGER:
+            case DATE:
+            case TIME_WITHOUT_TIME_ZONE:
+                return () -> buf.readInt();
+            case BIGINT:
+                return () -> buf.readLong();
+            case FLOAT:
+                return () -> buf.readFloat();
+            case DOUBLE:
+                return () -> buf.readDouble();
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                {
+                    int precision = getPrecision(type);
+                    return () -> buf.readTimestamp(precision);
+                }
+            case VARIANT:
+                return () -> {
+                    // Value bytes are stored first, metadata bytes second.
+                    byte[] value = buf.readBytes();
+                    byte[] metadata = buf.readBytes();
+                    return new GenericVariant(value, metadata);
+                };
+            case BLOB:
+                return () -> Blob.fromData(buf.readBytes());
+            case VECTOR:
+                {
+                    DataType elementType = ((VectorType) type).getElementType();
+                    return () ->
+                            BinaryVector.fromInternalArray(
+                                    readVectorElements(elementType), elementType);
+                }
+            case ARRAY:
+                {
+                    FieldReader elementReader =
+                            createFieldReader(((ArrayType) type).getElementType());
+                    return () -> readArray(elementReader);
+                }
+            case MULTISET:
+                {
+                    // A multiset is stored like a map from element to an INT value.
+                    FieldReader elementReader =
+                            createFieldReader(((MultisetType) type).getElementType());
+                    FieldReader countReader = createFieldReader(new IntType());
+                    return () -> readMap(elementReader, countReader);
+                }
+            case MAP:
+                {
+                    MapType mapType = (MapType) type;
+                    FieldReader keyReader = createFieldReader(mapType.getKeyType());
+                    FieldReader valueReader = createFieldReader(mapType.getValueType());
+                    return () -> readMap(keyReader, valueReader);
+                }
+            case ROW:
+                {
+                    RowType nestedType = (RowType) type;
+                    int nestedHeaderSize = bitmapBytes(nestedType.getFieldCount());
+                    FieldReader[] nestedReaders = createFieldReaders(nestedType);
+                    return () -> readRow(nestedHeaderSize, nestedReaders);
+                }
+            default:
+                throw new UnsupportedOperationException("Unsupported type: " + type.getTypeRoot());
+        }
+    }
+
+    // ======================== Complex Types ========================
+
+    /** Reads a var-int size, a null bitmap of that many bits, then the non-null elements. */
+    private Object[] readElements(FieldReader elementReader) {
+        int size = buf.readVarInt();
+        int nullStart = buf.position;
+        buf.position += bitmapBytes(size);
+
+        Object[] elements = new Object[size];
+        for (int i = 0; i < size; i++) {
+            elements[i] = isNullBitSet(nullStart, i) ? null : elementReader.read();
+        }
+        return elements;
+    }
+
+    private InternalArray readArray(FieldReader elementReader) {
+        return new GenericArray(readElements(elementReader));
+    }
+
+    /** Reads all keys, then all values, and pairs them up by position. */
+    private InternalMap readMap(FieldReader keyReader, FieldReader valueReader) {
+        Object[] keys = readElements(keyReader);
+        Object[] values = readElements(valueReader);
+        Map<Object, Object> map = new HashMap<>(keys.length);
+        for (int i = 0; i < keys.length; i++) {
+            map.put(keys[i], values[i]);
+        }
+        return new GenericMap(map);
+    }
+
+    /**
+     * Reads a var-int size followed by that many primitive elements; vectors carry no null
+     * bitmap. The element type is checked on every read, after the size has been consumed.
+     *
+     * @throws UnsupportedOperationException if the element type is not a supported primitive
+     */
+    private InternalArray readVectorElements(DataType elementType) {
+        int size = buf.readVarInt();
+        switch (elementType.getTypeRoot()) {
+            case BOOLEAN:
+                {
+                    boolean[] values = new boolean[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readBoolean();
+                    }
+                    return new GenericArray(values);
+                }
+            case TINYINT:
+                {
+                    byte[] values = new byte[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readByte();
+                    }
+                    return new GenericArray(values);
+                }
+            case SMALLINT:
+                {
+                    short[] values = new short[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readShort();
+                    }
+                    return new GenericArray(values);
+                }
+            case INTEGER:
+                {
+                    int[] values = new int[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readInt();
+                    }
+                    return new GenericArray(values);
+                }
+            case BIGINT:
+                {
+                    long[] values = new long[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readLong();
+                    }
+                    return new GenericArray(values);
+                }
+            case FLOAT:
+                {
+                    float[] values = new float[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readFloat();
+                    }
+                    return new GenericArray(values);
+                }
+            case DOUBLE:
+                {
+                    double[] values = new double[size];
+                    for (int i = 0; i < size; i++) {
+                        values[i] = buf.readDouble();
+                    }
+                    return new GenericArray(values);
+                }
+            default:
+                throw new UnsupportedOperationException(
+                        "Unsupported vector element type: " + elementType);
+        }
+    }
+
+    // ======================== Interface ========================
+
+    /** Reads one non-null value of a fixed type from the current buffer position. */
+    @FunctionalInterface
+    interface FieldReader {
+        Object read();
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java
new file mode 100644
index 000000000000..4a7f7be0fed5
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowBlockWriter.java
@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.data.DataGetters;
+import org.apache.paimon.data.InternalArray;
+import org.apache.paimon.data.InternalMap;
+import org.apache.paimon.data.InternalRow;
+import org.apache.paimon.data.InternalVector;
+import org.apache.paimon.data.variant.Variant;
+import org.apache.paimon.types.ArrayType;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.types.IntType;
+import org.apache.paimon.types.MapType;
+import org.apache.paimon.types.MultisetType;
+import org.apache.paimon.types.RowType;
+import org.apache.paimon.types.VectorType;
+import org.apache.paimon.utils.IntArrayList;
+
+import static org.apache.paimon.types.DataTypeChecks.getPrecision;
+import static org.apache.paimon.types.DataTypeChecks.getScale;
+
+/**
+ * Accumulates rows by serializing directly into a block buffer.
+ *
+ * <p>Block layout (uncompressed):
+ *
+ * <pre>
+ * [row_0 bytes][row_1 bytes]...[row_N bytes]
+ * [offset_0 (int32 LE)][offset_1]...[offset_N]
+ * [row_count (int32 LE)]
+ * </pre>
+ */
+class RowBlockWriter {
+
+    private final BlockOutput buf;
+    private final IntArrayList offsets;
+    private final FieldWriter[] fieldWriters;
+    private final int headerSizeInBytes;
+
+    /** Prepares one writer per top-level field of {@code rowType}; rows go into {@code buf}. */
+    RowBlockWriter(BlockOutput buf, RowType rowType) {
+        this.buf = buf;
+        this.offsets = new IntArrayList(64);
+        this.headerSizeInBytes = (rowType.getFieldCount() + 7) / 8;
+        this.fieldWriters = createFieldWriters(rowType);
+    }
+
+    /** Records the current buffer position as the row's start offset, then serializes the row. */
+    void writeRow(InternalRow row) {
+        offsets.add(buf.position);
+        writeRow(row, headerSizeInBytes, fieldWriters);
+    }
+
+    /**
+     * Writes a zeroed null bitmap of {@code headerSize} bytes followed by every non-null field;
+     * the bit of a null field is set in the bitmap instead of writing a value.
+     */
+    private void writeRow(InternalRow row, int headerSize, FieldWriter[] writers) {
+        int bitmapStart = buf.position;
+        buf.ensureCapacity(headerSize);
+        for (int b = 0; b < headerSize; b++) {
+            buf.buffer[bitmapStart + b] = 0;
+        }
+        buf.position += headerSize;
+        for (int field = 0; field < writers.length; field++) {
+            if (!row.isNullAt(field)) {
+                writers[field].write(row, field);
+                continue;
+            }
+            // buf.buffer is re-read on each access because field writes may have grown it.
+            buf.buffer[bitmapStart + (field >>> 3)] |= (byte) (1 << (field & 7));
+        }
+    }
+
+    /** Number of rows written since construction or the last {@link #reset()}. */
+    int rowCount() {
+        return offsets.size();
+    }
+
+    /** Size of the finished block: bytes written so far plus the offset array and row count. */
+    int estimatedSize() {
+        return buf.position + offsets.size() * 4 + 4;
+    }
+
+    /**
+     * Appends the offset array and the row count to the buffer and returns a copy of the whole
+     * block. The buffer position is left after the trailer.
+     */
+    byte[] finish() {
+        int rows = offsets.size();
+        int trailerSize = rows * 4 + 4;
+        int totalSize = buf.position + trailerSize;
+        buf.ensureCapacity(trailerSize);
+        for (int r = 0; r < rows; r++) {
+            RowFileFooter.writeIntLE(buf.buffer, buf.position, offsets.get(r));
+            buf.position += 4;
+        }
+        RowFileFooter.writeIntLE(buf.buffer, buf.position, offsets.size());
+        buf.position += 4;
+
+        byte[] block = new byte[totalSize];
+        System.arraycopy(buf.buffer, 0, block, 0, totalSize);
+        return block;
+    }
+
+    /** Forgets all recorded offsets and rewinds the buffer so a new block can be started. */
+    void reset() {
+        offsets.clear();
+        buf.position = 0;
+    }
+
+    // ======================== Factory ========================
+
+    private FieldWriter[] createFieldWriters(RowType rowType) {
+        int arity = rowType.getFieldCount();
+        FieldWriter[] writers = new FieldWriter[arity];
+        for (int i = 0; i < arity; i++) {
+            writers[i] = createFieldWriter(rowType.getTypeAt(i));
+        }
+        return writers;
+    }
+
+    /** Builds the writer for one data type, resolving nested types once up front. */
+    private FieldWriter createFieldWriter(DataType type) {
+        switch (type.getTypeRoot()) {
+            case CHAR:
+            case VARCHAR:
+                return (data, i) -> buf.writeBytes(data.getString(i).toBytes());
+            case BOOLEAN:
+                return (data, i) -> buf.writeBoolean(data.getBoolean(i));
+            case BINARY:
+            case VARBINARY:
+                return (data, i) -> buf.writeBytes(data.getBinary(i));
+            case DECIMAL:
+                {
+                    int precision = getPrecision(type);
+                    int scale = getScale(type);
+                    return (data, i) ->
+                            buf.writeDecimal(data.getDecimal(i, precision, scale), precision);
+                }
+            case TINYINT:
+                return (data, i) -> buf.writeByte(data.getByte(i));
+            case SMALLINT:
+                return (data, i) -> buf.writeShort(data.getShort(i));
+            case INTEGER:
+            case DATE:
+            case TIME_WITHOUT_TIME_ZONE:
+                return (data, i) -> buf.writeInt(data.getInt(i));
+            case BIGINT:
+                return (data, i) -> buf.writeLong(data.getLong(i));
+            case FLOAT:
+                return (data, i) -> buf.writeFloat(data.getFloat(i));
+            case DOUBLE:
+                return (data, i) -> buf.writeDouble(data.getDouble(i));
+            case TIMESTAMP_WITHOUT_TIME_ZONE:
+            case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                {
+                    int precision = getPrecision(type);
+                    return (data, i) ->
+                            buf.writeTimestamp(data.getTimestamp(i, precision), precision);
+                }
+            case VARIANT:
+                return (data, i) -> {
+                    // Value bytes first, metadata bytes second.
+                    Variant variant = data.getVariant(i);
+                    buf.writeBytes(variant.value());
+                    buf.writeBytes(variant.metadata());
+                };
+            case BLOB:
+                return (data, i) -> buf.writeBytes(data.getBlob(i).toData());
+            case VECTOR:
+                {
+                    FieldWriter elementWriter =
+                            createFieldWriter(((VectorType) type).getElementType());
+                    return (data, i) -> {
+                        // Vectors are a var-int size plus the elements, with no null bitmap.
+                        InternalVector vector = data.getVector(i);
+                        int size = vector.size();
+                        buf.writeVarInt(size);
+                        for (int j = 0; j < size; j++) {
+                            elementWriter.write(vector, j);
+                        }
+                    };
+                }
+            case ARRAY:
+                {
+                    FieldWriter elementWriter =
+                            createFieldWriter(((ArrayType) type).getElementType());
+                    return (data, i) -> writeArray(data.getArray(i), elementWriter);
+                }
+            case MULTISET:
+                {
+                    // A multiset is stored like a map from element to an INT value.
+                    FieldWriter elementWriter =
+                            createFieldWriter(((MultisetType) type).getElementType());
+                    FieldWriter countWriter = createFieldWriter(new IntType());
+                    return (data, i) -> writeMap(data.getMap(i), elementWriter, countWriter);
+                }
+            case MAP:
+                {
+                    MapType mapType = (MapType) type;
+                    FieldWriter keyWriter = createFieldWriter(mapType.getKeyType());
+                    FieldWriter valueWriter = createFieldWriter(mapType.getValueType());
+                    return (data, i) -> writeMap(data.getMap(i), keyWriter, valueWriter);
+                }
+            case ROW:
+                {
+                    RowType nestedType = (RowType) type;
+                    int numFields = nestedType.getFieldCount();
+                    int nestedHeaderSize = (numFields + 7) / 8;
+                    FieldWriter[] nestedWriters = createFieldWriters(nestedType);
+                    return (data, i) ->
+                            writeRow(data.getRow(i, numFields), nestedHeaderSize, nestedWriters);
+                }
+            default:
+                throw new UnsupportedOperationException("Unsupported type: " + type.getTypeRoot());
+        }
+    }
+
+    // ======================== Complex Type Helpers ========================
+
+    /** Writes a var-int size, a zeroed null bitmap of that many bits, then non-null elements. */
+    private void writeArray(InternalArray array, FieldWriter elementWriter) {
+        int size = array.size();
+        buf.writeVarInt(size);
+        int bitmapBytes = (size + 7) / 8;
+        buf.ensureCapacity(bitmapBytes);
+        int bitmapStart = buf.position;
+        for (int b = 0; b < bitmapBytes; b++) {
+            buf.buffer[buf.position++] = 0;
+        }
+        for (int element = 0; element < size; element++) {
+            if (!array.isNullAt(element)) {
+                elementWriter.write(array, element);
+                continue;
+            }
+            buf.buffer[bitmapStart + (element >>> 3)] |= (byte) (1 << (element & 7));
+        }
+    }
+
+    /** Writes the key array in full, then the value array. */
+    private void writeMap(InternalMap map, FieldWriter keyWriter, FieldWriter valueWriter) {
+        writeArray(map.keyArray(), keyWriter);
+        writeArray(map.valueArray(), valueWriter);
+    }
+
+    // ======================== Interface ========================
+
+    /** Writes the non-null value at position {@code i} of {@code data} to the block buffer. */
+    interface FieldWriter {
+        void write(DataGetters data, int i);
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java
new file mode 100644
index 000000000000..c6d0026d11f0
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFooter.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.format.row;
+
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.utils.IOUtils;
+
+import java.io.IOException;
+
+/**
+ * Fixed 32-byte footer at the end of a row file.
+ *
+ * <p>Layout, all multi-byte values little-endian:
+ *
+ * <pre>
+ * bytes  0-7   totalRowCount (int64)
+ * bytes  8-11  blockCount    (int32)
+ * bytes 12-19  indexOffset   (int64)
+ * bytes 20-23  indexLength   (int32)
+ * byte  24     version
+ * bytes 25-27  reserved (zeros)
+ * bytes 28-31  magic         (int32)
+ * </pre>
+ */
+class RowFileFooter {
+
+    static final int FOOTER_SIZE = 32;
+    // "ROWS" when read as a big-endian int; it is written little-endian, so the last four bytes
+    // of a file are 'S', 'W', 'O', 'R'.
+    static final int MAGIC = 0x524F5753;
+    static final byte VERSION = 1;
+
+    final long totalRowCount;
+    final int blockCount;
+    final long indexOffset;
+    final int indexLength;
+
+    RowFileFooter(long totalRowCount, int blockCount, long indexOffset, int indexLength) {
+        this.totalRowCount = totalRowCount;
+        this.blockCount = blockCount;
+        this.indexOffset = indexOffset;
+        this.indexLength = indexLength;
+    }
+
+    /**
+     * Writes the 32-byte footer to {@code out} in a single call.
+     *
+     * @throws IOException if writing fails
+     */
+    void writeTo(PositionOutputStream out) throws IOException {
+        byte[] buf = new byte[FOOTER_SIZE];
+        writeLongLE(buf, 0, totalRowCount);
+        writeIntLE(buf, 8, blockCount);
+        writeLongLE(buf, 12, indexOffset);
+        writeIntLE(buf, 20, indexLength);
+        buf[24] = VERSION;
+        // bytes 25-27 reserved (zeros)
+        writeIntLE(buf, 28, MAGIC);
+        out.write(buf);
+    }
+
+    /**
+     * Reads and validates the footer stored in the last {@link #FOOTER_SIZE} bytes of a file.
+     *
+     * @param fileSize total length of the file in bytes
+     * @throws IOException if the file is shorter than a footer, cannot be read, or the footer is
+     *     invalid
+     */
+    static RowFileFooter readFrom(SeekableInputStream in, long fileSize) throws IOException {
+        if (fileSize < FOOTER_SIZE) {
+            // Without this check the seek below would target a negative position.
+            throw new IOException(
+                    "File of "
+                            + fileSize
+                            + " bytes is too small to contain a "
+                            + FOOTER_SIZE
+                            + "-byte row file footer");
+        }
+        in.seek(fileSize - FOOTER_SIZE);
+        byte[] buf = new byte[FOOTER_SIZE];
+        IOUtils.readFully(in, buf);
+        return readFrom(buf, 0);
+    }
+
+    /**
+     * Parses a footer from {@code buf} starting at {@code offset}.
+     *
+     * @throws IOException if fewer than {@link #FOOTER_SIZE} bytes are available, the magic or
+     *     version does not match, or a count or offset is negative
+     */
+    static RowFileFooter readFrom(byte[] buf, int offset) throws IOException {
+        if (offset < 0 || buf.length - offset < FOOTER_SIZE) {
+            throw new IOException(
+                    "Truncated row file footer: need "
+                            + FOOTER_SIZE
+                            + " bytes at offset "
+                            + offset
+                            + ", buffer has "
+                            + buf.length);
+        }
+
+        int magic = readIntLE(buf, offset + 28);
+        if (magic != MAGIC) {
+            throw new IOException(
+                    String.format(
+                            "Invalid row file magic: expected 0x%08X, got 0x%08X", MAGIC, magic));
+        }
+
+        byte version = buf[offset + 24];
+        if (version != VERSION) {
+            throw new IOException("Unsupported row file version: " + version);
+        }
+
+        long totalRowCount = readLongLE(buf, offset);
+        int blockCount = readIntLE(buf, offset + 8);
+        long indexOffset = readLongLE(buf, offset + 12);
+        int indexLength = readIntLE(buf, offset + 20);
+        if (totalRowCount < 0 || blockCount < 0 || indexOffset < 0 || indexLength < 0) {
+            // Negative values can only come from corruption; reject them here rather than let
+            // them surface later as a negative seek or array size.
+            throw new IOException(
+                    "Corrupt row file footer: totalRowCount="
+                            + totalRowCount
+                            + ", blockCount="
+                            + blockCount
+                            + ", indexOffset="
+                            + indexOffset
+                            + ", indexLength="
+                            + indexLength);
+        }
+
+        return new RowFileFooter(totalRowCount, blockCount, indexOffset, indexLength);
+    }
+
+    /** Stores {@code value} at {@code buf[offset..offset+3]}, least-significant byte first. */
+    static void writeIntLE(byte[] buf, int offset, int value) {
+        buf[offset] = (byte) (value & 0xFF);
+        buf[offset + 1] = (byte) ((value >>> 8) & 0xFF);
+        buf[offset + 2] = (byte) ((value >>> 16) & 0xFF);
+        buf[offset + 3] = (byte) ((value >>> 24) & 0xFF);
+    }
+
+    /** Reads the little-endian int stored at {@code buf[offset..offset+3]}. */
+    static int readIntLE(byte[] buf, int offset) {
+        return (buf[offset] & 0xFF)
+                | ((buf[offset + 1] & 0xFF) << 8)
+                | ((buf[offset + 2] & 0xFF) << 16)
+                | ((buf[offset + 3] & 0xFF) << 24);
+    }
+
+    /** Stores {@code value} at {@code buf[offset..offset+7]}, least-significant byte first. */
+    static void writeLongLE(byte[] buf, int offset, long value) {
+        buf[offset] = (byte) (value & 0xFF);
+        buf[offset + 1] = (byte) ((value >>> 8) & 0xFF);
+        buf[offset + 2] = (byte) ((value >>> 16) & 0xFF);
+        buf[offset + 3] = (byte) ((value >>> 24) & 0xFF);
+        buf[offset + 4] = (byte) ((value >>> 32) & 0xFF);
+        buf[offset + 5] = (byte) ((value >>> 40) & 0xFF);
+        buf[offset + 6] = (byte) ((value >>> 48) & 0xFF);
+        buf[offset + 7] = (byte) ((value >>> 56) & 0xFF);
+    }
+
+    /** Reads the little-endian long stored at {@code buf[offset..offset+7]}. */
+    static long readLongLE(byte[] buf, int offset) {
+        return (buf[offset] & 0xFFL)
+                | ((buf[offset + 1] & 0xFFL) << 8)
+                | ((buf[offset + 2] & 0xFFL) << 16)
+                | ((buf[offset + 3] & 0xFFL) << 24)
+                | ((buf[offset + 4] & 0xFFL) << 32)
+                | ((buf[offset + 5] & 0xFFL) << 40)
+                | ((buf[offset + 6] & 0xFFL) << 48)
+                | ((buf[offset + 7] & 0xFFL) << 56);
+    }
+}
diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java
new file mode 100644
index 000000000000..8683dd4fde28
--- /dev/null
+++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormat.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FileFormatFactory.FormatContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.options.MemorySize; +import org.apache.paimon.predicate.Predicate; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.NestedProjectedRow; + +import javax.annotation.Nullable; + +import java.util.List; + +/** Row-store file format with block-level ZSTD compression and O(1) row-number lookup. */ +public class RowFileFormat extends FileFormat { + + private static final int DEFAULT_BLOCK_SIZE = 65536; + + private final int blockSize; + private final int zstdLevel; + + public RowFileFormat(FormatContext formatContext) { + super(RowFileFormatFactory.IDENTIFIER); + this.zstdLevel = formatContext.zstdLevel(); + MemorySize bs = formatContext.blockSize(); + this.blockSize = bs != null ? 
(int) bs.getBytes() : DEFAULT_BLOCK_SIZE; + } + + @Override + public FormatReaderFactory createReaderFactory( + RowType dataSchemaRowType, + RowType projectedRowType, + @Nullable List filters) { + NestedProjectedRow projection = + NestedProjectedRow.create(dataSchemaRowType, projectedRowType); + return new RowFormatReaderFactory(dataSchemaRowType, projection); + } + + @Override + public FormatWriterFactory createWriterFactory(RowType type) { + return new RowFormatWriterFactory(type, blockSize, zstdLevel); + } + + @Override + public void validateDataFields(RowType rowType) { + // Row format supports all Paimon data types + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java new file mode 100644 index 000000000000..a9f3364d313b --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileFormatFactory.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FileFormatFactory; + +/** Factory for the row-store file format. 
*/ +public class RowFileFormatFactory implements FileFormatFactory { + + public static final String IDENTIFIER = "row"; + + @Override + public String identifier() { + return IDENTIFIER; + } + + @Override + public FileFormat create(FormatContext formatContext) { + return new RowFileFormat(formatContext); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java new file mode 100644 index 000000000000..4746d728e751 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFileRecordIterator.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.Path; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.utils.NestedProjectedRow; + +import javax.annotation.Nullable; + +import java.io.IOException; + +/** Iterator over rows within a single decompressed block. 
*/ +class RowFileRecordIterator implements FileRecordIterator { + + private final Path filePath; + private final RowBlockReader blockReader; + @Nullable private final NestedProjectedRow projection; + private final long blockStartRow; + @Nullable private final int[] selectedLocalIndices; + + private int cursor; + private long currentPosition; + + RowFileRecordIterator( + Path filePath, + RowBlockReader blockReader, + @Nullable NestedProjectedRow projection, + long blockStartRow) { + this(filePath, blockReader, projection, blockStartRow, null); + } + + RowFileRecordIterator( + Path filePath, + RowBlockReader blockReader, + @Nullable NestedProjectedRow projection, + long blockStartRow, + @Nullable int[] selectedLocalIndices) { + this.filePath = filePath; + this.blockReader = blockReader; + this.projection = projection; + this.blockStartRow = blockStartRow; + this.selectedLocalIndices = selectedLocalIndices; + this.cursor = 0; + this.currentPosition = blockStartRow; + } + + @Override + public long returnedPosition() { + return currentPosition; + } + + @Override + public Path filePath() { + return filePath; + } + + @Nullable + @Override + public InternalRow next() throws IOException { + if (selectedLocalIndices != null) { + return nextSelected(selectedLocalIndices); + } else { + return nextSequential(); + } + } + + @Nullable + private InternalRow nextSequential() { + if (cursor >= blockReader.rowCount()) { + return null; + } + currentPosition = blockStartRow + cursor; + InternalRow row = blockReader.readRow(cursor); + cursor++; + return applyProjection(row); + } + + @Nullable + private InternalRow nextSelected(int[] selectedLocalIndices) { + if (cursor >= selectedLocalIndices.length) { + return null; + } + int localIdx = selectedLocalIndices[cursor]; + currentPosition = blockStartRow + localIdx; + InternalRow row = blockReader.readRow(localIdx); + cursor++; + return applyProjection(row); + } + + private InternalRow applyProjection(InternalRow row) { + if (projection != 
null) { + return projection.replaceRow(row); + } + return row; + } + + @Override + public void releaseBatch() {} +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java new file mode 100644 index 000000000000..7310b3dfab9b --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReader.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.NestedProjectedRow; +import org.apache.paimon.utils.RoaringBitmap32; + +import javax.annotation.Nullable; + +import java.io.IOException; + +/** Reader for row-store format files. Reads block by block and returns row iterators. 
*/ +public class RowFormatReader implements FileRecordReader { + + private final Path filePath; + private final RowFileFooter footer; + private final RowBlockIndex blockIndex; + private final RowType rowType; + @Nullable private final NestedProjectedRow projection; + @Nullable private final RoaringBitmap32 selection; + private final BlockPrefetcher prefetcher; + + RowFormatReader( + SeekableInputStream inputStream, + Path filePath, + RowFileFooter footer, + RowBlockIndex blockIndex, + RowType rowType, + @Nullable NestedProjectedRow projection, + @Nullable RoaringBitmap32 selection) { + this.filePath = filePath; + this.footer = footer; + this.blockIndex = blockIndex; + this.rowType = rowType; + this.projection = projection; + this.selection = selection; + this.prefetcher = + new BlockPrefetcher( + inputStream, + blockIndex, + computeBlocksToRead(blockIndex, footer.totalRowCount, selection)); + } + + @Nullable + @Override + public FileRecordIterator readBatch() throws IOException { + byte[] decompressed = prefetcher.nextBlock(); + if (decompressed == null) { + return null; + } + + int blockIdx = prefetcher.currentBlockIdx(); + long blockStartRow = blockIndex.blockRowStart(blockIdx); + RowBlockReader blockReader = new RowBlockReader(new BlockInput(decompressed), rowType); + + if (selection != null) { + long blockEndRow = blockEndRow(blockIdx); + int[] localIndices = computeSelectedLocalIndices(selection, blockStartRow, blockEndRow); + return new RowFileRecordIterator( + filePath, blockReader, projection, blockStartRow, localIndices); + } else { + return new RowFileRecordIterator(filePath, blockReader, projection, blockStartRow); + } + } + + @Override + public void close() throws IOException { + prefetcher.close(); + } + + private long blockEndRow(int blockIdx) { + if (blockIdx + 1 < blockIndex.blockCount()) { + return blockIndex.blockRowStart(blockIdx + 1); + } + return footer.totalRowCount; + } + + private static int[] computeBlocksToRead( + RowBlockIndex blockIndex, 
long totalRowCount, @Nullable RoaringBitmap32 selection) { + int blockCount = blockIndex.blockCount(); + if (selection == null) { + int[] all = new int[blockCount]; + for (int i = 0; i < blockCount; i++) { + all[i] = i; + } + return all; + } + + int[] blocks = new int[blockCount]; + int count = 0; + for (int i = 0; i < blockCount; i++) { + long blockStart = blockIndex.blockRowStart(i); + long blockEnd = (i + 1 < blockCount) ? blockIndex.blockRowStart(i + 1) : totalRowCount; + if (selection.intersects(blockStart, blockEnd)) { + blocks[count++] = i; + } + } + + int[] result = new int[count]; + System.arraycopy(blocks, 0, result, 0, count); + return result; + } + + private static int[] computeSelectedLocalIndices( + RoaringBitmap32 selection, long blockStartRow, long blockEndRow) { + int capacity = (int) (blockEndRow - blockStartRow); + int[] indices = new int[capacity]; + int count = 0; + + long current = selection.nextValue((int) blockStartRow); + while (current >= 0 && current < blockEndRow) { + indices[count++] = (int) (current - blockStartRow); + if (current == Integer.MAX_VALUE) { + break; + } + current = selection.nextValue((int) current + 1); + } + + if (count == capacity) { + return indices; + } + int[] result = new int[count]; + System.arraycopy(indices, 0, result, 0, count); + return result; + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java new file mode 100644 index 000000000000..1cab7626bef5 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatReaderFactory.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.IOUtils; +import org.apache.paimon.utils.NestedProjectedRow; + +import javax.annotation.Nullable; + +import java.io.IOException; + +/** Factory for creating {@link RowFormatReader}. 
*/ +public class RowFormatReaderFactory implements FormatReaderFactory { + + private static final int TAIL_PREFETCH_SIZE = 64 * 1024; + + private final RowType rowType; + @Nullable private final NestedProjectedRow projection; + + public RowFormatReaderFactory(RowType rowType, @Nullable NestedProjectedRow projection) { + this.rowType = rowType; + this.projection = projection; + } + + @Override + public FileRecordReader createReader(Context context) throws IOException { + FileIO fileIO = context.fileIO(); + Path path = context.filePath(); + long fileSize = context.fileSize(); + + SeekableInputStream in = fileIO.newInputStream(path); + + int tailSize = (int) Math.min(TAIL_PREFETCH_SIZE, fileSize); + long tailOffset = fileSize - tailSize; + in.seek(tailOffset); + byte[] tailBuf = new byte[tailSize]; + IOUtils.readFully(in, tailBuf); + + RowFileFooter footer = + RowFileFooter.readFrom(tailBuf, tailSize - RowFileFooter.FOOTER_SIZE); + + RowBlockIndex blockIndex; + if (footer.indexOffset >= tailOffset) { + int indexOffsetInBuf = (int) (footer.indexOffset - tailOffset); + byte[] indexData = new byte[footer.indexLength]; + System.arraycopy(tailBuf, indexOffsetInBuf, indexData, 0, footer.indexLength); + blockIndex = RowBlockIndex.readFrom(indexData); + } else { + blockIndex = RowBlockIndex.readFrom(in, footer.indexOffset, footer.indexLength); + } + + return new RowFormatReader( + in, path, footer, blockIndex, rowType, projection, context.selection()); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java new file mode 100644 index 000000000000..ff9ee1112bd6 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriter.java @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockCompressor; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.LongArrayList; + +import java.io.IOException; + +/** Writer that produces row-store format files with block-level ZSTD compression. 
*/ +public class RowFormatWriter implements FormatWriter { + + private final PositionOutputStream out; + private final ZstdBlockCompressor compressor; + private final int blockSizeThreshold; + private final RowBlockWriter blockWriter; + + private final LongArrayList blockCompressedSizes; + private final LongArrayList blockUncompressedSizes; + private final LongArrayList blockRowStarts; + + private long totalRowCount; + + public RowFormatWriter( + PositionOutputStream out, RowType rowType, int blockSize, int zstdLevel) { + this.out = out; + this.compressor = new ZstdBlockCompressor(zstdLevel); + this.blockSizeThreshold = blockSize; + this.blockWriter = new RowBlockWriter(new BlockOutput(blockSize), rowType); + this.blockCompressedSizes = new LongArrayList(128); + this.blockUncompressedSizes = new LongArrayList(128); + this.blockRowStarts = new LongArrayList(128); + } + + @Override + public void addElement(InternalRow element) throws IOException { + blockWriter.writeRow(element); + totalRowCount++; + + if (blockWriter.estimatedSize() >= blockSizeThreshold) { + flushBlock(); + } + } + + @Override + public boolean reachTargetSize(boolean suggestedCheck, long targetSize) throws IOException { + if (!suggestedCheck) { + return false; + } + return out.getPos() >= targetSize; + } + + @Override + public void close() throws IOException { + flushBlock(); + + long indexOffset = out.getPos(); + RowBlockIndex index = + new RowBlockIndex( + blockCompressedSizes.toArray(), + blockUncompressedSizes.toArray(), + blockRowStarts.toArray()); + index.writeTo(out); + int indexLength = (int) (out.getPos() - indexOffset); + + RowFileFooter footer = + new RowFileFooter( + totalRowCount, blockCompressedSizes.size(), indexOffset, indexLength); + footer.writeTo(out); + + out.flush(); + out.close(); + } + + private void flushBlock() throws IOException { + if (blockWriter.rowCount() == 0) { + return; + } + + blockRowStarts.add(totalRowCount - blockWriter.rowCount()); + + byte[] uncompressed = 
blockWriter.finish(); + blockUncompressedSizes.add(uncompressed.length); + + int maxCompressedSize = compressor.getMaxCompressedSize(uncompressed.length); + byte[] compressed = new byte[maxCompressedSize]; + int compressedLen = + compressor.compress(uncompressed, 0, uncompressed.length, compressed, 0); + + out.write(compressed, 0, compressedLen); + blockCompressedSizes.add(compressedLen); + + blockWriter.reset(); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java new file mode 100644 index 000000000000..e208d7102d99 --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/RowFormatWriterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.format.FormatWriterFactory; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.types.RowType; + +import java.io.IOException; + +/** Factory for creating {@link RowFormatWriter}. 
*/ +public class RowFormatWriterFactory implements FormatWriterFactory { + + private final RowType rowType; + private final int blockSize; + private final int zstdLevel; + + public RowFormatWriterFactory(RowType rowType, int blockSize, int zstdLevel) { + this.rowType = rowType; + this.blockSize = blockSize; + this.zstdLevel = zstdLevel; + } + + @Override + public FormatWriter create(PositionOutputStream out, String compression) throws IOException { + return new RowFormatWriter(out, rowType, blockSize, zstdLevel); + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java b/paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java new file mode 100644 index 000000000000..fbd2fb397a4c --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/SequentialReadStrategy.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockDecompressor; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.utils.IOUtils; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.concurrent.CompletableFuture; + +/** + * Reads blocks sequentially in a background thread. + * + *

One block is prefetched ahead so that IO and decompression can overlap. + */ +class SequentialReadStrategy implements ReadStrategy { + + private final SeekableInputStream inputStream; + private final RowBlockIndex blockIndex; + private final ZstdBlockDecompressor decompressor; + private final int[] blocksToRead; + private CompletableFuture nextFuture; + private int nextSubmit; + private int nextConsume; + + SequentialReadStrategy( + SeekableInputStream inputStream, RowBlockIndex blockIndex, int[] blocksToRead) { + this.inputStream = inputStream; + this.blockIndex = blockIndex; + this.decompressor = new ZstdBlockDecompressor(); + this.blocksToRead = blocksToRead; + this.nextSubmit = 0; + this.nextConsume = 0; + submitNext(); + } + + @Override + public byte[] nextBlock() throws IOException { + if (nextConsume >= blocksToRead.length) { + return null; + } + + byte[] compressed; + if (nextFuture != null) { + compressed = BlockPrefetcher.awaitFuture(nextFuture); + nextFuture = null; + } else { + compressed = readBlock(blocksToRead[nextConsume]); + } + nextConsume++; + submitNext(); + + int blockIdx = blocksToRead[nextConsume - 1]; + int uncompressedSize = (int) blockIndex.blockUncompressedSize(blockIdx); + byte[] decompressed = new byte[uncompressedSize]; + decompressor.decompress(compressed, 0, compressed.length, decompressed, 0); + return decompressed; + } + + @Override + public int currentBlockIdx() { + if (nextConsume <= 0 || nextConsume > blocksToRead.length) { + return -1; + } + return blocksToRead[nextConsume - 1]; + } + + @Override + public void close() { + if (nextFuture != null) { + nextFuture.cancel(true); + nextFuture = null; + } + } + + private void submitNext() { + if (nextSubmit < blocksToRead.length) { + int blockIdx = blocksToRead[nextSubmit++]; + nextFuture = + CompletableFuture.supplyAsync( + () -> { + try { + return readBlock(blockIdx); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }, + IO_POOL); + } + } + + private byte[] 
readBlock(int blockIdx) throws IOException { + int compressedSize = (int) blockIndex.blockCompressedSize(blockIdx); + byte[] buf = new byte[compressedSize]; + inputStream.seek(blockIndex.blockOffset(blockIdx)); + IOUtils.readFully(inputStream, buf); + return buf; + } +} diff --git a/paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java b/paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java new file mode 100644 index 000000000000..a1305bdf5adf --- /dev/null +++ b/paimon-format/src/main/java/org/apache/paimon/format/row/VectoredReadStrategy.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.compression.ZstdBlockDecompressor; +import org.apache.paimon.fs.VectoredReadable; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; + +/** + * Reads blocks using positional reads ({@code preadFully}) with IO coalescing and concurrent + * prefetch. + * + *

Adjacent blocks are merged into larger IO requests to reduce QPS. Multiple merged ranges are + * prefetched concurrently since {@code preadFully} is thread-safe. + */ +class VectoredReadStrategy implements ReadStrategy { + + static final int HOLE_SIZE_LIMIT = 256 * 1024; + static final int RANGE_SIZE_LIMIT = 2 * 1024 * 1024; + private static final int PREFETCH_COUNT = 4; + + private final VectoredReadable readable; + private final RowBlockIndex blockIndex; + private final ZstdBlockDecompressor decompressor; + private final List mergedRanges; + private final Queue> prefetchQueue; + private int nextRangeToSubmit; + private int currentRangeIdx; + private byte[] currentRangeData; + private int currentBlockInRange; + + VectoredReadStrategy(VectoredReadable readable, RowBlockIndex blockIndex, int[] blocksToRead) { + this.readable = readable; + this.blockIndex = blockIndex; + this.decompressor = new ZstdBlockDecompressor(); + this.mergedRanges = coalesceRanges(blocksToRead, blockIndex); + this.prefetchQueue = new ArrayDeque<>(PREFETCH_COUNT); + this.nextRangeToSubmit = 0; + this.currentRangeIdx = -1; + this.currentBlockInRange = 0; + fillPrefetch(); + } + + @Override + public byte[] nextBlock() throws IOException { + if (currentRangeIdx < 0 + || currentBlockInRange >= mergedRanges.get(currentRangeIdx).blockIndices.length) { + advanceToNextRange(); + } + if (currentRangeIdx >= mergedRanges.size()) { + return null; + } + + MergedRange range = mergedRanges.get(currentRangeIdx); + int blockIdx = range.blockIndices[currentBlockInRange]; + int offsetInBuf = (int) (blockIndex.blockOffset(blockIdx) - range.offset); + int compressedSize = (int) blockIndex.blockCompressedSize(blockIdx); + int uncompressedSize = (int) blockIndex.blockUncompressedSize(blockIdx); + + byte[] decompressed = new byte[uncompressedSize]; + decompressor.decompress(currentRangeData, offsetInBuf, compressedSize, decompressed, 0); + + currentBlockInRange++; + return decompressed; + } + + @Override + public int 
currentBlockIdx() { + if (currentRangeIdx < 0 || currentRangeIdx >= mergedRanges.size()) { + return -1; + } + MergedRange range = mergedRanges.get(currentRangeIdx); + return range.blockIndices[currentBlockInRange - 1]; + } + + @Override + public void close() { + for (CompletableFuture f : prefetchQueue) { + f.cancel(true); + } + prefetchQueue.clear(); + } + + private void advanceToNextRange() throws IOException { + currentRangeIdx++; + currentBlockInRange = 0; + + if (currentRangeIdx >= mergedRanges.size()) { + currentRangeData = null; + return; + } + + CompletableFuture future = prefetchQueue.poll(); + if (future != null) { + currentRangeData = BlockPrefetcher.awaitFuture(future); + } else { + currentRangeData = readRange(mergedRanges.get(currentRangeIdx)); + } + fillPrefetch(); + } + + private void fillPrefetch() { + while (prefetchQueue.size() < PREFETCH_COUNT && nextRangeToSubmit < mergedRanges.size()) { + int rangeIdx = nextRangeToSubmit++; + MergedRange range = mergedRanges.get(rangeIdx); + prefetchQueue.add( + CompletableFuture.supplyAsync( + () -> { + try { + return readRange(range); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }, + IO_POOL)); + } + } + + private byte[] readRange(MergedRange range) throws IOException { + byte[] buf = new byte[range.length]; + readable.preadFully(range.offset, buf, 0, range.length); + return buf; + } + + // ======================== Range Coalescing ======================== + + static List coalesceRanges(int[] blocksToRead, RowBlockIndex blockIndex) { + List result = new ArrayList<>(); + if (blocksToRead.length == 0) { + return result; + } + + int rangeStart = 0; + long rangeOffset = blockIndex.blockOffset(blocksToRead[0]); + long rangeEnd = rangeOffset + blockIndex.blockCompressedSize(blocksToRead[0]); + + for (int i = 1; i < blocksToRead.length; i++) { + int blockIdx = blocksToRead[i]; + long blockOffset = blockIndex.blockOffset(blockIdx); + long blockEnd = blockOffset + 
blockIndex.blockCompressedSize(blockIdx); + long gap = blockOffset - rangeEnd; + long newLength = blockEnd - rangeOffset; + + if (gap < HOLE_SIZE_LIMIT && newLength <= RANGE_SIZE_LIMIT) { + rangeEnd = blockEnd; + } else { + result.add(buildRange(blocksToRead, rangeStart, i, rangeOffset, rangeEnd)); + rangeStart = i; + rangeOffset = blockOffset; + rangeEnd = blockEnd; + } + } + result.add( + buildRange(blocksToRead, rangeStart, blocksToRead.length, rangeOffset, rangeEnd)); + return result; + } + + private static MergedRange buildRange( + int[] blocksToRead, int from, int to, long rangeOffset, long rangeEnd) { + int[] indices = new int[to - from]; + System.arraycopy(blocksToRead, from, indices, 0, indices.length); + return new MergedRange(rangeOffset, (int) (rangeEnd - rangeOffset), indices); + } + + // ======================== MergedRange ======================== + + static class MergedRange { + final long offset; + final int length; + final int[] blockIndices; + + MergedRange(long offset, int length, int[] blockIndices) { + this.offset = offset; + this.length = length; + this.blockIndices = blockIndices; + } + } +} diff --git a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory index 80cfe4b946b8..f34a5af57e48 100644 --- a/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory +++ b/paimon-format/src/main/resources/META-INF/services/org.apache.paimon.format.FileFormatFactory @@ -20,3 +20,4 @@ org.apache.paimon.format.csv.CsvFileFormatFactory org.apache.paimon.format.text.TextFileFormatFactory org.apache.paimon.format.json.JsonFileFormatFactory org.apache.paimon.format.blob.BlobFileFormatFactory +org.apache.paimon.format.row.RowFileFormatFactory diff --git a/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java 
b/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java new file mode 100644 index 000000000000..080ce6759e68 --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/BlockPrefetcherTest.java @@ -0,0 +1,481 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FormatReaderContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.VarCharType; +import org.apache.paimon.utils.RoaringBitmap32; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link BlockPrefetcher}. 
*/ +public class BlockPrefetcherTest { + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testCoalesceAdjacentBlocks() { + long[] compressedSizes = {100, 100, 100, 100, 100}; + long[] uncompressedSizes = {200, 200, 200, 200, 200}; + long[] rowStarts = {0, 10, 20, 30, 40}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0, 1, 2, 3, 4}; + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); + + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).offset).isEqualTo(0); + assertThat(ranges.get(0).length).isEqualTo(500); + assertThat(ranges.get(0).blockIndices).containsExactly(0, 1, 2, 3, 4); + } + + @Test + public void testCoalesceWithLargeGap() { + long[] compressedSizes = {100, 100, 100}; + long[] uncompressedSizes = {200, 200, 200}; + long[] rowStarts = {0, 10, 20}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0, 2}; + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); + + // gap between block 0 end (100) and block 2 start (200) is 100, within HOLE_SIZE_LIMIT + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).blockIndices).containsExactly(0, 2); + } + + @Test + public void testCoalesceSplitsByHoleSize() { + // Create blocks with large gaps (> 256KB) + int numBlocks = 3; + long[] compressedSizes = new long[numBlocks]; + long[] uncompressedSizes = new long[numBlocks]; + long[] rowStarts = new long[numBlocks]; + Arrays.fill(compressedSizes, 1024); + Arrays.fill(uncompressedSizes, 2048); + + // Block offsets: 0, 300*1024 (gap=299KB > 256KB), 600*1024 + compressedSizes[0] = 1024; + compressedSizes[1] = 1024; + compressedSizes[2] = 1024; + + // Override to create large gaps: block 0 at offset 0 size 1024, + // block 1 at offset 300*1024 (but offsets are derived from prefix sum) + // So we need block 0 compressedSize = 300*1024 to put block 1 at that offset + 
compressedSizes[0] = 300 * 1024; + compressedSizes[1] = 300 * 1024; + compressedSizes[2] = 1024; + rowStarts[0] = 0; + rowStarts[1] = 100; + rowStarts[2] = 200; + + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + // blocksToRead = [0, 2]: gap between block 0 end and block 2 start + // block 0 end = 300*1024, block 2 start = 600*1024, gap = 300*1024 > 256KB + int[] blocksToRead = {0, 2}; + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); + + assertThat(ranges).hasSize(2); + assertThat(ranges.get(0).blockIndices).containsExactly(0); + assertThat(ranges.get(1).blockIndices).containsExactly(2); + } + + @Test + public void testCoalesceSplitsByRangeSize() { + // Create many blocks that exceed RANGE_SIZE_LIMIT (2MB) when merged + int numBlocks = 30; + long[] compressedSizes = new long[numBlocks]; + long[] uncompressedSizes = new long[numBlocks]; + long[] rowStarts = new long[numBlocks]; + Arrays.fill(compressedSizes, 100 * 1024); // 100KB each + Arrays.fill(uncompressedSizes, 200 * 1024); + for (int i = 0; i < numBlocks; i++) { + rowStarts[i] = i * 100L; + } + + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = new int[numBlocks]; + for (int i = 0; i < numBlocks; i++) { + blocksToRead[i] = i; + } + + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); + + // 2MB / 100KB = 20 blocks per range, so 30 blocks should split into 2 ranges + assertThat(ranges.size()).isGreaterThan(1); + for (VectoredReadStrategy.MergedRange range : ranges) { + assertThat(range.length).isLessThanOrEqualTo(2 * 1024 * 1024); + } + } + + @Test + public void testCoalesceEmptyInput() { + long[] compressedSizes = {100}; + long[] uncompressedSizes = {200}; + long[] rowStarts = {0}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + List ranges = + VectoredReadStrategy.coalesceRanges(new int[0], index); + + 
assertThat(ranges).isEmpty(); + } + + @Test + public void testCoalesceSingleBlock() { + long[] compressedSizes = {1024}; + long[] uncompressedSizes = {2048}; + long[] rowStarts = {0}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0}; + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); + + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).offset).isEqualTo(0); + assertThat(ranges.get(0).length).isEqualTo(1024); + assertThat(ranges.get(0).blockIndices).containsExactly(0); + } + + @Test + public void testCoalesceNonContiguousBlocks() { + long[] compressedSizes = {100, 100, 100, 100, 100}; + long[] uncompressedSizes = {200, 200, 200, 200, 200}; + long[] rowStarts = {0, 10, 20, 30, 40}; + RowBlockIndex index = new RowBlockIndex(compressedSizes, uncompressedSizes, rowStarts); + + int[] blocksToRead = {0, 2, 4}; + List ranges = + VectoredReadStrategy.coalesceRanges(blocksToRead, index); + + // All gaps are small (100 bytes), so everything merges into one range + assertThat(ranges).hasSize(1); + assertThat(ranges.get(0).offset).isEqualTo(0); + assertThat(ranges.get(0).length).isEqualTo(500); + assertThat(ranges.get(0).blockIndices).containsExactly(0, 2, 4); + } + + @Test + public void testPrefetcherReadsNonContiguousBlocks() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "prefetch_non_contig.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 500; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + // Select only specific non-contiguous rows that span multiple blocks + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(50); + 
selection.add(200); + selection.add(499); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 50, 200, 499); + } + + @Test + public void testPrefetcherWithManyMergedRanges() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(1000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "prefetch_multi_range.row"); + Options options = new Options(); + options.setString("file.block-size", "512b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 2000; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString("value_" + i))); + } + writeRows(format, rowType, path, rows); + + // Read all rows through the prefetcher (no selection) + List result = readAllRows(format, rowType, path); + assertThat(result).hasSize(2000); + for (int i = 0; i < 2000; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + assertThat(result.get(i).getString(1).toString()).isEqualTo("value_" + i); + } + } + + @Test + public void testPrefetcherWithSelectionAcrossManyRanges() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(2000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "prefetch_sel_ranges.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + // Write rows with large strings to ensure many blocks + List rows = new ArrayList<>(); + StringBuilder sb = new 
StringBuilder(); + for (int j = 0; j < 200; j++) { + sb.append('x'); + } + String padding = sb.toString(); + for (int i = 0; i < 1000; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString(padding + i))); + } + writeRows(format, rowType, path, rows); + + // Select rows spread widely across the file + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(100); + selection.add(500); + selection.add(750); + selection.add(999); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 100, 500, 750, 999); + } + + @Test + public void testPrefetcherSingleBlockFile() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "single_block.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + for (int i = 0; i < 10; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + List result = readAllRows(format, rowType, path); + assertThat(result).hasSize(10); + for (int i = 0; i < 10; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testPrefetcherEmptyBlocksToRead() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "empty_sel.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 
0; i < 100; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + // Selection that matches no rows + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(9999); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).isEmpty(); + } + + @Test + public void testPrefetchSlidingWindow() throws IOException { + // Many ranges exceeding PREFETCH_COUNT to verify sliding window works + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(5000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "sliding_window.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + // Large rows to ensure many blocks, each block ~ 1 row + List rows = new ArrayList<>(); + StringBuilder sb = new StringBuilder(); + for (int j = 0; j < 500; j++) { + sb.append('a'); + } + String padding = sb.toString(); + for (int i = 0; i < 200; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString(padding + i))); + } + writeRows(format, rowType, path, rows); + + // Read all rows - will create many merged ranges, well beyond PREFETCH_COUNT + List result = readAllRows(format, rowType, path); + assertThat(result).hasSize(200); + for (int i = 0; i < 200; i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testPrefetcherWithSparseSelection() throws IOException { + // Select 1 row per block across many blocks to stress non-contiguous iteration + RowType rowType = RowType.builder().fields(Arrays.asList(new 
IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "sparse_sel.row"); + Options options = new Options(); + options.setString("file.block-size", "64b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + rows.add(GenericRow.of(i)); + } + writeRows(format, rowType, path, rows); + + // Select every 100th row + RoaringBitmap32 selection = new RoaringBitmap32(); + List expectedValues = new ArrayList<>(); + for (int i = 0; i < 1000; i += 100) { + selection.add(i); + expectedValues.add(i); + } + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactlyElementsOf(expectedValues); + } + + // ======================== Helpers ======================== + + private void writeRows(FileFormat format, RowType rowType, Path path, List rows) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + for (InternalRow row : rows) { + writer.addElement(row); + } + writer.close(); + } + + private List readAllRows(FileFormat format, RowType rowType, Path path) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + GenericRow copy = new 
GenericRow(rowType.getFieldCount()); + for (int i = 0; i < rowType.getFieldCount(); i++) { + if (row.isNullAt(i)) { + copy.setField(i, null); + } else { + switch (rowType.getTypeAt(i).getTypeRoot()) { + case INTEGER: + copy.setField(i, row.getInt(i)); + break; + case VARCHAR: + copy.setField(i, row.getString(i)); + break; + default: + throw new UnsupportedOperationException(); + } + } + } + result.add(copy); + }); + reader.close(); + return result; + } +} diff --git a/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java new file mode 100644 index 000000000000..1e2478bb43ab --- /dev/null +++ b/paimon-format/src/test/java/org/apache/paimon/format/row/RowFormatReadWriteTest.java @@ -0,0 +1,1403 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.format.row; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.GenericArray; +import org.apache.paimon.data.GenericMap; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.data.variant.GenericVariant; +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.format.FormatReaderContext; +import org.apache.paimon.format.FormatReaderFactory; +import org.apache.paimon.format.FormatWriter; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.PositionOutputStream; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.options.Options; +import org.apache.paimon.reader.FileRecordIterator; +import org.apache.paimon.reader.FileRecordReader; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.BigIntType; +import org.apache.paimon.types.BooleanType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DateType; +import org.apache.paimon.types.DecimalType; +import org.apache.paimon.types.DoubleType; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.MapType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.types.SmallIntType; +import org.apache.paimon.types.TimestampType; +import org.apache.paimon.types.TinyIntType; +import org.apache.paimon.types.VarBinaryType; +import org.apache.paimon.types.VarCharType; +import org.apache.paimon.types.VariantType; +import org.apache.paimon.utils.RoaringBitmap32; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import static 
org.assertj.core.api.Assertions.assertThat; + +/** Tests for the row-store file format. */ +public class RowFormatReadWriteTest { + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testBasicReadWrite() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(100))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "test.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, BinaryString.fromString("hello"))); + expected.add(GenericRow.of(2, BinaryString.fromString("world"))); + expected.add(GenericRow.of(3, BinaryString.fromString("paimon"))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(expected.size()); + for (int i = 0; i < expected.size(); i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(expected.get(i).getInt(0)); + assertThat(result.get(i).getString(1)).isEqualTo(expected.get(i).getString(1)); + } + } + + @Test + public void testAllPrimitiveTypes() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_boolean", new BooleanType()), + new DataField(1, "f_tinyint", new TinyIntType()), + new DataField(2, "f_smallint", new SmallIntType()), + new DataField(3, "f_int", new IntType()), + new DataField(4, "f_bigint", new BigIntType()), + new DataField(5, "f_float", new FloatType()), + new DataField(6, "f_double", new DoubleType()), + new DataField(7, "f_string", new VarCharType(200)), + new DataField(8, "f_binary", new VarBinaryType(200)), + new DataField(9, "f_date", new DateType()), + new DataField(10, "f_decimal_compact", new DecimalType(10, 2)), + new DataField(11, "f_decimal_large", new DecimalType(30, 10)), + new DataField(12, "f_timestamp_compact", new TimestampType(3)), + new DataField(13, "f_timestamp_full", new TimestampType(9)))); + + Path 
path = new Path(tempDir.toUri().toString(), "all_types.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add( + GenericRow.of( + true, + (byte) 127, + (short) 32000, + Integer.MAX_VALUE, + Long.MAX_VALUE, + 3.14f, + 2.718281828, + BinaryString.fromString("hello world"), + new byte[] {1, 2, 3, 4, 5}, + 18000, + Decimal.fromBigDecimal(new BigDecimal("12345678.99"), 10, 2), + Decimal.fromBigDecimal( + new BigDecimal("12345678901234567890.1234567890"), 30, 10), + Timestamp.fromEpochMillis(1700000000000L), + Timestamp.fromEpochMillis(1700000000000L, 123456))); + + expected.add( + GenericRow.of( + false, + (byte) -128, + (short) -32000, + Integer.MIN_VALUE, + Long.MIN_VALUE, + -0.0f, + Double.MAX_VALUE, + BinaryString.fromString(""), + new byte[0], + 0, + Decimal.fromBigDecimal(new BigDecimal("-99999999.99"), 10, 2), + Decimal.fromBigDecimal( + new BigDecimal("-12345678901234567890.1234567890"), 30, 10), + Timestamp.fromEpochMillis(0L), + Timestamp.fromEpochMillis(0L, 0))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + for (int rowIdx = 0; rowIdx < 2; rowIdx++) { + InternalRow actual = result.get(rowIdx); + InternalRow exp = expected.get(rowIdx); + assertThat(actual.getBoolean(0)).isEqualTo(exp.getBoolean(0)); + assertThat(actual.getByte(1)).isEqualTo(exp.getByte(1)); + assertThat(actual.getShort(2)).isEqualTo(exp.getShort(2)); + assertThat(actual.getInt(3)).isEqualTo(exp.getInt(3)); + assertThat(actual.getLong(4)).isEqualTo(exp.getLong(4)); + assertThat(actual.getFloat(5)).isEqualTo(exp.getFloat(5)); + assertThat(actual.getDouble(6)).isEqualTo(exp.getDouble(6)); + assertThat(actual.getString(7)).isEqualTo(exp.getString(7)); + assertThat(actual.getBinary(8)).isEqualTo(exp.getBinary(8)); + assertThat(actual.getInt(9)).isEqualTo(exp.getInt(9)); + assertThat(actual.getDecimal(10, 10, 
2)).isEqualTo(exp.getDecimal(10, 10, 2)); + assertThat(actual.getDecimal(11, 30, 10)).isEqualTo(exp.getDecimal(11, 30, 10)); + assertThat(actual.getTimestamp(12, 3)).isEqualTo(exp.getTimestamp(12, 3)); + assertThat(actual.getTimestamp(13, 9)).isEqualTo(exp.getTimestamp(13, 9)); + } + } + + @Test + public void testNullValues() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_int", new IntType(true)), + new DataField(1, "f_string", new VarCharType(true, 100)), + new DataField(2, "f_double", new DoubleType(true)), + new DataField(3, "f_binary", new VarBinaryType(true, 100)), + new DataField(4, "f_decimal", new DecimalType(true, 10, 2)), + new DataField(5, "f_timestamp", new TimestampType(true, 9)))); + + Path path = new Path(tempDir.toUri().toString(), "nulls.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(null, null, null, null, null, null)); + expected.add( + GenericRow.of( + 42, + BinaryString.fromString("not null"), + 3.14, + new byte[] {1}, + Decimal.fromBigDecimal(new BigDecimal("1.23"), 10, 2), + Timestamp.fromEpochMillis(1000L, 999))); + expected.add( + GenericRow.of(null, BinaryString.fromString("partial"), null, null, null, null)); + expected.add( + GenericRow.of(100, null, 2.0, null, null, Timestamp.fromEpochMillis(2000L, 0))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(4); + + assertThat(result.get(0).isNullAt(0)).isTrue(); + assertThat(result.get(0).isNullAt(1)).isTrue(); + assertThat(result.get(0).isNullAt(2)).isTrue(); + assertThat(result.get(0).isNullAt(3)).isTrue(); + assertThat(result.get(0).isNullAt(4)).isTrue(); + assertThat(result.get(0).isNullAt(5)).isTrue(); + + assertThat(result.get(1).getInt(0)).isEqualTo(42); + assertThat(result.get(1).getString(1)).isEqualTo(BinaryString.fromString("not null")); + 
assertThat(result.get(1).getDouble(2)).isEqualTo(3.14); + assertThat(result.get(1).getBinary(3)).isEqualTo(new byte[] {1}); + assertThat(result.get(1).getDecimal(4, 10, 2)) + .isEqualTo(Decimal.fromBigDecimal(new BigDecimal("1.23"), 10, 2)); + assertThat(result.get(1).getTimestamp(5, 9)) + .isEqualTo(Timestamp.fromEpochMillis(1000L, 999)); + + assertThat(result.get(2).isNullAt(0)).isTrue(); + assertThat(result.get(2).getString(1)).isEqualTo(BinaryString.fromString("partial")); + assertThat(result.get(2).isNullAt(2)).isTrue(); + + assertThat(result.get(3).getInt(0)).isEqualTo(100); + assertThat(result.get(3).isNullAt(1)).isTrue(); + assertThat(result.get(3).getDouble(2)).isEqualTo(2.0); + } + + @Test + public void testArrayType() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_int_array", new ArrayType(new IntType())), + new DataField( + 1, "f_string_array", new ArrayType(new VarCharType(100))), + new DataField( + 2, + "f_nullable_array", + new ArrayType(true, new IntType(true))))); + + Path path = new Path(tempDir.toUri().toString(), "arrays.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add( + GenericRow.of( + new GenericArray(new Object[] {1, 2, 3}), + new GenericArray( + new Object[] { + BinaryString.fromString("a"), BinaryString.fromString("b") + }), + new GenericArray(new Object[] {1, null, 3, null, 5}))); + expected.add( + GenericRow.of( + new GenericArray(new Object[0]), + new GenericArray(new Object[] {BinaryString.fromString("")}), + null)); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + + assertThat(result.get(0).getArray(0).size()).isEqualTo(3); + assertThat(result.get(0).getArray(0).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getArray(0).getInt(1)).isEqualTo(2); + 
assertThat(result.get(0).getArray(0).getInt(2)).isEqualTo(3); + + assertThat(result.get(0).getArray(1).size()).isEqualTo(2); + assertThat(result.get(0).getArray(1).getString(0)).isEqualTo(BinaryString.fromString("a")); + assertThat(result.get(0).getArray(1).getString(1)).isEqualTo(BinaryString.fromString("b")); + + assertThat(result.get(0).getArray(2).size()).isEqualTo(5); + assertThat(result.get(0).getArray(2).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getArray(2).isNullAt(1)).isTrue(); + assertThat(result.get(0).getArray(2).getInt(2)).isEqualTo(3); + assertThat(result.get(0).getArray(2).isNullAt(3)).isTrue(); + assertThat(result.get(0).getArray(2).getInt(4)).isEqualTo(5); + + assertThat(result.get(1).getArray(0).size()).isEqualTo(0); + assertThat(result.get(1).getArray(1).size()).isEqualTo(1); + assertThat(result.get(1).isNullAt(2)).isTrue(); + } + + @Test + public void testMapType() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField( + 0, + "f_map", + new MapType(new VarCharType(100), new IntType(true))))); + + Path path = new Path(tempDir.toUri().toString(), "maps.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Map map1 = new HashMap<>(); + map1.put(BinaryString.fromString("a"), 1); + map1.put(BinaryString.fromString("b"), 2); + map1.put(BinaryString.fromString("c"), null); + + Map map2 = new HashMap<>(); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(new GenericMap(map1))); + expected.add(GenericRow.of(new GenericMap(map2))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + assertThat(result.get(0).getMap(0).size()).isEqualTo(3); + assertThat(result.get(1).getMap(0).size()).isEqualTo(0); + } + + @Test + public void testMapWithNullKeys() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField( + 0, + "f_map", + new MapType( + new 
VarCharType(true, 100), new IntType(true))))); + + Path path = new Path(tempDir.toUri().toString(), "null_key_maps.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Map map = new HashMap<>(); + map.put(null, 100); + map.put(BinaryString.fromString("key"), null); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(new GenericMap(map))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(1); + assertThat(result.get(0).getMap(0).size()).isEqualTo(2); + } + + @Test + public void testNestedRow() throws IOException { + RowType innerType = + new RowType( + Arrays.asList( + new DataField(0, "x", new IntType()), + new DataField(1, "y", new VarCharType(100)))); + + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "nested", innerType), + new DataField(2, "nullable_nested", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, GenericRow.of(10, BinaryString.fromString("inner")), null)); + expected.add( + GenericRow.of( + 2, + GenericRow.of(20, null), + GenericRow.of(30, BinaryString.fromString("deep")))); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(2); + + assertThat(result.get(0).getInt(0)).isEqualTo(1); + InternalRow nested0 = result.get(0).getRow(1, 2); + assertThat(nested0.getInt(0)).isEqualTo(10); + assertThat(nested0.getString(1)).isEqualTo(BinaryString.fromString("inner")); + assertThat(result.get(0).isNullAt(2)).isTrue(); + + assertThat(result.get(1).getInt(0)).isEqualTo(2); + InternalRow nested1 = result.get(1).getRow(1, 2); + assertThat(nested1.getInt(0)).isEqualTo(20); + 
assertThat(nested1.isNullAt(1)).isTrue(); + InternalRow nested2 = result.get(1).getRow(2, 2); + assertThat(nested2.getInt(0)).isEqualTo(30); + assertThat(nested2.getString(1)).isEqualTo(BinaryString.fromString("deep")); + } + + @Test + public void testDeeplyNestedTypes() throws IOException { + RowType innerRowType = + new RowType( + Arrays.asList( + new DataField(0, "v", new IntType()), + new DataField(1, "arr", new ArrayType(new IntType())))); + + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField( + 1, + "nested_array", + new ArrayType(new ArrayType(new IntType()))), + new DataField( + 2, + "map_of_arrays", + new MapType( + new VarCharType(50), new ArrayType(new IntType()))), + new DataField(3, "array_of_rows", new ArrayType(innerRowType)))); + + Path path = new Path(tempDir.toUri().toString(), "deeply_nested.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + GenericArray innerArr1 = new GenericArray(new Object[] {1, 2, 3}); + GenericArray innerArr2 = new GenericArray(new Object[] {4, 5}); + GenericArray nestedArray = new GenericArray(new Object[] {innerArr1, innerArr2}); + + Map mapOfArrays = new HashMap<>(); + mapOfArrays.put(BinaryString.fromString("x"), new GenericArray(new Object[] {10, 20})); + mapOfArrays.put(BinaryString.fromString("y"), new GenericArray(new Object[] {30})); + + GenericRow innerRow1 = GenericRow.of(100, new GenericArray(new Object[] {1, 2})); + GenericRow innerRow2 = GenericRow.of(200, new GenericArray(new Object[] {3})); + GenericArray arrayOfRows = new GenericArray(new Object[] {innerRow1, innerRow2}); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, nestedArray, new GenericMap(mapOfArrays), arrayOfRows)); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(1); + InternalRow row = result.get(0); + assertThat(row.getInt(0)).isEqualTo(1); + + 
assertThat(row.getArray(1).size()).isEqualTo(2); + assertThat(row.getArray(1).getArray(0).getInt(0)).isEqualTo(1); + assertThat(row.getArray(1).getArray(0).getInt(2)).isEqualTo(3); + assertThat(row.getArray(1).getArray(1).getInt(0)).isEqualTo(4); + + assertThat(row.getMap(2).size()).isEqualTo(2); + + assertThat(row.getArray(3).size()).isEqualTo(2); + InternalRow readInner0 = row.getArray(3).getRow(0, 2); + assertThat(readInner0.getInt(0)).isEqualTo(100); + assertThat(readInner0.getArray(1).getInt(0)).isEqualTo(1); + assertThat(readInner0.getArray(1).getInt(1)).isEqualTo(2); + InternalRow readInner1 = row.getArray(3).getRow(1, 2); + assertThat(readInner1.getInt(0)).isEqualTo(200); + } + + @Test + public void testVariantType() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "v", new VariantType()))); + + Path path = new Path(tempDir.toUri().toString(), "variant.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + GenericVariant v1 = GenericVariant.fromJson("{\"key\": 123}"); + GenericVariant v2 = GenericVariant.fromJson("[1, 2, 3]"); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, v1)); + expected.add(GenericRow.of(2, v2)); + expected.add(GenericRow.of(3, null)); + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(3); + assertThat(result.get(0).getInt(0)).isEqualTo(1); + assertThat(result.get(0).getVariant(1).value()).isEqualTo(v1.value()); + assertThat(result.get(0).getVariant(1).metadata()).isEqualTo(v1.metadata()); + assertThat(result.get(1).getVariant(1).value()).isEqualTo(v2.value()); + assertThat(result.get(1).getVariant(1).metadata()).isEqualTo(v2.metadata()); + assertThat(result.get(2).isNullAt(1)).isTrue(); + } + + @Test + public void testMultipleBlocks() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new 
IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "multi_block.row"); + Options options = new Options(); + options.setString("file.block-size", "1kb"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List expected = new ArrayList<>(); + for (int i = 0; i < 10000; i++) { + expected.add(GenericRow.of(i)); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(expected.size()); + for (int i = 0; i < expected.size(); i++) { + assertThat(result.get(i).getInt(0)).isEqualTo(i); + } + } + + @Test + public void testRowPositionTracking() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "positions.row"); + Options options = new Options(); + options.setString("file.block-size", "512b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List expected = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + expected.add(GenericRow.of(i)); + } + + writeRows(format, rowType, path, expected); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + long expectedPosition = 0; + FileRecordIterator batch; + while ((batch = reader.readBatch()) != null) { + InternalRow row; + while ((row = batch.next()) != null) { + assertThat(batch.returnedPosition()).isEqualTo(expectedPosition); + assertThat(row.getInt(0)).isEqualTo((int) expectedPosition); + expectedPosition++; + } + batch.releaseBatch(); + } + assertThat(expectedPosition).isEqualTo(1000); + reader.close(); + } + + @Test + public void testProjection() throws IOException { + RowType fullType = + new RowType( + Arrays.asList( + new DataField(0, "a", new 
IntType()), + new DataField(1, "b", new VarCharType(100)), + new DataField(2, "c", new IntType()))); + + RowType projectedType = new RowType(Arrays.asList(new DataField(2, "c", new IntType()))); + + Path path = new Path(tempDir.toUri().toString(), "projection.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, BinaryString.fromString("a"), 100)); + rows.add(GenericRow.of(2, BinaryString.fromString("b"), 200)); + rows.add(GenericRow.of(3, BinaryString.fromString("c"), 300)); + + writeRows(format, fullType, path, rows); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(fullType, projectedType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(GenericRow.of(row.getInt(0)))); + reader.close(); + + assertThat(result.size()).isEqualTo(3); + assertThat(result.get(0).getInt(0)).isEqualTo(100); + assertThat(result.get(1).getInt(0)).isEqualTo(200); + assertThat(result.get(2).getInt(0)).isEqualTo(300); + } + + @Test + public void testProjectionMultipleColumns() throws IOException { + RowType fullType = + new RowType( + Arrays.asList( + new DataField(0, "a", new IntType()), + new DataField(1, "b", new VarCharType(100)), + new DataField(2, "c", new DoubleType()), + new DataField(3, "d", new BigIntType()))); + + RowType projectedType = + new RowType( + Arrays.asList( + new DataField(2, "c", new DoubleType()), + new DataField(0, "a", new IntType()))); + + Path path = new Path(tempDir.toUri().toString(), "projection_multi.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, BinaryString.fromString("x"), 1.1, 100L)); + rows.add(GenericRow.of(2, BinaryString.fromString("y"), 
2.2, 200L)); + + writeRows(format, fullType, path, rows); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(fullType, projectedType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(GenericRow.of(row.getDouble(0), row.getInt(1)))); + reader.close(); + + assertThat(result.size()).isEqualTo(2); + assertThat(result.get(0).getDouble(0)).isEqualTo(1.1); + assertThat(result.get(0).getInt(1)).isEqualTo(1); + assertThat(result.get(1).getDouble(0)).isEqualTo(2.2); + assertThat(result.get(1).getInt(1)).isEqualTo(2); + } + + @Test + public void testSelection() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "selection.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List expected = new ArrayList<>(); + for (int i = 0; i < 500; i++) { + expected.add(GenericRow.of(i)); + } + + writeRows(format, rowType, path, expected); + + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(10); + selection.add(100); + selection.add(499); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 10, 100, 499); + } + + @Test + public void testSelectionSkipsEntireBlocks() throws IOException { + RowType rowType = + RowType.builder() 
+ .fields(Arrays.asList(new IntType(), new VarCharType(100))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "selection_skip.row"); + Options options = new Options(); + options.setString("file.block-size", "256b"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + List rows = new ArrayList<>(); + for (int i = 0; i < 1000; i++) { + rows.add(GenericRow.of(i, BinaryString.fromString("val" + i))); + } + + writeRows(format, rowType, path, rows); + + RoaringBitmap32 selection = new RoaringBitmap32(); + selection.add(0); + selection.add(999); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path), selection)); + + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(row.getInt(0))); + reader.close(); + + assertThat(result).containsExactly(0, 999); + } + + @Test + public void testLargeVariableLengthData() throws IOException { + RowType rowType = + RowType.builder() + .fields(Arrays.asList(new IntType(), new VarCharType(10000))) + .build(); + + Path path = new Path(tempDir.toUri().toString(), "large_strings.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Random random = new Random(42); + List expected = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + StringBuilder sb = new StringBuilder(); + int len = random.nextInt(5000) + 100; + for (int j = 0; j < len; j++) { + sb.append((char) ('a' + random.nextInt(26))); + } + expected.add(GenericRow.of(i, BinaryString.fromString(sb.toString()))); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(expected.size()); + for (int i = 0; i < expected.size(); i++) { + 
assertThat(result.get(i).getInt(0)).isEqualTo(expected.get(i).getInt(0)); + assertThat(result.get(i).getString(1)).isEqualTo(expected.get(i).getString(1)); + } + } + + @Test + public void testEmptyFile() throws IOException { + RowType rowType = RowType.builder().fields(Arrays.asList(new IntType())).build(); + + Path path = new Path(tempDir.toUri().toString(), "empty.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + writeRows(format, rowType, path, new ArrayList<>()); + List result = readAllRows(format, rowType, path); + + assertThat(result).isEmpty(); + } + + @Test + public void testManyColumns() throws IOException { + int numCols = 100; + List fields = new ArrayList<>(); + for (int i = 0; i < numCols; i++) { + fields.add(new DataField(i, "f" + i, new IntType(true))); + } + RowType rowType = new RowType(fields); + + Path path = new Path(tempDir.toUri().toString(), "many_cols.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + for (int r = 0; r < 50; r++) { + Object[] values = new Object[numCols]; + for (int c = 0; c < numCols; c++) { + values[c] = (r + c) % 3 == 0 ? 
null : r * numCols + c; + } + expected.add(GenericRow.of(values)); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(50); + for (int r = 0; r < 50; r++) { + for (int c = 0; c < numCols; c++) { + if ((r + c) % 3 == 0) { + assertThat(result.get(r).isNullAt(c)).isTrue(); + } else { + assertThat(result.get(r).getInt(c)).isEqualTo(r * numCols + c); + } + } + } + } + + @Test + public void testRandomizedRoundTrip() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "f_int", new IntType(true)), + new DataField(1, "f_long", new BigIntType(true)), + new DataField(2, "f_str", new VarCharType(true, 500)), + new DataField(3, "f_double", new DoubleType(true)), + new DataField(4, "f_bool", new BooleanType(true)), + new DataField(5, "f_bytes", new VarBinaryType(true, 500)))); + + Path path = new Path(tempDir.toUri().toString(), "random.row"); + Options options = new Options(); + options.setString("file.block-size", "2kb"); + FileFormat format = FileFormat.fromIdentifier("row", options); + + Random random = new Random(12345); + int numRows = 5000; + List expected = new ArrayList<>(numRows); + for (int i = 0; i < numRows; i++) { + Object[] values = new Object[6]; + values[0] = random.nextBoolean() ? null : random.nextInt(); + values[1] = random.nextBoolean() ? null : random.nextLong(); + values[2] = + random.nextBoolean() + ? null + : BinaryString.fromString(randomString(random, random.nextInt(200))); + values[3] = random.nextBoolean() ? null : random.nextDouble(); + values[4] = random.nextBoolean() ? null : random.nextBoolean(); + values[5] = random.nextBoolean() ? 
null : randomBytes(random, random.nextInt(100)); + expected.add(GenericRow.of(values)); + } + + writeRows(format, rowType, path, expected); + List result = readAllRows(format, rowType, path); + + assertThat(result.size()).isEqualTo(numRows); + for (int i = 0; i < numRows; i++) { + InternalRow exp = expected.get(i); + InternalRow act = result.get(i); + for (int c = 0; c < 6; c++) { + assertThat(act.isNullAt(c)) + .as("row %d col %d null mismatch", i, c) + .isEqualTo(exp.isNullAt(c)); + } + if (!act.isNullAt(0)) { + assertThat(act.getInt(0)).isEqualTo(exp.getInt(0)); + } + if (!act.isNullAt(1)) { + assertThat(act.getLong(1)).isEqualTo(exp.getLong(1)); + } + if (!act.isNullAt(2)) { + assertThat(act.getString(2)).isEqualTo(exp.getString(2)); + } + if (!act.isNullAt(3)) { + assertThat(act.getDouble(3)).isEqualTo(exp.getDouble(3)); + } + if (!act.isNullAt(4)) { + assertThat(act.getBoolean(4)).isEqualTo(exp.getBoolean(4)); + } + if (!act.isNullAt(5)) { + assertThat(act.getBinary(5)).isEqualTo(exp.getBinary(5)); + } + } + } + + @Test + public void testBlobAndVectorTypes() throws IOException { + RowType rowType = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "data", new org.apache.paimon.types.BlobType()), + new DataField( + 2, + "embedding", + new org.apache.paimon.types.VectorType( + 4, new FloatType())))); + + Path path = new Path(tempDir.toUri().toString(), "blob_vector.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + LocalFileIO fileIO = new LocalFileIO(); + PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + writer.addElement( + GenericRow.of( + 1, + org.apache.paimon.data.Blob.fromData(new byte[] {10, 20, 30}), + org.apache.paimon.data.BinaryVector.fromPrimitiveArray( + new float[] {1.0f, 2.0f, 3.0f, 4.0f}))); + writer.addElement(GenericRow.of(2, null, null)); + writer.close(); + + 
FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + GenericRow copy = new GenericRow(3); + copy.setField(0, row.getInt(0)); + copy.setField(1, row.isNullAt(1) ? null : row.getBlob(1)); + copy.setField(2, row.isNullAt(2) ? null : row.getVector(2)); + result.add(copy); + }); + reader.close(); + + assertThat(result.size()).isEqualTo(2); + InternalRow row1 = result.get(0); + assertThat(row1.getInt(0)).isEqualTo(1); + assertThat(row1.getBlob(1).toData()).isEqualTo(new byte[] {10, 20, 30}); + org.apache.paimon.data.InternalVector vec = row1.getVector(2); + assertThat(vec.size()).isEqualTo(4); + assertThat(vec.getFloat(0)).isEqualTo(1.0f); + assertThat(vec.getFloat(1)).isEqualTo(2.0f); + assertThat(vec.getFloat(2)).isEqualTo(3.0f); + assertThat(vec.getFloat(3)).isEqualTo(4.0f); + + InternalRow row2 = result.get(1); + assertThat(row2.getInt(0)).isEqualTo(2); + assertThat(row2.isNullAt(1)).isTrue(); + assertThat(row2.isNullAt(2)).isTrue(); + } + + @Test + public void testNestedRowProjection() throws IOException { + RowType innerType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List expected = new ArrayList<>(); + expected.add(GenericRow.of(1, GenericRow.of(10, 100))); + expected.add(GenericRow.of(2, GenericRow.of(20, 200))); + writeRows(format, dataSchema, path, expected); + + // Read with projected type: only top-level 'r', nested only 'b' + RowType projectedInner = new 
RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", projectedInner))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List result = new ArrayList<>(); + reader.forEachRemaining( + row -> { + // projectedSchema is ROW> + // row.getRow(0, 1) should return the projected nested row with only 'b' + InternalRow nested = row.getRow(0, 1); + result.add(GenericRow.of(nested.getInt(0))); + }); + reader.close(); + + // nested.getInt(0) should be 'b' value (100, 200), not 'a' value (10, 20) + assertThat(result.size()).isEqualTo(2); + assertThat(result.get(0).getInt(0)).isEqualTo(100); + assertThat(result.get(1).getInt(0)).isEqualTo(200); + } + + @Test + public void testDeeplyNestedProjection() throws IOException { + // data: ROW>> + RowType level2 = + new RowType( + Arrays.asList( + new DataField(20, "a", new IntType()), + new DataField(21, "b", new IntType()), + new DataField(22, "c", new IntType()))); + RowType level1 = + new RowType( + Arrays.asList( + new DataField(10, "x", new IntType()), + new DataField(11, "l2", level2))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "l1", level1))); + + Path path = new Path(tempDir.toUri().toString(), "deep_nested_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, GenericRow.of(10, GenericRow.of(100, 200, 300)))); + rows.add(GenericRow.of(2, GenericRow.of(20, GenericRow.of(400, 500, 600)))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW>> + RowType projLevel2 = new RowType(Arrays.asList(new DataField(22, "c", new IntType()))); + 
RowType projLevel1 = new RowType(Arrays.asList(new DataField(11, "l2", projLevel2))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "l1", projLevel1))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + InternalRow l1 = row.getRow(0, 1); + InternalRow l2 = l1.getRow(0, 1); + results.add(l2.getInt(0)); + }); + reader.close(); + + assertThat(results).containsExactly(300, 600); + } + + @Test + public void testNestedProjectionWithNullRows() throws IOException { + // data: ROW> + RowType innerType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested_null_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, GenericRow.of(10, 100))); + rows.add(GenericRow.of(2, null)); + rows.add(GenericRow.of(3, GenericRow.of(30, 300))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW> + RowType projectedInner = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", projectedInner))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List nullFlags = new ArrayList<>(); 
+ List values = new ArrayList<>(); + reader.forEachRemaining( + row -> { + boolean isNull = row.isNullAt(0); + nullFlags.add(new boolean[] {isNull}); + if (!isNull) { + values.add(row.getRow(0, 1).getInt(0)); + } + }); + reader.close(); + + assertThat(nullFlags.size()).isEqualTo(3); + assertThat(nullFlags.get(0)[0]).isFalse(); + assertThat(nullFlags.get(1)[0]).isTrue(); + assertThat(nullFlags.get(2)[0]).isFalse(); + assertThat(values).containsExactly(100, 300); + } + + @Test + public void testMultipleNestedRowsProjection() throws IOException { + // data: ROW, r2 ROW, id INT> + RowType nested1 = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType nested2 = + new RowType( + Arrays.asList( + new DataField(20, "x", new IntType()), + new DataField(21, "y", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", nested1), + new DataField(1, "r2", nested2), + new DataField(2, "id", new IntType()))); + + Path path = new Path(tempDir.toUri().toString(), "multi_nested_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(GenericRow.of(1, 2), GenericRow.of(3, 4), 100)); + rows.add(GenericRow.of(GenericRow.of(5, 6), GenericRow.of(7, 8), 200)); + writeRows(format, dataSchema, path, rows); + + // projected: ROW, r2 ROW> + RowType projNested1 = new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projNested2 = new RowType(Arrays.asList(new DataField(20, "x", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "r1", projNested1), + new DataField(1, "r2", projNested2))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new 
FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + int b = row.getRow(0, 1).getInt(0); + int x = row.getRow(1, 1).getInt(0); + results.add(new int[] {b, x}); + }); + reader.close(); + + assertThat(results.size()).isEqualTo(2); + assertThat(results.get(0)).isEqualTo(new int[] {2, 3}); + assertThat(results.get(1)).isEqualTo(new int[] {6, 7}); + } + + @Test + public void testNestedProjectionWithFieldReordering() throws IOException { + // data: ROW> + RowType innerType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()), + new DataField(12, "c", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "id", new IntType()), + new DataField(1, "r", innerType))); + + Path path = new Path(tempDir.toUri().toString(), "nested_reorder_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add(GenericRow.of(1, GenericRow.of(10, 20, 30))); + rows.add(GenericRow.of(2, GenericRow.of(40, 50, 60))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW> (reversed order, skip 'b') + RowType projectedInner = + new RowType( + Arrays.asList( + new DataField(12, "c", new IntType()), + new DataField(10, "a", new IntType()))); + RowType projectedSchema = new RowType(Arrays.asList(new DataField(1, "r", projectedInner))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + InternalRow nested = row.getRow(0, 2); + results.add(new int[] {nested.getInt(0), nested.getInt(1)}); + }); + reader.close(); + + // c, a order + 
assertThat(results.size()).isEqualTo(2); + assertThat(results.get(0)).isEqualTo(new int[] {30, 10}); + assertThat(results.get(1)).isEqualTo(new int[] {60, 40}); + } + + @Test + public void testArrayElementProjection() throws IOException { + // data: ROW>> + RowType elementType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType(Arrays.asList(new DataField(0, "arr", new ArrayType(elementType)))); + + Path path = new Path(tempDir.toUri().toString(), "array_elem_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + List rows = new ArrayList<>(); + rows.add( + GenericRow.of( + new GenericArray( + new Object[] {GenericRow.of(1, 100), GenericRow.of(2, 200)}))); + rows.add(GenericRow.of(new GenericArray(new Object[] {GenericRow.of(3, 300)}))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW>> + RowType projectedElementType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField(0, "arr", new ArrayType(projectedElementType)))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + InternalRow.FieldGetter arrayGetter = + InternalRow.createFieldGetter(new ArrayType(projectedElementType), 0); + org.apache.paimon.data.InternalArray arr = row.getArray(0); + for (int i = 0; i < arr.size(); i++) { + results.add(arr.getRow(i, 1).getInt(0)); + } + }); + reader.close(); + + // Should get 'b' values (100, 200, 300), not 'a' values (1, 2, 3) + assertThat(results).containsExactly(100, 200, 300); + } + + @Test + public void 
testMapValueProjection() throws IOException { + // data: ROW>> + RowType valueType = + new RowType( + Arrays.asList( + new DataField(10, "a", new IntType()), + new DataField(11, "b", new IntType()))); + RowType dataSchema = + new RowType( + Arrays.asList( + new DataField(0, "m", new MapType(new IntType(), valueType)))); + + Path path = new Path(tempDir.toUri().toString(), "map_value_proj.row"); + FileFormat format = FileFormat.fromIdentifier("row", new Options()); + + Map mapData = new java.util.HashMap<>(); + mapData.put(1, GenericRow.of(10, 100)); + mapData.put(2, GenericRow.of(20, 200)); + List rows = new ArrayList<>(); + rows.add(GenericRow.of(new GenericMap(mapData))); + writeRows(format, dataSchema, path, rows); + + // projected: ROW>> + RowType projectedValueType = + new RowType(Arrays.asList(new DataField(11, "b", new IntType()))); + RowType projectedSchema = + new RowType( + Arrays.asList( + new DataField( + 0, "m", new MapType(new IntType(), projectedValueType)))); + + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(dataSchema, projectedSchema, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + + List results = new ArrayList<>(); + reader.forEachRemaining( + row -> { + org.apache.paimon.data.InternalMap m = row.getMap(0); + org.apache.paimon.data.InternalArray keys = m.keyArray(); + org.apache.paimon.data.InternalArray values = m.valueArray(); + for (int i = 0; i < m.size(); i++) { + results.add(values.getRow(i, 1).getInt(0)); + } + }); + reader.close(); + + // Should get 'b' values (100, 200), not 'a' values (10, 20) + assertThat(results).containsExactlyInAnyOrder(100, 200); + } + + // ======================== Helpers ======================== + + private void writeRows(FileFormat format, RowType rowType, Path path, List rows) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + 
PositionOutputStream out = fileIO.newOutputStream(path, false); + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + for (InternalRow row : rows) { + writer.addElement(row); + } + writer.close(); + } + + private List readAllRows(FileFormat format, RowType rowType, Path path) + throws IOException { + LocalFileIO fileIO = new LocalFileIO(); + FormatReaderFactory readerFactory = + format.createReaderFactory(rowType, rowType, new ArrayList<>()); + FileRecordReader reader = + readerFactory.createReader( + new FormatReaderContext(fileIO, path, fileIO.getFileSize(path))); + List result = new ArrayList<>(); + reader.forEachRemaining(row -> result.add(copyRow(row, rowType))); + reader.close(); + return result; + } + + private GenericRow copyRow(InternalRow row, RowType rowType) { + int arity = rowType.getFieldCount(); + GenericRow copy = new GenericRow(arity); + for (int i = 0; i < arity; i++) { + if (row.isNullAt(i)) { + copy.setField(i, null); + } else { + copy.setField(i, copyField(row, i, rowType.getTypeAt(i))); + } + } + return copy; + } + + private Object copyField(InternalRow row, int i, org.apache.paimon.types.DataType type) { + switch (type.getTypeRoot()) { + case BOOLEAN: + return row.getBoolean(i); + case TINYINT: + return row.getByte(i); + case SMALLINT: + return row.getShort(i); + case INTEGER: + case DATE: + case TIME_WITHOUT_TIME_ZONE: + return row.getInt(i); + case BIGINT: + return row.getLong(i); + case FLOAT: + return row.getFloat(i); + case DOUBLE: + return row.getDouble(i); + case CHAR: + case VARCHAR: + return row.getString(i); + case BINARY: + case VARBINARY: + return row.getBinary(i); + case DECIMAL: + { + int p = ((org.apache.paimon.types.DecimalType) type).getPrecision(); + int s = ((org.apache.paimon.types.DecimalType) type).getScale(); + return row.getDecimal(i, p, s); + } + case TIMESTAMP_WITHOUT_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + { + int p = ((org.apache.paimon.types.TimestampType) 
type).getPrecision(); + return row.getTimestamp(i, p); + } + case VARIANT: + return row.getVariant(i); + case ARRAY: + return row.getArray(i); + case MAP: + return row.getMap(i); + case MULTISET: + return row.getMap(i); + case ROW: + return row.getRow(i, ((RowType) type).getFieldCount()); + default: + throw new UnsupportedOperationException("Unsupported: " + type); + } + } + + private static String randomString(Random random, int length) { + StringBuilder sb = new StringBuilder(length); + for (int i = 0; i < length; i++) { + sb.append((char) ('a' + random.nextInt(26))); + } + return sb.toString(); + } + + private static byte[] randomBytes(Random random, int length) { + byte[] bytes = new byte[length]; + random.nextBytes(bytes); + return bytes; + } +}