-
Notifications
You must be signed in to change notification settings - Fork 188
Paimon Source Support #742
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4a5b40b
a8051db
3ca95a7
8aefef9
fc72c38
a974188
4c62138
0504f9c
251e7c7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.xtable.paimon; | ||
|
|
||
| import java.io.IOException; | ||
| import java.time.Instant; | ||
| import java.util.List; | ||
|
|
||
| import lombok.extern.log4j.Log4j2; | ||
|
|
||
| import org.apache.paimon.Snapshot; | ||
| import org.apache.paimon.schema.SchemaManager; | ||
| import org.apache.paimon.schema.TableSchema; | ||
| import org.apache.paimon.table.FileStoreTable; | ||
| import org.apache.paimon.utils.SnapshotManager; | ||
|
|
||
| import org.apache.xtable.exception.ReadException; | ||
| import org.apache.xtable.model.*; | ||
| import org.apache.xtable.model.schema.InternalPartitionField; | ||
| import org.apache.xtable.model.schema.InternalSchema; | ||
| import org.apache.xtable.model.storage.DataLayoutStrategy; | ||
| import org.apache.xtable.model.storage.InternalDataFile; | ||
| import org.apache.xtable.model.storage.PartitionFileGroup; | ||
| import org.apache.xtable.model.storage.TableFormat; | ||
| import org.apache.xtable.spi.extractor.ConversionSource; | ||
|
|
||
| @Log4j2 | ||
| public class PaimonConversionSource implements ConversionSource<Snapshot> { | ||
|
|
||
| private final FileStoreTable paimonTable; | ||
| private final SchemaManager schemaManager; | ||
| private final SnapshotManager snapshotManager; | ||
|
|
||
| private final PaimonDataFileExtractor dataFileExtractor = PaimonDataFileExtractor.getInstance(); | ||
| private final PaimonSchemaExtractor schemaExtractor = PaimonSchemaExtractor.getInstance(); | ||
| private final PaimonPartitionExtractor partitionSpecExtractor = | ||
| PaimonPartitionExtractor.getInstance(); | ||
|
|
||
| public PaimonConversionSource(FileStoreTable paimonTable) { | ||
| this.paimonTable = paimonTable; | ||
| this.schemaManager = paimonTable.schemaManager(); | ||
| this.snapshotManager = paimonTable.snapshotManager(); | ||
| } | ||
|
|
||
| @Override | ||
| public InternalTable getTable(Snapshot snapshot) { | ||
| TableSchema paimonSchema = schemaManager.schema(snapshot.schemaId()); | ||
| InternalSchema internalSchema = schemaExtractor.toInternalSchema(paimonSchema); | ||
|
|
||
| List<String> partitionKeys = paimonTable.partitionKeys(); | ||
| List<InternalPartitionField> partitioningFields = | ||
| partitionSpecExtractor.toInternalPartitionFields(partitionKeys, internalSchema); | ||
|
|
||
| return InternalTable.builder() | ||
| .name(paimonTable.name()) | ||
| .tableFormat(TableFormat.PAIMON) | ||
| .readSchema(internalSchema) | ||
| .layoutStrategy(DataLayoutStrategy.HIVE_STYLE_PARTITION) | ||
| .basePath(paimonTable.location().toString()) | ||
| .partitioningFields(partitioningFields) | ||
| .latestCommitTime(Instant.ofEpochMilli(snapshot.timeMillis())) | ||
| .latestMetadataPath(snapshotManager.snapshotPath(snapshot.id()).toString()) | ||
| .build(); | ||
| } | ||
|
|
||
| @Override | ||
| public InternalTable getCurrentTable() { | ||
| Snapshot snapshot = getLastSnapshot(); | ||
| return getTable(snapshot); | ||
| } | ||
|
|
||
| @Override | ||
| public InternalSnapshot getCurrentSnapshot() { | ||
| Snapshot snapshot = getLastSnapshot(); | ||
| InternalTable internalTable = getTable(snapshot); | ||
| InternalSchema internalSchema = internalTable.getReadSchema(); | ||
| List<InternalDataFile> dataFiles = | ||
| dataFileExtractor.toInternalDataFiles(paimonTable, snapshot, internalSchema); | ||
|
|
||
| return InternalSnapshot.builder() | ||
| .table(internalTable) | ||
| .version(Long.toString(snapshot.timeMillis())) | ||
| .partitionedDataFiles(PartitionFileGroup.fromFiles(dataFiles)) | ||
| // TODO : Implement pending commits extraction, required for incremental sync | ||
| // https://github.com/apache/incubator-xtable/issues/754 | ||
| .sourceIdentifier(getCommitIdentifier(snapshot)) | ||
| .build(); | ||
| } | ||
|
|
||
| private Snapshot getLastSnapshot() { | ||
| SnapshotManager snapshotManager = paimonTable.snapshotManager(); | ||
| Snapshot snapshot = snapshotManager.latestSnapshot(); | ||
| if (snapshot == null) { | ||
| throw new ReadException("No snapshots found for table " + paimonTable.name()); | ||
| } | ||
| return snapshot; | ||
| } | ||
|
|
||
| @Override | ||
| public TableChange getTableChangeForCommit(Snapshot snapshot) { | ||
| throw new UnsupportedOperationException("Incremental Sync is not supported yet."); | ||
| } | ||
|
|
||
| @Override | ||
| public CommitsBacklog<Snapshot> getCommitsBacklog( | ||
| InstantsForIncrementalSync instantsForIncrementalSync) { | ||
| throw new UnsupportedOperationException("Incremental Sync is not supported yet."); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean isIncrementalSyncSafeFrom(Instant instant) { | ||
| return false; // Incremental sync is not supported yet | ||
| } | ||
|
|
||
| @Override | ||
| public String getCommitIdentifier(Snapshot snapshot) { | ||
| return Long.toString(snapshot.commitIdentifier()); | ||
| } | ||
|
|
||
| @Override | ||
| public void close() throws IOException {} | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,52 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.xtable.paimon; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
| import org.apache.paimon.Snapshot; | ||
| import org.apache.paimon.catalog.CatalogContext; | ||
| import org.apache.paimon.fs.FileIO; | ||
| import org.apache.paimon.fs.Path; | ||
| import org.apache.paimon.options.Options; | ||
| import org.apache.paimon.table.FileStoreTable; | ||
| import org.apache.paimon.table.FileStoreTableFactory; | ||
|
|
||
| import org.apache.xtable.conversion.ConversionSourceProvider; | ||
| import org.apache.xtable.conversion.SourceTable; | ||
| import org.apache.xtable.exception.ReadException; | ||
| import org.apache.xtable.spi.extractor.ConversionSource; | ||
|
|
||
| public class PaimonConversionSourceProvider extends ConversionSourceProvider<Snapshot> { | ||
| @Override | ||
| public ConversionSource<Snapshot> getConversionSourceInstance(SourceTable sourceTableConfig) { | ||
| try { | ||
| Options catalogOptions = new Options(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need any changes right now but will the user want to supply some custom options here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably yes, but I'm not sure what they look like right now... |
||
| CatalogContext context = CatalogContext.create(catalogOptions, hadoopConf); | ||
|
|
||
| Path path = new Path(sourceTableConfig.getDataPath()); | ||
| FileIO fileIO = FileIO.get(path, context); | ||
| FileStoreTable paimonTable = FileStoreTableFactory.create(fileIO, path); | ||
|
|
||
| return new PaimonConversionSource(paimonTable); | ||
| } catch (IOException e) { | ||
| throw new ReadException("Failed to read Paimon table from file system", e); | ||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,99 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.xtable.paimon; | ||
|
|
||
| import java.util.*; | ||
|
|
||
| import org.apache.paimon.Snapshot; | ||
| import org.apache.paimon.io.DataFileMeta; | ||
| import org.apache.paimon.manifest.ManifestEntry; | ||
| import org.apache.paimon.table.FileStoreTable; | ||
| import org.apache.paimon.table.source.snapshot.SnapshotReader; | ||
|
|
||
| import org.apache.xtable.model.schema.InternalSchema; | ||
| import org.apache.xtable.model.stat.ColumnStat; | ||
| import org.apache.xtable.model.storage.InternalDataFile; | ||
|
|
||
| public class PaimonDataFileExtractor { | ||
|
|
||
| private final PaimonPartitionExtractor partitionExtractor = | ||
| PaimonPartitionExtractor.getInstance(); | ||
|
|
||
| private static final PaimonDataFileExtractor INSTANCE = new PaimonDataFileExtractor(); | ||
|
|
||
| public static PaimonDataFileExtractor getInstance() { | ||
| return INSTANCE; | ||
| } | ||
|
|
||
| public List<InternalDataFile> toInternalDataFiles( | ||
| FileStoreTable table, Snapshot snapshot, InternalSchema internalSchema) { | ||
| List<InternalDataFile> result = new ArrayList<>(); | ||
| Iterator<ManifestEntry> manifestEntryIterator = | ||
| newSnapshotReader(table, snapshot).readFileIterator(); | ||
| while (manifestEntryIterator.hasNext()) { | ||
| result.add(toInternalDataFile(table, manifestEntryIterator.next(), internalSchema)); | ||
| } | ||
| return result; | ||
| } | ||
|
|
||
| private InternalDataFile toInternalDataFile( | ||
| FileStoreTable table, ManifestEntry entry, InternalSchema internalSchema) { | ||
| return InternalDataFile.builder() | ||
| .physicalPath(toFullPhysicalPath(table, entry)) | ||
| .fileSizeBytes(entry.file().fileSize()) | ||
| .lastModified(entry.file().creationTimeEpochMillis()) | ||
| .recordCount(entry.file().rowCount()) | ||
| .partitionValues( | ||
| partitionExtractor.toPartitionValues(table, entry.partition(), internalSchema)) | ||
| .columnStats(toColumnStats(entry.file())) | ||
| .build(); | ||
| } | ||
|
|
||
| private String toFullPhysicalPath(FileStoreTable table, ManifestEntry entry) { | ||
| String basePath = table.location().toString(); | ||
| String bucketPath = "bucket-" + entry.bucket(); | ||
| String filePath = entry.file().fileName(); | ||
|
|
||
| Optional<String> partitionPath = partitionExtractor.toPartitionPath(table, entry.partition()); | ||
| if (partitionPath.isPresent()) { | ||
| return String.join("/", basePath, partitionPath.get(), bucketPath, filePath); | ||
| } else { | ||
| return String.join("/", basePath, bucketPath, filePath); | ||
| } | ||
| } | ||
|
|
||
| private List<ColumnStat> toColumnStats(DataFileMeta file) { | ||
| // TODO: Implement logic to extract column stats from the file meta | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's track this as a separate GH issue if we are not already.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| // https://github.com/apache/incubator-xtable/issues/755 | ||
| return Collections.emptyList(); | ||
| } | ||
|
|
||
| private SnapshotReader newSnapshotReader(FileStoreTable table, Snapshot snapshot) { | ||
| // If the table has primary keys, we read only the top level files | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just curious, is this similar to the Hudi Merge on Read table?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, they are similar: https://paimon.apache.org/docs/1.2/primary-key-table/overview/#sorted-runs |
||
| // which means we can only consider fully compacted files. | ||
| if (!table.schema().primaryKeys().isEmpty()) { | ||
| return table | ||
| .newSnapshotReader() | ||
| .withLevel(table.coreOptions().numLevels() - 1) | ||
| .withSnapshot(snapshot); | ||
| } else { | ||
| return table.newSnapshotReader().withSnapshot(snapshot); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nitpick: Can you create a GH Issue to track the incremental sync work if one does not already exist?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tracked in #754: https://github.com/apache/incubator-xtable/issues/754