diff --git a/.github/workflows/utitcase-spark-4.x.yml b/.github/workflows/utitcase-spark-4.x.yml
index 56629110f503..993fa97ba2cf 100644
--- a/.github/workflows/utitcase-spark-4.x.yml
+++ b/.github/workflows/utitcase-spark-4.x.yml
@@ -61,7 +61,7 @@ jobs:
jvm_timezone=$(random_timezone)
echo "JVM timezone is set to $jvm_timezone"
test_modules=""
- for suffix in ut 4.0; do
+ for suffix in ut 4.0 4.1; do
test_modules+="org.apache.paimon:paimon-spark-${suffix}_2.13,"
done
test_modules="${test_modules%,}"
diff --git a/docs/content/spark/quick-start.md b/docs/content/spark/quick-start.md
index 58530ebcb73e..524d82a16352 100644
--- a/docs/content/spark/quick-start.md
+++ b/docs/content/spark/quick-start.md
@@ -30,7 +30,7 @@ under the License.
Paimon supports the following Spark versions with their respective Java and Scala compatibility. We recommend using the latest Spark version for a better experience.
-- Spark 4.x (including 4.0) : Pre-built with Java 17 and Scala 2.13
+- Spark 4.x (including 4.1, 4.0) : Pre-built with Java 17 and Scala 2.13
- Spark 3.x (including 3.5, 3.4, 3.3, 3.2) : Pre-built with Java 8 and Scala 2.12/2.13
@@ -40,6 +40,7 @@ Download the jar file with corresponding version.
| Version | Jar (Scala 2.12) | Jar (Scala 2.13) |
|-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Spark 4.1 | - | [paimon-spark-4.1_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-4.1_2.13/{{< version >}}/paimon-spark-4.1_2.13-{{< version >}}.jar) |
| Spark 4.0 | - | [paimon-spark-4.0_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-4.0_2.13/{{< version >}}/paimon-spark-4.0_2.13-{{< version >}}.jar) |
| Spark 3.5 | [paimon-spark-3.5_2.12-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.5_2.12/{{< version >}}/paimon-spark-3.5_2.12-{{< version >}}.jar) | [paimon-spark-3.5_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.5_2.13/{{< version >}}/paimon-spark-3.5_2.13-{{< version >}}.jar) |
| Spark 3.4 | [paimon-spark-3.4_2.12-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.4_2.12/{{< version >}}/paimon-spark-3.4_2.12-{{< version >}}.jar) | [paimon-spark-3.4_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.4_2.13/{{< version >}}/paimon-spark-3.4_2.13-{{< version >}}.jar) |
@@ -52,6 +53,7 @@ Download the jar file with corresponding version.
| Version | Jar (Scala 2.12) | Jar (Scala 2.13) |
|-----------|-----------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------|
+| Spark 4.1 | - | [paimon-spark-4.1_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-4.1_2.13/{{< version >}}/) |
| Spark 4.0 | - | [paimon-spark-4.0_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-4.0_2.13/{{< version >}}/) |
| Spark 3.5 | [paimon-spark-3.5_2.12-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.5_2.12/{{< version >}}/) | [paimon-spark-3.5_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.5_2.13/{{< version >}}/) |
| Spark 3.4 | [paimon-spark-3.4_2.12-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.4_2.12/{{< version >}}/) | [paimon-spark-3.4_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.4_2.13/{{< version >}}/) |
@@ -73,6 +75,9 @@ mvn clean package -DskipTests -pl paimon-spark/paimon-spark-3.5 -am -Pscala-2.13
# build paimon spark 4.0
mvn clean package -DskipTests -pl paimon-spark/paimon-spark-4.0 -am -Pspark4
+
+# build paimon spark 4.1
+mvn clean package -DskipTests -pl paimon-spark/paimon-spark-4.1 -am -Pspark4
```
For Spark 3.5, you can find the bundled jar in `./paimon-spark/paimon-spark-3.5/target/paimon-spark-3.5_2.12-{{< version >}}.jar`.
diff --git a/paimon-spark/paimon-spark-4.1/pom.xml b/paimon-spark/paimon-spark-4.1/pom.xml
new file mode 100644
index 000000000000..74a30570cc5b
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/pom.xml
@@ -0,0 +1,168 @@
+
+
+
+ 4.0.0
+
+
+ org.apache.paimon
+ paimon-spark
+ 1.4-SNAPSHOT
+
+
+ paimon-spark-4.1_2.13
+ Paimon : Spark : 4.1 : 2.13
+
+
+ 4.1.1
+
+
+
+
+ org.apache.paimon
+ paimon-format
+
+
+
+ org.apache.paimon
+ paimon-spark4-common_${scala.binary.version}
+ ${project.version}
+
+
+ org.apache.spark
+ spark-sql-api_${scala.binary.version}
+
+
+
+
+
+ org.apache.paimon
+ paimon-spark-common_${scala.binary.version}
+ ${project.version}
+
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${spark.version}
+
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${spark.version}
+
+
+
+ org.apache.spark
+ spark-catalyst_${scala.binary.version}
+ ${spark.version}
+
+
+
+ org.apache.spark
+ spark-hive_${scala.binary.version}
+ ${spark.version}
+
+
+
+
+
+ org.apache.paimon
+ paimon-spark-ut_${scala.binary.version}
+ ${project.version}
+ tests
+ test
+
+
+ *
+ *
+
+
+
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${spark.version}
+ tests
+ test
+
+
+ org.apache.spark
+ spark-connect-shims_${scala.binary.version}
+
+
+
+
+
+ org.apache.spark
+ spark-catalyst_${scala.binary.version}
+ ${spark.version}
+ tests
+ test
+
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${spark.version}
+ tests
+ test
+
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+
+
+ shade-paimon
+ package
+
+ shade
+
+
+
+
+ *
+
+ com/github/luben/zstd/**
+ **/*libzstd-jni-*.so
+ **/*libzstd-jni-*.dll
+
+
+
+
+
+ org.apache.paimon:paimon-spark4-common_${scala.binary.version}
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala
new file mode 100644
index 000000000000..e86195f1af0b
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.catalyst.optimizer
+
+import org.apache.paimon.spark.PaimonScan
+
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, ExprId, ScalarSubquery, SortOrder}
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation
+
+object MergePaimonScalarSubqueries extends MergePaimonScalarSubqueriesBase {
+
+ override def tryMergeDataSourceV2ScanRelation(
+ newV2ScanRelation: DataSourceV2ScanRelation,
+ cachedV2ScanRelation: DataSourceV2ScanRelation)
+ : Option[(LogicalPlan, AttributeMap[Attribute])] = {
+ (newV2ScanRelation, cachedV2ScanRelation) match {
+ case (
+ DataSourceV2ScanRelation(
+ newRelation,
+ newScan: PaimonScan,
+ newOutput,
+ newPartitioning,
+ newOrdering),
+ DataSourceV2ScanRelation(
+ cachedRelation,
+ cachedScan: PaimonScan,
+ _,
+ cachedPartitioning,
+ cacheOrdering)) =>
+ checkIdenticalPlans(newRelation, cachedRelation).flatMap {
+ outputMap =>
+ if (
+ samePartitioning(newPartitioning, cachedPartitioning, outputMap) && sameOrdering(
+ newOrdering,
+ cacheOrdering,
+ outputMap)
+ ) {
+ mergePaimonScan(newScan, cachedScan).map {
+ mergedScan =>
+ val mergedAttributes = mergedScan
+ .readSchema()
+ .map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)())
+ val cachedOutputNameMap = cachedRelation.output.map(a => a.name -> a).toMap
+ val mergedOutput =
+ mergedAttributes.map(a => cachedOutputNameMap.getOrElse(a.name, a))
+ val newV2ScanRelation =
+ cachedV2ScanRelation.copy(scan = mergedScan, output = mergedOutput)
+
+ val mergedOutputNameMap = mergedOutput.map(a => a.name -> a).toMap
+ val newOutputMap =
+ AttributeMap(newOutput.map(a => a -> mergedOutputNameMap(a.name).toAttribute))
+
+ newV2ScanRelation -> newOutputMap
+ }
+ } else {
+ None
+ }
+ }
+
+ case _ => None
+ }
+ }
+
+ private def sameOrdering(
+ newOrdering: Option[Seq[SortOrder]],
+ cachedOrdering: Option[Seq[SortOrder]],
+ outputAttrMap: AttributeMap[Attribute]): Boolean = {
+ val mappedNewOrdering = newOrdering.map(_.map(mapAttributes(_, outputAttrMap)))
+ mappedNewOrdering.map(_.map(_.canonicalized)) == cachedOrdering.map(_.map(_.canonicalized))
+ }
+
+ override protected def createScalarSubquery(plan: LogicalPlan, exprId: ExprId): ScalarSubquery = {
+ ScalarSubquery(plan, exprId = exprId)
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala
new file mode 100644
index 000000000000..9fb3a7b54a25
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.catalog.CatalogUtils
+import org.apache.spark.sql.catalyst.plans.logical.TableSpec
+import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH
+
+trait PaimonStrategyHelper {
+
+ def spark: SparkSession
+
+ protected def makeQualifiedDBObjectPath(location: String): String = {
+ CatalogUtils.makeQualifiedDBObjectPath(
+ spark.sharedState.conf.get(WAREHOUSE_PATH),
+ location,
+ spark.sharedState.hadoopConf)
+ }
+
+ protected def qualifyLocInTableSpec(tableSpec: TableSpec): TableSpec = {
+ tableSpec.copy(location = tableSpec.location.map(makeQualifiedDBObjectPath(_)))
+ }
+
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala
new file mode 100644
index 000000000000..61e25b7c16a9
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.shim
+
+import org.apache.paimon.CoreOptions
+import org.apache.paimon.iceberg.IcebergOptions
+import org.apache.paimon.spark.SparkCatalog
+import org.apache.paimon.spark.catalog.FormatTableCatalog
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.analysis.ResolvedIdentifier
+import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, LogicalPlan, TableSpec}
+import org.apache.spark.sql.connector.catalog.StagingTableCatalog
+import org.apache.spark.sql.execution.{PaimonStrategyHelper, SparkPlan, SparkStrategy}
+import org.apache.spark.sql.execution.datasources.v2.CreateTableAsSelectExec
+
+import scala.collection.JavaConverters._
+
+case class PaimonCreateTableAsSelectStrategy(spark: SparkSession)
+ extends SparkStrategy
+ with PaimonStrategyHelper {
+
+ import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._
+
+ override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
+ case CreateTableAsSelect(
+ ResolvedIdentifier(catalog: SparkCatalog, ident),
+ parts,
+ query,
+ tableSpec: TableSpec,
+ options,
+ ifNotExists,
+ true) =>
+ catalog match {
+ case _: StagingTableCatalog =>
+ throw new RuntimeException("Paimon can't extend StagingTableCatalog for now.")
+ case _ =>
+ val coreOptionKeys = CoreOptions.getOptions.asScala.map(_.key()).toSeq
+
+            // Treat Iceberg compatibility option keys as table properties as well, so that
+            // options passed via the DataFrame writer are not misclassified as write options
+ val icebergOptionKeys = IcebergOptions.getOptions.asScala.map(_.key()).toSeq
+
+ val allTableOptionKeys = coreOptionKeys ++ icebergOptionKeys
+
+ val (tableOptions, writeOptions) = options.partition {
+ case (key, _) => allTableOptionKeys.contains(key)
+ }
+ val newTableSpec = tableSpec.copy(properties = tableSpec.properties ++ tableOptions)
+
+ val isPartitionedFormatTable = {
+ catalog match {
+ case catalog: FormatTableCatalog =>
+ catalog.isFormatTable(newTableSpec.provider.orNull) && parts.nonEmpty
+ case _ => false
+ }
+ }
+
+ if (isPartitionedFormatTable) {
+ throw new UnsupportedOperationException(
+ "Using CTAS with partitioned format table is not supported yet.")
+ }
+
+ CreateTableAsSelectExec(
+ catalog.asTableCatalog,
+ ident,
+ parts,
+ query,
+ qualifyLocInTableSpec(newTableSpec),
+ writeOptions,
+ ifNotExists) :: Nil
+ }
+ case _ => Nil
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar b/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar
new file mode 100644
index 000000000000..a5bfa456f668
Binary files /dev/null and b/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar differ
diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml b/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml
new file mode 100644
index 000000000000..bdf2bb090760
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml
@@ -0,0 +1,56 @@
+
+
+
+
+ hive.metastore.integral.jdo.pushdown
+ true
+
+
+
+ hive.metastore.schema.verification
+ false
+
+
+
+ hive.metastore.client.capability.check
+ false
+
+
+
+ datanucleus.schema.autoCreateTables
+ true
+
+
+
+ datanucleus.schema.autoCreateAll
+ true
+
+
+
+
+ datanucleus.connectionPoolingType
+ DBCP
+
+
+
+ hive.metastore.uris
+ thrift://localhost:9090
+ Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.
+
+
\ No newline at end of file
diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties b/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties
new file mode 100644
index 000000000000..6f324f5863ac
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties
@@ -0,0 +1,38 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+# Set the root logger level to OFF so test runs do not flood build logs;
+# change it manually to INFO when debugging locally.
+rootLogger.level = OFF
+rootLogger.appenderRef.test.ref = TestLogger
+
+appender.testlogger.name = TestLogger
+appender.testlogger.type = CONSOLE
+appender.testlogger.target = SYSTEM_ERR
+appender.testlogger.layout.type = PatternLayout
+appender.testlogger.layout.pattern = %-4r [%tid %t] %-5p %c %x - %m%n
+
+logger.kafka.name = kafka
+logger.kafka.level = OFF
+logger.kafka2.name = state.change
+logger.kafka2.level = OFF
+
+logger.zookeeper.name = org.apache.zookeeper
+logger.zookeeper.level = OFF
+logger.I0Itec.name = org.I0Itec
+logger.I0Itec.level = OFF
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala
new file mode 100644
index 000000000000..9b9393be7118
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+class PaimonCDCSourceTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ test("Paimon CDC Source: batch write and streaming read change-log with default scan mode") {
+ withTempDir {
+ checkpointDir =>
+ val tableName = "T"
+ spark.sql(s"DROP TABLE IF EXISTS $tableName")
+ spark.sql(s"""
+ |CREATE TABLE $tableName (a INT, b STRING)
+ |TBLPROPERTIES (
+ | 'primary-key'='a',
+ | 'bucket'='2',
+ | 'changelog-producer' = 'lookup')
+ |""".stripMargin)
+
+ spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1')")
+ spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2')")
+ spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2_new')")
+
+ val table = loadTable(tableName)
+ val location = table.location().toString
+
+ val readStream = spark.readStream
+ .format("paimon")
+ .option("read.changelog", "true")
+ .load(location)
+ .writeStream
+ .format("memory")
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .queryName("mem_table")
+ .outputMode("append")
+ .start()
+
+ val currentResult = () => spark.sql("SELECT * FROM mem_table")
+ try {
+ readStream.processAllAvailable()
+ val expertResult1 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2_new") :: Nil
+ checkAnswer(currentResult(), expertResult1)
+
+ spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1_new'), (3, 'v_3')")
+ readStream.processAllAvailable()
+ val expertResult2 =
+ Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row(
+ "+I",
+ 2,
+ "v_2_new") :: Row("+I", 3, "v_3") :: Nil
+ checkAnswer(currentResult(), expertResult2)
+ } finally {
+ readStream.stop()
+ }
+ }
+ }
+
+ test("Paimon CDC Source: batch write and streaming read change-log with scan.snapshot-id") {
+ withTempDir {
+ checkpointDir =>
+ val tableName = "T"
+ spark.sql(s"DROP TABLE IF EXISTS $tableName")
+ spark.sql(s"""
+ |CREATE TABLE $tableName (a INT, b STRING)
+ |TBLPROPERTIES (
+ | 'primary-key'='a',
+ | 'bucket'='2',
+ | 'changelog-producer' = 'lookup')
+ |""".stripMargin)
+
+ spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1')")
+ spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2')")
+ spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2_new')")
+
+ val table = loadTable(tableName)
+ val location = table.location().toString
+
+ val readStream = spark.readStream
+ .format("paimon")
+ .option("read.changelog", "true")
+ .option("scan.mode", "from-snapshot")
+ .option("scan.snapshot-id", 1)
+ .load(location)
+ .writeStream
+ .format("memory")
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .queryName("mem_table")
+ .outputMode("append")
+ .start()
+
+ val currentResult = () => spark.sql("SELECT * FROM mem_table")
+ try {
+ readStream.processAllAvailable()
+ val expertResult1 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Row(
+ "-U",
+ 2,
+ "v_2") :: Row("+U", 2, "v_2_new") :: Nil
+ checkAnswer(currentResult(), expertResult1)
+
+ spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1_new'), (3, 'v_3')")
+ readStream.processAllAvailable()
+ val expertResult2 =
+ Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row(
+ "+I",
+ 2,
+ "v_2") :: Row("-U", 2, "v_2") :: Row("+U", 2, "v_2_new") :: Row("+I", 3, "v_3") :: Nil
+ checkAnswer(currentResult(), expertResult2)
+ } finally {
+ readStream.stop()
+ }
+ }
+ }
+
+ test("Paimon CDC Source: streaming write and streaming read change-log") {
+ withTempDirs {
+ (checkpointDir1, checkpointDir2) =>
+ val tableName = "T"
+ spark.sql(s"DROP TABLE IF EXISTS $tableName")
+ spark.sql(s"""
+ |CREATE TABLE $tableName (a INT, b STRING)
+ |TBLPROPERTIES (
+ | 'primary-key'='a',
+ | 'bucket'='2',
+ | 'changelog-producer' = 'lookup')
+ |""".stripMargin)
+
+ val table = loadTable(tableName)
+ val location = table.location().toString
+
+ // streaming write
+ val inputData = MemoryStream[(Int, String)]
+ val writeStream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir1.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ // streaming read
+ val readStream = spark.readStream
+ .format("paimon")
+ .option("read.changelog", "true")
+ .option("scan.mode", "from-snapshot")
+ .option("scan.snapshot-id", 1)
+ .load(location)
+ .writeStream
+ .format("memory")
+ .option("checkpointLocation", checkpointDir2.getCanonicalPath)
+ .queryName("mem_table")
+ .outputMode("append")
+ .start()
+
+ val currentResult = () => spark.sql("SELECT * FROM mem_table")
+ try {
+ inputData.addData((1, "v_1"))
+ writeStream.processAllAvailable()
+ readStream.processAllAvailable()
+ val expertResult1 = Row("+I", 1, "v_1") :: Nil
+ checkAnswer(currentResult(), expertResult1)
+
+ inputData.addData((2, "v_2"))
+ writeStream.processAllAvailable()
+ readStream.processAllAvailable()
+ val expertResult2 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Nil
+ checkAnswer(currentResult(), expertResult2)
+
+ inputData.addData((2, "v_2_new"))
+ writeStream.processAllAvailable()
+ readStream.processAllAvailable()
+ val expertResult3 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Row(
+ "-U",
+ 2,
+ "v_2") :: Row("+U", 2, "v_2_new") :: Nil
+ checkAnswer(currentResult(), expertResult3)
+
+ inputData.addData((1, "v_1_new"), (3, "v_3"))
+ writeStream.processAllAvailable()
+ readStream.processAllAvailable()
+ val expertResult4 =
+ Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row(
+ "+I",
+ 2,
+ "v_2") :: Row("-U", 2, "v_2") :: Row("+U", 2, "v_2_new") :: Row("+I", 3, "v_3") :: Nil
+ checkAnswer(currentResult(), expertResult4)
+ } finally {
+ readStream.stop()
+ }
+ }
+ }
+
+ test("Paimon CDC Source: streaming read change-log with audit_log system table") {
+ withTable("T") {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(
+ s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a','bucket'='2', 'changelog-producer' = 'lookup')
+ |""".stripMargin)
+
+ val readStream = spark.readStream
+ .format("paimon")
+ .table("`T$audit_log`")
+ .writeStream
+ .format("memory")
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .queryName("mem_table")
+ .outputMode("append")
+ .start()
+
+ val currentResult = () => spark.sql("SELECT * FROM mem_table")
+ try {
+ spark.sql(s"INSERT INTO T VALUES (1, 'v_1')")
+ readStream.processAllAvailable()
+ checkAnswer(currentResult(), Row("+I", 1, "v_1") :: Nil)
+
+ spark.sql(s"INSERT INTO T VALUES (2, 'v_2')")
+ readStream.processAllAvailable()
+ checkAnswer(currentResult(), Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Nil)
+ } finally {
+ readStream.stop()
+ }
+ }
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala
new file mode 100644
index 000000000000..9935288db9a7
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala
@@ -0,0 +1,365 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark
+
+import org.apache.paimon.Snapshot.CommitKind._
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.functions.{col, mean, window}
+import org.apache.spark.sql.streaming.StreamTest
+
+import java.sql.Date
+
+class PaimonSinkTest extends PaimonSparkTestBase with StreamTest {
+
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.sql.catalog.paimon.cache-enabled", "false")
+ }
+
+ import testImplicits._
+
+ test("Paimon Sink: forEachBatch") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and test `forEachBatch` api
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], id: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Sink: append mode") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and sink into it in append mode
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .format("paimon")
+ .start(location)
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Sink: complete mode") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define an append-only table and sink into it in complete mode
+ spark.sql(s"""
+ |CREATE TABLE T (city String, population Long)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData.toDS
+ .toDF("uid", "city")
+ .groupBy("city")
+ .count()
+ .toDF("city", "population")
+ .writeStream
+ .outputMode("complete")
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .format("paimon")
+ .start(location)
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY city")
+
+ try {
+ inputData.addData((1, "HZ"), (2, "BJ"), (3, "BJ"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row("BJ", 2L) :: Row("HZ", 1L) :: Nil)
+
+ inputData.addData((4, "SH"), (5, "BJ"), (6, "HZ"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row("BJ", 3L) :: Row("HZ", 2L) :: Row("SH", 1L) :: Nil)
+
+ inputData.addData((7, "HZ"), (8, "SH"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row("BJ", 3L) :: Row("HZ", 3L) :: Row("SH", 2L) :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Sink: update mode") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and sink into it in update mode
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ intercept[RuntimeException] {
+ inputData
+ .toDF()
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .outputMode("update")
+ .format("paimon")
+ .start(location)
+ }
+ }
+ }
+ }
+
+ test("Paimon Sink: aggregation and watermark") {
+ withTempDir {
+ checkpointDir =>
+ // define an append-only table and sink into it with aggregation and watermark in append mode
+ spark.sql(s"""
+ |CREATE TABLE T (start Timestamp, stockId INT, avg_price DOUBLE)
+ |TBLPROPERTIES ('bucket'='3', 'bucket-key'='stockId')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Long, Int, Double)]
+ val data = inputData.toDS
+ .toDF("time", "stockId", "price")
+ .selectExpr("CAST(time AS timestamp) AS timestamp", "stockId", "price")
+ .withWatermark("timestamp", "10 seconds")
+ .groupBy(window($"timestamp", "5 seconds"), col("stockId"))
+ .agg(mean("price").as("avg_price"))
+ .select("window.start", "stockId", "avg_price")
+
+ val stream =
+ data.writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .format("paimon")
+ .start(location)
+
+ val query = () =>
+ spark.sql(
+ "SELECT CAST(start as BIGINT) AS start, stockId, avg_price FROM T ORDER BY start, stockId")
+
+ try {
+ inputData.addData((101L, 1, 1.0d), (102, 1, 2.0d), (104, 2, 20.0d))
+ stream.processAllAvailable()
+ inputData.addData((105L, 2, 40.0d), (107, 2, 60.0d), (115, 3, 300.0d))
+ stream.processAllAvailable()
+ inputData.addData((200L, 99, 99.9d))
+ stream.processAllAvailable()
+ checkAnswer(
+ query(),
+ Row(100L, 1, 1.5d) :: Row(100L, 2, 20.0d) :: Row(105L, 2, 50.0d) :: Row(
+ 115L,
+ 3,
+ 300.0d) :: Nil)
+ } finally {
+ if (stream != null) {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Sink: enable schema evolution") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and sink into it with schema evolution in append mode
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val date = Date.valueOf("2023-08-10")
+ spark.sql("INSERT INTO T VALUES (1, '2023-08-09'), (2, '2023-08-09')")
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(1, "2023-08-09") :: Row(2, "2023-08-09") :: Nil)
+
+ val inputData = MemoryStream[(Long, Date, Int)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b", "c")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .option("write.merge-schema", "true")
+ .option("write.merge-schema.explicit-cast", "true")
+ .format("paimon")
+ .start(location)
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ inputData.addData((1L, date, 123), (3L, date, 456))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row(1L, date, 123) :: Row(2L, Date.valueOf("2023-08-09"), null) :: Row(
+ 3L,
+ date,
+ 456) :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+  test("Paimon Sink: set full-compaction.delta-commits with batch write") {
+ for (useV2Write <- Seq("true", "false")) {
+ withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) {
+ withTable("t") {
+ sql("""
+ |CREATE TABLE t (
+ | a INT,
+ | b INT
+ |) TBLPROPERTIES (
+ | 'primary-key'='a',
+ | 'bucket'='1',
+ | 'full-compaction.delta-commits'='1'
+ |)
+ |""".stripMargin)
+
+ sql("INSERT INTO t VALUES (1, 1)")
+ sql("INSERT INTO t VALUES (2, 2)")
+ checkAnswer(sql("SELECT * FROM t ORDER BY a"), Seq(Row(1, 1), Row(2, 2)))
+ assert(loadTable("t").snapshotManager().latestSnapshot().commitKind == COMPACT)
+ }
+ }
+ }
+ }
+
+  test("Paimon Sink: set full-compaction.delta-commits with streaming write") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b INT)
+ |TBLPROPERTIES (
+ | 'primary-key'='a',
+ | 'bucket'='1',
+ | 'full-compaction.delta-commits'='2'
+ |)
+ |""".stripMargin)
+ val table = loadTable("T")
+ val location = table.location().toString
+
+ val inputData = MemoryStream[(Int, Int)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .format("paimon")
+ .start(location)
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ inputData.addData((1, 1))
+ stream.processAllAvailable()
+ checkAnswer(query(), Seq(Row(1, 1)))
+ assert(table.snapshotManager().latestSnapshot().commitKind == APPEND)
+
+ inputData.addData((2, 1))
+ stream.processAllAvailable()
+ checkAnswer(query(), Seq(Row(1, 1), Row(2, 1)))
+ assert(table.snapshotManager().latestSnapshot().commitKind == COMPACT)
+
+ inputData.addData((2, 2))
+ stream.processAllAvailable()
+ checkAnswer(query(), Seq(Row(1, 1), Row(2, 2)))
+ assert(table.snapshotManager().latestSnapshot().commitKind == APPEND)
+
+ inputData.addData((3, 1))
+ stream.processAllAvailable()
+ checkAnswer(query(), Seq(Row(1, 1), Row(2, 2), Row(3, 1)))
+ assert(table.snapshotManager().latestSnapshot().commitKind == COMPACT)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala
new file mode 100644
index 000000000000..3208609835f1
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark
+
+import org.apache.paimon.catalog.{Catalog, Identifier}
+import org.apache.paimon.data.GenericRow
+import org.apache.paimon.fs.FileIO
+import org.apache.paimon.fs.local.LocalFileIO
+import org.apache.paimon.spark.catalog.WithPaimonCatalog
+import org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions
+import org.apache.paimon.spark.sql.{SparkVersionSupport, WithTableOptions}
+import org.apache.paimon.table.FileStoreTable
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.connector.catalog.{Identifier => SparkIdentifier}
+import org.apache.spark.sql.connector.read.Scan
+import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.paimon.Utils
+import org.apache.spark.sql.test.SharedSparkSession
+
+import java.io.File
+import java.util.{TimeZone, UUID}
+
+import scala.util.Random
+
+class PaimonSparkTestBase
+ extends QueryTest
+ with SharedSparkSession
+ with WithTableOptions
+ with SparkVersionSupport {
+
+ protected lazy val commitUser: String = UUID.randomUUID.toString
+
+ protected lazy val fileIO: FileIO = LocalFileIO.create
+
+ protected lazy val tempDBDir: File = Utils.createTempDir
+
+ protected def paimonCatalog: Catalog = {
+ spark.sessionState.catalogManager.currentCatalog.asInstanceOf[WithPaimonCatalog].paimonCatalog()
+ }
+
+ protected val dbName0: String = "test"
+
+ protected val tableName0: String = "T"
+
+ /** Add paimon ([[SparkCatalog]] in fileSystem) catalog */
+ override protected def sparkConf: SparkConf = {
+ val serializer = if (Random.nextBoolean()) {
+ "org.apache.spark.serializer.KryoSerializer"
+ } else {
+ "org.apache.spark.serializer.JavaSerializer"
+ }
+ super.sparkConf
+ .set("spark.sql.warehouse.dir", tempDBDir.getCanonicalPath)
+ .set("spark.sql.catalog.paimon", classOf[SparkCatalog].getName)
+ .set("spark.sql.catalog.paimon.warehouse", tempDBDir.getCanonicalPath)
+ .set("spark.sql.extensions", classOf[PaimonSparkSessionExtensions].getName)
+ .set("spark.serializer", serializer)
+ }
+
+ override protected def beforeAll(): Unit = {
+ super.beforeAll()
+ spark.sql(s"USE paimon")
+ spark.sql(s"CREATE DATABASE IF NOT EXISTS paimon.$dbName0")
+ spark.sql(s"USE paimon.$dbName0")
+ }
+
+ override protected def afterAll(): Unit = {
+ try {
+ spark.sql(s"USE paimon")
+ spark.sql(s"DROP TABLE IF EXISTS $dbName0.$tableName0")
+ spark.sql("USE default")
+ spark.sql(s"DROP DATABASE paimon.$dbName0 CASCADE")
+ } finally {
+ super.afterAll()
+ }
+ }
+
+ /** Default is paimon catalog */
+ override protected def beforeEach(): Unit = {
+    super.beforeEach()
+ spark.sql(s"USE paimon")
+ spark.sql(s"USE paimon.$dbName0")
+ spark.sql(s"DROP TABLE IF EXISTS $tableName0")
+ }
+
+ protected def withTempDirs(f: (File, File) => Unit): Unit = {
+ withTempDir(file1 => withTempDir(file2 => f(file1, file2)))
+ }
+
+ protected def withTimeZone(timeZone: String)(f: => Unit): Unit = {
+ withSparkSQLConf("spark.sql.session.timeZone" -> timeZone) {
+ val originTimeZone = TimeZone.getDefault
+ try {
+ TimeZone.setDefault(TimeZone.getTimeZone(timeZone))
+ f
+ } finally {
+ TimeZone.setDefault(originTimeZone)
+ }
+ }
+ }
+
+ // Since SPARK-46227 has changed the definition of withSQLConf that resulted in
+ // incompatibility between the Spark3.x and Spark4.x, So Paimon declare a separate method
+ // to provide the same function.
+ protected def withSparkSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
+ withSparkSQLConf0(pairs: _*)(f)
+ }
+
+ private def withSparkSQLConf0(pairs: (String, String)*)(f: => Unit): Unit = {
+ val conf = SQLConf.get
+ val (keys, values) = pairs.unzip
+ val currentValues = keys.map {
+ key =>
+ if (conf.contains(key)) {
+ Some(conf.getConfString(key))
+ } else {
+ None
+ }
+ }
+ (keys, values).zipped.foreach {
+ (k, v) =>
+ if (SQLConf.isStaticConfigKey(k)) {
+ throw new RuntimeException(s"Cannot modify the value of a static config: $k")
+ }
+ conf.setConfString(k, v)
+ }
+ try f
+ finally {
+ keys.zip(currentValues).foreach {
+ case (key, Some(value)) => conf.setConfString(key, value)
+ case (key, None) => conf.unsetConf(key)
+ }
+ }
+ }
+
+ def loadTable(tableName: String): FileStoreTable = {
+ loadTable(dbName0, tableName)
+ }
+
+ def loadTable(dbName: String, tableName: String): FileStoreTable = {
+ paimonCatalog.getTable(Identifier.create(dbName, tableName)).asInstanceOf[FileStoreTable]
+ }
+
+ protected def createRelationV2(tableName: String): DataSourceV2Relation = {
+ val sparkTable = SparkTable(loadTable(tableName))
+ DataSourceV2Relation.create(
+ sparkTable,
+ Some(spark.sessionState.catalogManager.currentCatalog),
+ Some(SparkIdentifier.of(Array(this.dbName0), tableName))
+ )
+ }
+
+ def getScan(sqlText: String): Scan = {
+ sql(sqlText).queryExecution.optimizedPlan
+ .collectFirst { case relation: DataSourceV2ScanRelation => relation }
+ .get
+ .scan
+ }
+
+ protected def getPaimonScan(sqlText: String): PaimonScan = {
+ getScan(sqlText).asInstanceOf[PaimonScan]
+ }
+
+ protected def getFormatTableScan(sqlText: String): PaimonFormatTableScan = {
+ getScan(sqlText).asInstanceOf[PaimonFormatTableScan]
+ }
+
+ object GenericRow {
+ def of(values: Any*): GenericRow = {
+ val row = new GenericRow(values.length)
+ values.zipWithIndex.foreach {
+ case (value, index) =>
+ row.setField(index, value)
+ }
+ row
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala
new file mode 100644
index 000000000000..df1df747897d
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+class AlterBranchProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+ test("Paimon Procedure: alter schema structure and test $branch syntax.") {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and test `forEachBatch` api
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ val table = loadTable("T")
+ val branchManager = table.branchManager()
+
+ // create branch with tag
+ checkAnswer(
+ spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 's_2', snapshot => 2)"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_branch(table => 'test.T', branch => 'snapshot_branch', tag => 's_2')"),
+ Row(true) :: Nil)
+ assert(branchManager.branchExists("snapshot_branch"))
+
+ spark.sql("INSERT INTO T VALUES (1, 'APPLE'), (2,'DOG'), (2, 'horse')")
+ spark.sql("ALTER TABLE `T$branch_snapshot_branch` ADD COLUMNS(c INT)")
+ spark.sql(
+ "INSERT INTO `T$branch_snapshot_branch` VALUES " + "(1,'cherry', 100), (2,'bird', 200), (3, 'wolf', 400)")
+
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(1, "APPLE") :: Row(2, "horse") :: Nil)
+ checkAnswer(
+ spark.sql("SELECT * FROM `T$branch_snapshot_branch` ORDER BY a, b,c"),
+ Row(1, "cherry", 100) :: Row(2, "bird", 200) :: Row(3, "wolf", 400) :: Nil)
+ assert(branchManager.branchExists("snapshot_branch"))
+      } finally stream.stop()
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala
new file mode 100644
index 000000000000..111e604b1ef0
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala
@@ -0,0 +1,184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+class BranchProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+ test("Paimon Procedure: create, query, write and delete branch") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and test `forEachBatch` api
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ // create tags
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_tag', snapshot => 2)"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"),
+ Row("test_tag") :: Nil)
+
+ // create branch with tag
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_branch(table => 'test.T', branch => 'test_branch', tag => 'test_tag')"),
+ Row(true) :: Nil)
+ val table = loadTable("T")
+ val branchManager = table.branchManager()
+ assert(branchManager.branchExists("test_branch"))
+
+ // query from branch
+ checkAnswer(
+ spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a"),
+ Row(1, "a") :: Row(2, "b") :: Nil
+ )
+ checkAnswer(
+ spark.read.format("paimon").option("branch", "test_branch").table("T").orderBy("a"),
+ Row(1, "a") :: Row(2, "b") :: Nil
+ )
+
+ // update branch
+ spark.sql("INSERT INTO `T$branch_test_branch` VALUES (3, 'c')")
+ checkAnswer(
+ spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a"),
+ Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil
+ )
+ // create tags
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(table => 'test.`T$branch_test_branch`', tag => 'test_tag2', snapshot => 3)"),
+ Row(true) :: Nil)
+
+ // create branch from another branch.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_branch(table => 'test.`T$branch_test_branch`', branch => 'test_branch2', tag => 'test_tag2')"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql("SELECT * FROM `T$branch_test_branch2` ORDER BY a"),
+ Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil
+ )
+
+ // create empty branch
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_branch(table => 'test.T', branch => 'empty_branch')"),
+ Row(true) :: Nil)
+ assert(branchManager.branchExists("empty_branch"))
+ checkAnswer(
+ spark.sql("SELECT * FROM `T$branch_empty_branch` ORDER BY a"),
+ Nil
+ )
+
+ // delete branch
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.delete_branch(table => 'test.T', branch => 'test_branch')"),
+ Row(true) :: Nil)
+ assert(!branchManager.branchExists("test_branch"))
+ intercept[Exception] {
+ spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a")
+ }
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Branch: read with scan.fallback-branch") {
+ withTable("T") {
+ sql("""
+ |CREATE TABLE T (
+ | dt STRING NOT NULL,
+ | name STRING NOT NULL,
+ | amount BIGINT
+ |) PARTITIONED BY (dt)
+ |""".stripMargin)
+
+ sql("ALTER TABLE T SET TBLPROPERTIES ('k1' = 'v1')")
+ sql("ALTER TABLE T SET TBLPROPERTIES ('k2' = 'v2')")
+
+ sql("CALL sys.create_branch('test.T', 'test')")
+ sql("ALTER TABLE T SET TBLPROPERTIES ('scan.fallback-branch' = 'test')")
+
+ sql(
+ "INSERT INTO `T$branch_test` VALUES ('20240725', 'apple', 4), ('20240725', 'peach', 10), ('20240726', 'cherry', 3), ('20240726', 'pear', 6)")
+ sql("INSERT INTO T VALUES ('20240725', 'apple', 5), ('20240725', 'banana', 7)")
+
+ checkAnswer(
+ sql("SELECT * FROM T ORDER BY amount"),
+ Seq(
+ Row("20240726", "cherry", 3),
+ Row("20240725", "apple", 5),
+ Row("20240726", "pear", 6),
+ Row("20240725", "banana", 7))
+ )
+
+ sql("ALTER TABLE T UNSET TBLPROPERTIES ('scan.fallback-branch')")
+ checkAnswer(
+ sql("SELECT * FROM T ORDER BY amount"),
+ Seq(Row("20240725", "apple", 5), Row("20240725", "banana", 7)))
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala
new file mode 100644
index 000000000000..322d50a62127
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+class CompactProcedureTest extends CompactProcedureTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala
new file mode 100644
index 000000000000..19f6bc25280e
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala
@@ -0,0 +1,1324 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.Snapshot.CommitKind
+import org.apache.paimon.fs.Path
+import org.apache.paimon.spark.PaimonSparkTestBase
+import org.apache.paimon.spark.utils.SparkProcedureUtils
+import org.apache.paimon.table.FileStoreTable
+import org.apache.paimon.table.source.DataSplit
+
+import org.apache.spark.scheduler.{SparkListener, SparkListenerStageSubmitted}
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+import org.assertj.core.api.Assertions
+import org.scalatest.time.Span
+
+import java.util
+
+import scala.collection.JavaConverters._
+import scala.util.Random
+
+/** Test compact procedure. See [[CompactProcedure]]. */
+abstract class CompactProcedureTestBase extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ // ----------------------- Minor Compact -----------------------
+
+ test("Paimon Procedure: compact aware bucket pk table with minor compact strategy") {
+ withTable("T") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='1', 'write-only'='true')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')")
+ spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')")
+
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.APPEND)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(2)
+
+ spark.sql(
+ "CALL sys.compact(table => 'T', compact_strategy => 'minor'," +
+ "options => 'num-sorted-run.compaction-trigger=3')")
+
+      // Because 'num-sorted-run.compaction-trigger' is set to 3, the compaction is not
+      // performed.
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.APPEND)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(2)
+
+      // Make partition p1 have 3 data files and partition p2 have 2 data files, so p2 will not be
+      // picked out to compact.
+ spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1')")
+
+ spark.sql(
+ "CALL sys.compact(table => 'T', compact_strategy => 'minor'," +
+ "options => 'num-sorted-run.compaction-trigger=3')")
+
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4)
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+
+ val splits = table.newSnapshotReader.read.dataSplits
+ splits.forEach(
+ split => {
+ Assertions
+ .assertThat(split.dataFiles.size)
+ .isEqualTo(if (split.partition().getString(0).toString == "p2") 2 else 1)
+ })
+ }
+ }
+
+ // ----------------------- Sort Compact -----------------------
+
+ test("Paimon Procedure: sort compact") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b INT)
+ |TBLPROPERTIES ('bucket'='-1')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, Int)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // test zorder sort
+ inputData.addData((0, 0))
+ inputData.addData((0, 1))
+ inputData.addData((0, 2))
+ inputData.addData((1, 0))
+ inputData.addData((1, 1))
+ inputData.addData((1, 2))
+ inputData.addData((2, 0))
+ inputData.addData((2, 1))
+ inputData.addData((2, 2))
+ stream.processAllAvailable()
+
+ val result = new util.ArrayList[Row]()
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ result.add(Row(a, b))
+ }
+ }
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.compact(table => 'T', order_strategy => 'zorder', order_by => 'a,b')"),
+ Row(true) :: Nil)
+
+ val result2 = new util.ArrayList[Row]()
+ result2.add(0, Row(0, 0))
+ result2.add(1, Row(0, 1))
+ result2.add(2, Row(1, 0))
+ result2.add(3, Row(1, 1))
+ result2.add(4, Row(0, 2))
+ result2.add(5, Row(1, 2))
+ result2.add(6, Row(2, 0))
+ result2.add(7, Row(2, 1))
+ result2.add(8, Row(2, 2))
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2)
+
+ // test hilbert sort
+ val result3 = new util.ArrayList[Row]()
+ result3.add(0, Row(0, 0))
+ result3.add(1, Row(0, 1))
+ result3.add(2, Row(1, 1))
+ result3.add(3, Row(1, 0))
+ result3.add(4, Row(2, 0))
+ result3.add(5, Row(2, 1))
+ result3.add(6, Row(2, 2))
+ result3.add(7, Row(1, 2))
+ result3.add(8, Row(0, 2))
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.compact(table => 'T', order_strategy => 'hilbert', order_by => 'a,b')"),
+ Row(true) :: Nil)
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3)
+
+ // test order sort
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.compact(table => 'T', order_strategy => 'order', order_by => 'a,b')"),
+ Row(true) :: Nil)
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: sort compact with partition") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (p INT, a INT, b INT)
+ |TBLPROPERTIES ('bucket'='-1')
+ |PARTITIONED BY (p)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, Int, Int)]
+ val stream = inputData
+ .toDS()
+ .toDF("p", "a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query0 = () => spark.sql("SELECT * FROM T WHERE p=0")
+ val query1 = () => spark.sql("SELECT * FROM T WHERE p=1")
+
+ try {
+ // test zorder sort
+ inputData.addData((0, 0, 0))
+ inputData.addData((0, 0, 1))
+ inputData.addData((0, 0, 2))
+ inputData.addData((0, 1, 0))
+ inputData.addData((0, 1, 1))
+ inputData.addData((0, 1, 2))
+ inputData.addData((0, 2, 0))
+ inputData.addData((0, 2, 1))
+ inputData.addData((0, 2, 2))
+
+ inputData.addData((1, 0, 0))
+ inputData.addData((1, 0, 1))
+ inputData.addData((1, 0, 2))
+ inputData.addData((1, 1, 0))
+ inputData.addData((1, 1, 1))
+ inputData.addData((1, 1, 2))
+ inputData.addData((1, 2, 0))
+ inputData.addData((1, 2, 1))
+ inputData.addData((1, 2, 2))
+ stream.processAllAvailable()
+
+ val result0 = new util.ArrayList[Row]()
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ result0.add(Row(0, a, b))
+ }
+ }
+ val result1 = new util.ArrayList[Row]()
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ result1.add(Row(1, a, b))
+ }
+ }
+ Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result0)
+ Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1)
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'zorder', order_by => 'a,b')"),
+ Row(true) :: Nil)
+
+ val result2 = new util.ArrayList[Row]()
+ result2.add(0, Row(0, 0, 0))
+ result2.add(1, Row(0, 0, 1))
+ result2.add(2, Row(0, 1, 0))
+ result2.add(3, Row(0, 1, 1))
+ result2.add(4, Row(0, 0, 2))
+ result2.add(5, Row(0, 1, 2))
+ result2.add(6, Row(0, 2, 0))
+ result2.add(7, Row(0, 2, 1))
+ result2.add(8, Row(0, 2, 2))
+
+ Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result2)
+ Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1)
+
+ // test hilbert sort
+ val result3 = new util.ArrayList[Row]()
+ result3.add(0, Row(0, 0, 0))
+ result3.add(1, Row(0, 0, 1))
+ result3.add(2, Row(0, 1, 1))
+ result3.add(3, Row(0, 1, 0))
+ result3.add(4, Row(0, 2, 0))
+ result3.add(5, Row(0, 2, 1))
+ result3.add(6, Row(0, 2, 2))
+ result3.add(7, Row(0, 1, 2))
+ result3.add(8, Row(0, 0, 2))
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'hilbert', order_by => 'a,b')"),
+ Row(true) :: Nil)
+
+ Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result3)
+ Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1)
+
+ // test order sort
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'order', order_by => 'a,b')"),
+ Row(true) :: Nil)
+ Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result0)
+ Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: sort compact with multi-partitions") {
+ Seq("order", "zorder").foreach {
+ orderStrategy =>
+ {
+ withTable("T") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, pt STRING)
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ spark.sql(s"""INSERT INTO T VALUES
+ |(1, 'p1'), (3, 'p1'),
+ |(1, 'p2'), (4, 'p2'),
+ |(3, 'p3'), (2, 'p3'),
+ |(1, 'p4'), (2, 'p4')
+ |""".stripMargin)
+
+ spark.sql(s"""INSERT INTO T VALUES
+ |(4, 'p1'), (2, 'p1'),
+ |(2, 'p2'), (3, 'p2'),
+ |(1, 'p3'), (4, 'p3'),
+ |(3, 'p4'), (4, 'p4')
+ |""".stripMargin)
+
+ checkAnswer(
+ spark.sql(
+ s"CALL sys.compact(table => 'T', order_strategy => '$orderStrategy', order_by => 'id')"),
+ Seq(true).toDF())
+
+ val result = List(Row(1), Row(2), Row(3), Row(4)).asJava
+ Seq("p1", "p2", "p3", "p4").foreach {
+ pt =>
+ Assertions
+ .assertThat(spark.sql(s"SELECT id FROM T WHERE pt='$pt'").collect())
+ .containsExactlyElementsOf(result)
+ }
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: sort compact with partition filter") {
+ withTable("t") {
+ sql("CREATE TABLE t (a INT, pt INT) PARTITIONED BY (pt)")
+ sql("INSERT INTO t VALUES (1, 1)")
+ sql("INSERT INTO t VALUES (2, 1)")
+ sql(
+ "CALL sys.compact(table => 't', order_strategy => 'order', where => 'pt = 1', order_by => 'a')")
+ val table = loadTable("t")
+ assert(table.latestSnapshot().get().commitKind.equals(CommitKind.OVERWRITE))
+ checkAnswer(sql("SELECT * FROM t ORDER BY a"), Seq(Row(1, 1), Row(2, 1)))
+ }
+ }
+
+ test("Paimon Procedure: compact for pk") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b INT)
+ |TBLPROPERTIES ('primary-key'='a,b', 'bucket'='1')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, Int)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ inputData.addData((0, 0))
+ inputData.addData((0, 1))
+ inputData.addData((0, 2))
+ inputData.addData((1, 0))
+ inputData.addData((1, 1))
+ inputData.addData((1, 2))
+ inputData.addData((2, 0))
+ inputData.addData((2, 1))
+ inputData.addData((2, 2))
+ stream.processAllAvailable()
+
+ val result = new util.ArrayList[Row]()
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ result.add(Row(a, b))
+ }
+ }
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: compact aware bucket pk table") {
+ Seq(1, -1).foreach(
+ bucket => {
+ withTable("T") {
+ spark.sql(
+ s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='$bucket', 'write-only'='true')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')")
+ spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')")
+
+ spark.sql("CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(3)
+
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4)
+
+ // compact condition no longer met
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4)
+
+ checkAnswer(
+ spark.sql(s"SELECT * FROM T ORDER BY id"),
+ Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") :: Nil)
+ }
+ })
+ }
+
+ test("Paimon Procedure: compact aware bucket pk table with many small files") {
+ Seq(3, -1).foreach(
+ bucket => {
+ withTable("T") {
+ spark.sql(
+ s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='$bucket', 'write-only'='true',
+ |'source.split.target-size'='128m','source.split.open-file-cost'='32m') -- simulate multiple splits in a single bucket
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ val count = 100
+ for (i <- 0 until count) {
+ spark.sql(s"INSERT INTO T VALUES ($i, 'a', 'p${i % 2}')")
+ }
+
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ checkAnswer(spark.sql(s"SELECT COUNT(*) FROM T"), Row(count) :: Nil)
+ }
+ })
+ }
+
+ test("Paimon Procedure: compact unaware bucket append table") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('bucket'='-1', 'write-only'='true', 'compaction.min.file-num'='2')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')")
+ spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')")
+ spark.sql(s"INSERT INTO T VALUES (5, 'e', 'p1'), (6, 'f', 'p2')")
+
+ spark.sql("CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4)
+
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5)
+
+ // compact condition no longer met
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5)
+
+ checkAnswer(
+ spark.sql(s"SELECT * FROM T ORDER BY id"),
+ Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") :: Row(
+ 5,
+ "e",
+ "p1") :: Row(6, "f", "p2") :: Nil)
+ }
+
+ test("Paimon Procedure: compact unaware bucket append table with many small files") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('bucket'='-1', 'write-only'='true')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ val count = 100
+ for (i <- 0 until count) {
+ spark.sql(s"INSERT INTO T VALUES ($i, 'a', 'p${i % 2}')")
+ }
+
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ checkAnswer(spark.sql(s"SELECT COUNT(*) FROM T"), Row(count) :: Nil)
+ }
+
+ test("Paimon Procedure: compact with wrong usage") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('bucket'='-1', 'write-only'='true')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ assert(intercept[IllegalArgumentException] {
+ spark.sql(
+ "CALL sys.compact(table => 'T', partitions => 'pt = \"p1\"', where => 'pt = \"p1\"')")
+ }.getMessage.contains("partitions and where cannot be used together"))
+
+ assert(intercept[IllegalArgumentException] {
+ spark.sql("CALL sys.compact(table => 'T', partitions => 'id = 1')")
+ }.getMessage.contains("Only partition predicate is supported"))
+
+ assert(intercept[IllegalArgumentException] {
+ spark.sql("CALL sys.compact(table => 'T', where => 'id > 1 AND pt = \"p1\"')")
+ }.getMessage.contains("Only partition predicate is supported"))
+
+ assert(intercept[IllegalArgumentException] {
+ spark.sql("CALL sys.compact(table => 'T', order_strategy => 'sort', order_by => 'pt')")
+ }.getMessage.contains("order_by should not contain partition cols"))
+
+ assert(intercept[IllegalArgumentException] {
+ spark.sql(
+ "CALL sys.compact(table => 'T', order_strategy => 'sort', order_by => 'id', partition_idle_time =>'5s')")
+ }.getMessage.contains("sort compact do not support 'partition_idle_time'"))
+ }
+
+ test("Paimon Procedure: compact with where") {
+ spark.sql(
+ s"""
+ |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT)
+ |TBLPROPERTIES ('bucket'='1', 'bucket-key'='id', 'write-only'='true', 'compaction.min.file-num'='1')
+ |PARTITIONED BY (dt, hh)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+ val fileIO = table.fileIO()
+
+ spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)")
+ spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)")
+ spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)")
+ spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)")
+
+ spark.sql("CALL sys.compact(table => 'T', where => 'dt = \"2024-01-01\" and hh >= 1')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions
+ .assertThat(
+ fileIO.listStatus(new Path(table.location(), "dt=2024-01-01/hh=0/bucket-0")).length)
+ .isEqualTo(2)
+ Assertions
+ .assertThat(
+ fileIO.listStatus(new Path(table.location(), "dt=2024-01-01/hh=1/bucket-0")).length)
+ .isEqualTo(3)
+ Assertions
+ .assertThat(
+ fileIO.listStatus(new Path(table.location(), "dt=2024-01-02/hh=0/bucket-0")).length)
+ .isEqualTo(2)
+ Assertions
+ .assertThat(
+ fileIO.listStatus(new Path(table.location(), "dt=2024-01-02/hh=1/bucket-0")).length)
+ .isEqualTo(2)
+ }
+
+ test("Paimon test: toWhere method in CompactProcedure") {
+ val conditions = "f0=0,f1=0,f2=0;f0=1,f1=1,f2=1;f0=1,f1=2,f2=2;f3=3"
+
+ val where = SparkProcedureUtils.toWhere(conditions)
+ val whereExpected =
+ "(f0=0 AND f1=0 AND f2=0) OR (f0=1 AND f1=1 AND f2=1) OR (f0=1 AND f1=2 AND f2=2) OR (f3=3)"
+
+ Assertions.assertThat(where).isEqualTo(whereExpected)
+ }
+
+ test("Paimon Procedure: compact unaware bucket append table with option") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING, pt STRING)
+ |TBLPROPERTIES ('bucket'='-1', 'write-only'='true')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')")
+ spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')")
+ spark.sql(s"INSERT INTO T VALUES (5, 'e', 'p1'), (6, 'f', 'p2')")
+
+ spark.sql(
+ "CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"', options => 'compaction.min.file-num=2')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4)
+
+ spark.sql("CALL sys.compact(table => 'T', options => 'compaction.min.file-num=2')")
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5)
+
+ // compact condition no longer met
+ spark.sql(s"CALL sys.compact(table => 'T')")
+ Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5)
+
+ checkAnswer(
+ spark.sql(s"SELECT * FROM T ORDER BY id"),
+ Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") ::
+ Row(5, "e", "p1") :: Row(6, "f", "p2") :: Nil)
+ }
+
+ test("Paimon Procedure: compact with partition_idle_time for pk table") {
+ Seq(1, -1).foreach(
+ bucket => {
+ withTable("T") {
+ val dynamicBucketArgs = if (bucket == -1) " ,'dynamic-bucket.initial-buckets'='1'" else ""
+ spark.sql(
+ s"""
+ |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT)
+ |TBLPROPERTIES ('primary-key'='id, dt, hh', 'bucket'='$bucket', 'write-only'='true'$dynamicBucketArgs)
+ |PARTITIONED BY (dt, hh)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)")
+ spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)")
+ spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)")
+ spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)")
+
+ Thread.sleep(10000);
+ spark.sql(s"INSERT INTO T VALUES (9, '9', '2024-01-01', 0), (10, '10', '2024-01-02', 0)")
+
+ spark.sql("CALL sys.compact(table => 'T', partition_idle_time => '10s')")
+ val dataSplits = table.newSnapshotReader.read.dataSplits.asScala.toList
+ Assertions
+ .assertThat(dataSplits.size)
+ .isEqualTo(4)
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ for (dataSplit: DataSplit <- dataSplits) {
+ if (dataSplit.partition().getInt(1) == 0) {
+ Assertions
+ .assertThat(dataSplit.dataFiles().size())
+ .isEqualTo(3)
+ } else {
+ Assertions
+ .assertThat(dataSplit.dataFiles().size())
+ .isEqualTo(1)
+ }
+ }
+ }
+ })
+
+ }
+
+ test("Paimon Procedure: compact with partition_idle_time for unaware bucket append table") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT)
+ |TBLPROPERTIES ('bucket'='-1', 'write-only'='true', 'compaction.min.file-num'='2')
+ |PARTITIONED BY (dt, hh)
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)")
+ spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)")
+ spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)")
+ spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)")
+
+ Thread.sleep(10000);
+ spark.sql(s"INSERT INTO T VALUES (9, '9', '2024-01-01', 0), (10, '10', '2024-01-02', 0)")
+
+ spark.sql("CALL sys.compact(table => 'T', partition_idle_time => '10s')")
+ val dataSplits = table.newSnapshotReader.read.dataSplits.asScala.toList
+ Assertions
+ .assertThat(dataSplits.size)
+ .isEqualTo(4)
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue
+ for (dataSplit: DataSplit <- dataSplits) {
+ if (dataSplit.partition().getInt(1) == 0) {
+ Assertions
+ .assertThat(dataSplit.dataFiles().size())
+ .isEqualTo(3)
+ } else {
+ Assertions
+ .assertThat(dataSplit.dataFiles().size())
+ .isEqualTo(1)
+ }
+ }
+ }
+
+ test("Paimon Procedure: test aware-bucket compaction read parallelism") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING)
+ |TBLPROPERTIES ('primary-key'='id', 'bucket'='3', 'write-only'='true')
+ |""".stripMargin)
+
+ val table = loadTable("T")
+ for (i <- 1 to 10) {
+ sql(s"INSERT INTO T VALUES ($i, '$i')")
+ }
+ assertResult(10)(table.snapshotManager().snapshotCount())
+
+ val buckets = table.newSnapshotReader().bucketEntries().asScala.map(_.bucket()).distinct.size
+ assertResult(3)(buckets)
+
+ val taskBuffer = scala.collection.mutable.ListBuffer.empty[Int]
+ val listener = new SparkListener {
+ override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
+ taskBuffer += stageSubmitted.stageInfo.numTasks
+ }
+ }
+
+ try {
+ spark.sparkContext.addSparkListener(listener)
+
+      // spark.default.parallelism cannot be changed within a spark session
+ // sparkParallelism is 2, bucket is 3, use 2 as the read parallelism
+ spark.conf.set("spark.sql.shuffle.partitions", 2)
+ spark.sql("CALL sys.compact(table => 'T')")
+
+ // sparkParallelism is 5, bucket is 3, use 3 as the read parallelism
+ spark.conf.set("spark.sql.shuffle.partitions", 5)
+ spark.sql("CALL sys.compact(table => 'T')")
+
+ assertResult(Seq(2, 3))(taskBuffer)
+ } finally {
+ spark.sparkContext.removeSparkListener(listener)
+ }
+ }
+
+ test("Paimon Procedure: test unaware-bucket compaction read parallelism") {
+ spark.sql(s"""
+ |CREATE TABLE T (id INT, value STRING)
+ |TBLPROPERTIES ('bucket'='-1', 'write-only'='true')
+ |""".stripMargin)
+
+ val table = loadTable("T")
+ for (i <- 1 to 12) {
+ sql(s"INSERT INTO T VALUES ($i, '$i')")
+ }
+ assertResult(12)(table.snapshotManager().snapshotCount())
+
+ val buckets = table.newSnapshotReader().bucketEntries().asScala.map(_.bucket()).distinct.size
+ // only has bucket-0
+ assertResult(1)(buckets)
+
+ val taskBuffer = scala.collection.mutable.ListBuffer.empty[Int]
+ val listener = new SparkListener {
+ override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
+ taskBuffer += stageSubmitted.stageInfo.numTasks
+ }
+ }
+
+ try {
+ spark.sparkContext.addSparkListener(listener)
+
+      // spark.default.parallelism cannot be changed within a spark session
+ // sparkParallelism is 2, task groups is 6, use 2 as the read parallelism
+ spark.conf.set("spark.sql.shuffle.partitions", 2)
+ spark.sql(
+ "CALL sys.compact(table => 'T', options => 'source.split.open-file-cost=3200M, compaction.min.file-num=2')")
+
+      // sparkParallelism is 5, task groups is 3, use 3 as the read parallelism
+ spark.conf.set("spark.sql.shuffle.partitions", 5)
+ spark.sql(
+ "CALL sys.compact(table => 'T', options => 'source.split.open-file-cost=3200M, compaction.min.file-num=2')")
+
+ assertResult(Seq(2, 3))(taskBuffer)
+ } finally {
+ spark.sparkContext.removeSparkListener(listener)
+ }
+ }
+
+ test("Paimon Procedure: type cast in where") {
+ withTable("t") {
+ sql("""
+ |CREATE TABLE t (id INT, value STRING, day_part LONG)
+ |TBLPROPERTIES ('compaction.min.file-num'='2')
+ |PARTITIONED BY (day_part)
+ |""".stripMargin)
+ sql("INSERT INTO t VALUES (1, 'a', 20250810)")
+ sql("INSERT INTO t VALUES (2, 'b', 20250810)")
+ sql("INSERT INTO t VALUES (3, 'c', 20250811)")
+
+ sql("CALL sys.compact(table => 't', where => 'day_part < 20250811 and day_part > 20250809')")
+ val table = loadTable("t")
+ assert(table.snapshotManager().latestSnapshot().commitKind().equals(CommitKind.COMPACT))
+ }
+ }
+
+ test("Paimon Procedure: cluster for unpartitioned table") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(
+ s"""
+ |CREATE TABLE T (a INT, b INT, c STRING)
+ |TBLPROPERTIES ('bucket'='-1','num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b", "c")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ val random = new Random()
+ val randomStr = random.nextString(40)
+ // first write
+ inputData.addData((0, 0, randomStr))
+ inputData.addData((0, 1, randomStr))
+ inputData.addData((0, 2, randomStr))
+ inputData.addData((1, 0, randomStr))
+ inputData.addData((1, 1, randomStr))
+ inputData.addData((1, 2, randomStr))
+ inputData.addData((2, 0, randomStr))
+ inputData.addData((2, 1, randomStr))
+ inputData.addData((2, 2, randomStr))
+ stream.processAllAvailable()
+
+ val result = new util.ArrayList[Row]()
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ result.add(Row(a, b, randomStr))
+ }
+ }
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+
+ // first cluster, the outputLevel should be 5
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+
+ // first cluster result
+ val result2 = new util.ArrayList[Row]()
+ result2.add(0, Row(0, 0, randomStr))
+ result2.add(1, Row(0, 1, randomStr))
+ result2.add(2, Row(1, 0, randomStr))
+ result2.add(3, Row(1, 1, randomStr))
+ result2.add(4, Row(0, 2, randomStr))
+ result2.add(5, Row(1, 2, randomStr))
+ result2.add(6, Row(2, 0, randomStr))
+ result2.add(7, Row(2, 1, randomStr))
+ result2.add(8, Row(2, 2, randomStr))
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2)
+
+ var clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5)
+
+ // second write
+ inputData.addData((0, 3, null), (1, 3, null), (2, 3, null))
+ inputData.addData((3, 0, null), (3, 1, null), (3, 2, null), (3, 3, null))
+ stream.processAllAvailable()
+
+ val result3 = new util.ArrayList[Row]()
+ result3.addAll(result2)
+ for (a <- 0 until 3) {
+ result3.add(Row(a, 3, null))
+ }
+ for (b <- 0 until 4) {
+ result3.add(Row(3, b, null))
+ }
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3)
+
+ // second cluster, the outputLevel should be 4
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+ // second cluster result, level-5 and level-4 are individually ordered
+ val result4 = new util.ArrayList[Row]()
+ result4.addAll(result2)
+ result4.add(Row(0, 3, null))
+ result4.add(Row(1, 3, null))
+ result4.add(Row(3, 0, null))
+ result4.add(Row(3, 1, null))
+ result4.add(Row(2, 3, null))
+ result4.add(Row(3, 2, null))
+ result4.add(Row(3, 3, null))
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4)
+
+ clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(2)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(1).level()).isEqualTo(4)
+
+ // full cluster
+ checkAnswer(
+ spark.sql("CALL paimon.sys.compact(table => 'T', compact_strategy => 'full')"),
+ Row(true) :: Nil)
+ val result5 = new util.ArrayList[Row]()
+ result5.add(Row(0, 0, randomStr))
+ result5.add(Row(0, 1, randomStr))
+ result5.add(Row(1, 0, randomStr))
+ result5.add(Row(1, 1, randomStr))
+ result5.add(Row(0, 2, randomStr))
+ result5.add(Row(0, 3, null))
+ result5.add(Row(1, 2, randomStr))
+ result5.add(Row(1, 3, null))
+ result5.add(Row(2, 0, randomStr))
+ result5.add(Row(2, 1, randomStr))
+ result5.add(Row(3, 0, null))
+ result5.add(Row(3, 1, null))
+ result5.add(Row(2, 2, randomStr))
+ result5.add(Row(2, 3, null))
+ result5.add(Row(3, 2, null))
+ result5.add(Row(3, 3, null))
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result5)
+
+ clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: cluster for partitioned table") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(
+ s"""
+ |CREATE TABLE T (a INT, b INT, c STRING, pt INT)
+ |PARTITIONED BY (pt)
+ |TBLPROPERTIES ('bucket'='-1', 'num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, Int, String, Int)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b", "c", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY pt")
+
+ try {
+ val random = new Random()
+ val randomStr = random.nextString(50)
+ // first write
+ for (pt <- 0 until 2) {
+ val c = if (pt == 0) randomStr else null
+ inputData.addData((0, 0, c, pt))
+ inputData.addData((0, 1, c, pt))
+ inputData.addData((0, 2, c, pt))
+ inputData.addData((1, 0, c, pt))
+ inputData.addData((1, 1, c, pt))
+ inputData.addData((1, 2, c, pt))
+ inputData.addData((2, 0, c, pt))
+ inputData.addData((2, 1, c, pt))
+ inputData.addData((2, 2, c, pt))
+ }
+ stream.processAllAvailable()
+
+ val result = new util.ArrayList[Row]()
+ for (pt <- 0 until 2) {
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ val c = if (pt == 0) randomStr else null
+ result.add(Row(a, b, c, pt))
+ }
+ }
+ }
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+
+ // first cluster, the outputLevel should be 5
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+
+ // first cluster result
+ val result2 = new util.ArrayList[Row]()
+ for (pt <- 0 until 2) {
+ val c = if (pt == 0) randomStr else null
+ result2.add(Row(0, 0, c, pt))
+ result2.add(Row(0, 1, c, pt))
+ result2.add(Row(1, 0, c, pt))
+ result2.add(Row(1, 1, c, pt))
+ result2.add(Row(0, 2, c, pt))
+ result2.add(Row(1, 2, c, pt))
+ result2.add(Row(2, 0, c, pt))
+ result2.add(Row(2, 1, c, pt))
+ result2.add(Row(2, 2, c, pt))
+ }
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2)
+
+ var clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(2)
+ dataSplits.forEach(
+ dataSplit => {
+ Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(1)
+ Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5)
+ })
+
+ // second write
+ for (pt <- 0 until 2) {
+ inputData.addData((0, 3, null, pt), (1, 3, null, pt), (2, 3, null, pt))
+ inputData.addData(
+ (3, 0, null, pt),
+ (3, 1, null, pt),
+ (3, 2, null, pt),
+ (3, 3, null, pt))
+ }
+ stream.processAllAvailable()
+
+ val result3 = new util.ArrayList[Row]()
+ for (pt <- 0 until 2) {
+ val c = if (pt == 0) randomStr else null
+ result3.add(Row(0, 0, c, pt))
+ result3.add(Row(0, 1, c, pt))
+ result3.add(Row(1, 0, c, pt))
+ result3.add(Row(1, 1, c, pt))
+ result3.add(Row(0, 2, c, pt))
+ result3.add(Row(1, 2, c, pt))
+ result3.add(Row(2, 0, c, pt))
+ result3.add(Row(2, 1, c, pt))
+ result3.add(Row(2, 2, c, pt))
+ for (a <- 0 until 3) {
+ result3.add(Row(a, 3, null, pt))
+ }
+ for (b <- 0 until 4) {
+ result3.add(Row(3, b, null, pt))
+ }
+ }
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3)
+
+ // second cluster
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+ val result4 = new util.ArrayList[Row]()
+          // for partition-0: only the level-0 files will be picked for clustering, outputLevel is 4
+ result4.add(Row(0, 0, randomStr, 0))
+ result4.add(Row(0, 1, randomStr, 0))
+ result4.add(Row(1, 0, randomStr, 0))
+ result4.add(Row(1, 1, randomStr, 0))
+ result4.add(Row(0, 2, randomStr, 0))
+ result4.add(Row(1, 2, randomStr, 0))
+ result4.add(Row(2, 0, randomStr, 0))
+ result4.add(Row(2, 1, randomStr, 0))
+ result4.add(Row(2, 2, randomStr, 0))
+ result4.add(Row(0, 3, null, 0))
+ result4.add(Row(1, 3, null, 0))
+ result4.add(Row(3, 0, null, 0))
+ result4.add(Row(3, 1, null, 0))
+ result4.add(Row(2, 3, null, 0))
+ result4.add(Row(3, 2, null, 0))
+ result4.add(Row(3, 3, null, 0))
+          // for partition-1: all files will be picked for clustering, outputLevel is 5
+ result4.add(Row(0, 0, null, 1))
+ result4.add(Row(0, 1, null, 1))
+ result4.add(Row(1, 0, null, 1))
+ result4.add(Row(1, 1, null, 1))
+ result4.add(Row(0, 2, null, 1))
+ result4.add(Row(0, 3, null, 1))
+ result4.add(Row(1, 2, null, 1))
+ result4.add(Row(1, 3, null, 1))
+ result4.add(Row(2, 0, null, 1))
+ result4.add(Row(2, 1, null, 1))
+ result4.add(Row(3, 0, null, 1))
+ result4.add(Row(3, 1, null, 1))
+ result4.add(Row(2, 2, null, 1))
+ result4.add(Row(2, 3, null, 1))
+ result4.add(Row(3, 2, null, 1))
+ result4.add(Row(3, 3, null, 1))
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4)
+
+ clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(2)
+ dataSplits.forEach(
+ dataSplit => {
+ if (dataSplit.partition().getInt(0) == 1) {
+ // partition-1
+ Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(1)
+ Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5)
+ } else {
+ // partition-0
+ Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(2)
+ Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5)
+ Assertions.assertThat(dataSplit.dataFiles().get(1).level()).isEqualTo(4)
+ }
+ })
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: cluster for partitioned table with partition filter") {
+ sql(
+ """
+ |CREATE TABLE T (a INT, b INT, pt INT)
+ |PARTITIONED BY (pt)
+ |TBLPROPERTIES (
+ | 'bucket'='-1', 'num-levels'='6', 'num-sorted-run.compaction-trigger'='2',
+ | 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true'
+ |)
+ |""".stripMargin)
+
+ sql("INSERT INTO T VALUES (0, 0, 0), (0, 0, 1)")
+ sql("INSERT INTO T VALUES (0, 1, 0), (0, 1, 1)")
+ sql("INSERT INTO T VALUES (0, 2, 0), (0, 2, 1)")
+ sql("INSERT INTO T VALUES (1, 0, 0), (1, 0, 1)")
+ sql("INSERT INTO T VALUES (1, 1, 0), (1, 1, 1)")
+ sql("INSERT INTO T VALUES (1, 2, 0), (1, 2, 1)")
+ sql("INSERT INTO T VALUES (2, 0, 0), (2, 0, 1)")
+ sql("INSERT INTO T VALUES (2, 1, 0), (2, 1, 1)")
+ sql("INSERT INTO T VALUES (2, 2, 0), (2, 2, 1)")
+
+ sql("CALL sys.compact(table => 'T', where => 'pt = 0')")
+ checkAnswer(
+ sql("select distinct partition, level from `T$files` order by partition"),
+ Seq(Row("{0}", 5), Row("{1}", 0))
+ )
+
+ sql("CALL sys.compact(table => 'T', where => 'pt = 1')")
+ checkAnswer(
+ sql("select distinct partition, level from `T$files` order by partition"),
+ Seq(Row("{0}", 5), Row("{1}", 5))
+ )
+ }
+
+ test("Paimon Procedure: cluster with deletion vectors") {
+ failAfter(Span(5, org.scalatest.time.Minutes)) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(
+ s"""
+ |CREATE TABLE T (a INT, b INT, c STRING)
+ |TBLPROPERTIES ('bucket'='-1', 'deletion-vectors.enabled'='true','num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b", "c")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ val random = new Random()
+ val randomStr = random.nextString(40)
+ // first write
+ inputData.addData((0, 0, randomStr))
+ inputData.addData((0, 1, randomStr))
+ inputData.addData((0, 2, randomStr))
+ inputData.addData((1, 0, randomStr))
+ inputData.addData((1, 1, randomStr))
+ inputData.addData((1, 2, randomStr))
+ inputData.addData((2, 0, randomStr))
+ inputData.addData((2, 1, randomStr))
+ inputData.addData((2, 2, randomStr))
+ stream.processAllAvailable()
+
+ val result = new util.ArrayList[Row]()
+ for (a <- 0 until 3) {
+ for (b <- 0 until 3) {
+ result.add(Row(a, b, randomStr))
+ }
+ }
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result)
+
+ // first cluster, the outputLevel should be 5
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+
+ // first cluster result
+ val result2 = new util.ArrayList[Row]()
+ result2.add(0, Row(0, 0, randomStr))
+ result2.add(1, Row(0, 1, randomStr))
+ result2.add(2, Row(1, 0, randomStr))
+ result2.add(3, Row(1, 1, randomStr))
+ result2.add(4, Row(0, 2, randomStr))
+ result2.add(5, Row(1, 2, randomStr))
+ result2.add(6, Row(2, 0, randomStr))
+ result2.add(7, Row(2, 1, randomStr))
+ result2.add(8, Row(2, 2, randomStr))
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2)
+
+ var clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5)
+
+ // second write
+ inputData.addData((0, 3, null), (1, 3, null), (2, 3, null))
+ inputData.addData((3, 0, null), (3, 1, null), (3, 2, null), (3, 3, null))
+ stream.processAllAvailable()
+
+            // delete (0,0), which is in the level-5 file
+ spark.sql("DELETE FROM T WHERE a=0 and b=0;").collect()
+            // delete (0,3), which is in a level-0 file
+ spark.sql("DELETE FROM T WHERE a=0 and b=3;").collect()
+
+ val result3 = new util.ArrayList[Row]()
+ result3.addAll(result2.subList(1, result2.size()))
+ for (a <- 1 until 3) {
+ result3.add(Row(a, 3, null))
+ }
+ for (b <- 0 until 4) {
+ result3.add(Row(3, b, null))
+ }
+
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3)
+
+ // second cluster, the outputLevel should be 4. dv index for level-0 will be updated
+ // and dv index for level-5 will be retained
+ checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil)
+ // second cluster result, level-5 and level-4 are individually ordered
+ val result4 = new util.ArrayList[Row]()
+ result4.addAll(result2.subList(1, result2.size()))
+ result4.add(Row(1, 3, null))
+ result4.add(Row(3, 0, null))
+ result4.add(Row(3, 1, null))
+ result4.add(Row(2, 3, null))
+ result4.add(Row(3, 2, null))
+ result4.add(Row(3, 3, null))
+ Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4)
+
+ clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(2)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5)
+ Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(0)).isNotNull
+ Assertions.assertThat(dataSplits.get(0).dataFiles().get(1).level()).isEqualTo(4)
+ Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(1)).isNull()
+
+ // full cluster
+ checkAnswer(
+ spark.sql("CALL paimon.sys.compact(table => 'T', compact_strategy => 'full')"),
+ Row(true) :: Nil)
+ clusteredTable = loadTable("T")
+ checkSnapshot(clusteredTable)
+ dataSplits = clusteredTable.newSnapshotReader().read().dataSplits()
+ Assertions.assertThat(dataSplits.size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1)
+ Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(0)).isNull()
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ def checkSnapshot(table: FileStoreTable): Unit = {
+ Assertions
+ .assertThat(table.latestSnapshot().get().commitKind().toString)
+ .isEqualTo(CommitKind.COMPACT.toString)
+ }
+
+ def lastSnapshotCommand(table: FileStoreTable): CommitKind = {
+ table.snapshotManager().latestSnapshot().commitKind()
+ }
+
+ def lastSnapshotId(table: FileStoreTable): Long = {
+ table.snapshotManager().latestSnapshotId()
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala
new file mode 100644
index 000000000000..605f80e27ad3
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+class CreateAndDeleteTagProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ test("Paimon Procedure: create and delete tag") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+          // define a change-log table and test the `foreachBatch` API
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(" +
+ "table => 'test.T', tag => 'test_tag', time_retained => '5 d', snapshot => 2)"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"),
+ Row("test_tag") :: Nil)
+ checkAnswer(
+ spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag')"),
+ Row(true) :: Nil)
+ checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil)
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_latestSnapshot_tag')"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"),
+ Row("test_latestSnapshot_tag") :: Nil)
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_latestSnapshot_tag')"),
+ Row(true) :: Nil)
+ checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil)
+
+ // create test_tag_1 and test_tag_2
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(" +
+ "table => 'test.T', tag => 'test_tag_1', snapshot => 1)"),
+ Row(true) :: Nil)
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(" +
+ "table => 'test.T', tag => 'test_tag_2', snapshot => 2)"),
+ Row(true) :: Nil)
+
+ checkAnswer(
+ spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"),
+ Row("test_tag_1") :: Row("test_tag_2") :: Nil)
+
+ // test rename_tag
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.rename_tag(table => 'test.T', tag => 'test_tag_1', target_tag => 'test_tag_3')"),
+ Row(true) :: Nil
+ )
+ checkAnswer(
+ spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"),
+ Row("test_tag_2") :: Row("test_tag_3") :: Nil)
+
+ // delete test_tag_2 and test_tag_3
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag_2,test_tag_3')"),
+ Row(true) :: Nil)
+
+ checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: create same tag with same snapshot") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+          // define a change-log table and test the `foreachBatch` API
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(" +
+ "table => 'test.T', tag => 'test_tag', snapshot => 1)"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql("SELECT count(*) FROM paimon.test.`T$tags` where tag_name = 'test_tag'"),
+ Row(1) :: Nil)
+
+ // throw exception "Tag test_tag already exists"
+ assertThrows[IllegalArgumentException] {
+ spark.sql(
+ "CALL paimon.sys.create_tag(" +
+ "table => 'test.T', tag => 'test_tag', time_retained => '5 d', snapshot => 1)")
+ }
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: delete tag not failed if tag not exists") {
+ spark.sql("CREATE TABLE T (id STRING, name STRING) USING PAIMON")
+
+ checkAnswer(
+ spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag')"),
+ Row(true) :: Nil)
+ }
+
+ test("Paimon Procedure: delete multiple tags") {
+ spark.sql("CREATE TABLE T (id INT, name STRING) USING PAIMON")
+ spark.sql("insert into T values (1, 'a')")
+
+ // create four tags
+ spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-1')")
+ spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-2')")
+ spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-3')")
+ spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-4')")
+ checkAnswer(spark.sql("SELECT count(*) FROM paimon.test.`T$tags`"), Row(4) :: Nil)
+
+ // multiple tags with no space
+ checkAnswer(
+ spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'tag-1,tag-2')"),
+ Row(true) :: Nil)
+ checkAnswer(
+ spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"),
+ Row("tag-3") :: Row("tag-4") :: Nil)
+
+ // multiple tags with space
+ checkAnswer(
+ spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'tag-3, tag-4')"),
+ Row(true) :: Nil)
+ checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil)
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala
new file mode 100644
index 000000000000..b9283d996cc6
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.utils.Range
+
+import scala.collection.JavaConverters._
+import scala.collection.immutable
+
+class CreateGlobalVectorIndexProcedureTest extends CreateGlobalIndexProcedureTest {
+ test("create lucene-vector-knn global index") {
+ withTable("T") {
+ spark.sql("""
+ |CREATE TABLE T (id INT, v ARRAY)
+ |TBLPROPERTIES (
+ | 'bucket' = '-1',
+ | 'global-index.row-count-per-shard' = '10000',
+ | 'row-tracking.enabled' = 'true',
+ | 'data-evolution.enabled' = 'true')
+ |""".stripMargin)
+
+ val values = (0 until 100)
+ .map(
+ i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))")
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ val output =
+ spark
+ .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')")
+ .collect()
+ .head
+
+ assert(output.getBoolean(0))
+
+ val table = loadTable("T")
+ val indexEntries = table
+ .store()
+ .newIndexFileHandler()
+ .scanEntries()
+ .asScala
+ .filter(_.indexFile().indexType() == "lucene-vector-knn")
+
+ assert(indexEntries.nonEmpty)
+ val totalRowCount = indexEntries.map(_.indexFile().rowCount()).sum
+ assert(totalRowCount == 100L)
+ }
+ }
+
+ test("create lucene-vector-knn global index with partition") {
+ withTable("T") {
+ spark.sql("""
+ |CREATE TABLE T (id INT, v ARRAY, pt STRING)
+ |TBLPROPERTIES (
+ | 'bucket' = '-1',
+ | 'global-index.row-count-per-shard' = '10000',
+ | 'row-tracking.enabled' = 'true',
+ | 'data-evolution.enabled' = 'true')
+ | PARTITIONED BY (pt)
+ |""".stripMargin)
+
+ var values = (0 until 65000)
+ .map(
+ i =>
+ s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p0')")
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ values = (0 until 35000)
+ .map(
+ i =>
+ s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p1')")
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ values = (0 until 22222)
+ .map(
+ i =>
+ s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p0')")
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ val output =
+ spark
+ .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')")
+ .collect()
+ .head
+
+ assert(output.getBoolean(0))
+
+ val table = loadTable("T")
+ val indexEntries = table
+ .store()
+ .newIndexFileHandler()
+ .scanEntries()
+ .asScala
+ .filter(_.indexFile().indexType() == "lucene-vector-knn")
+
+ assert(indexEntries.nonEmpty)
+ val totalRowCount = indexEntries.map(_.indexFile().rowCount()).sum
+ assert(totalRowCount == 122222L)
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala
new file mode 100644
index 000000000000..b4f7d63086ae
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+import org.apache.paimon.utils.SnapshotNotExistException
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+class CreateTagFromTimestampProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ test("Paimon Procedure: Create tags from snapshots commit-time ") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ try {
+
+ for (i <- 1 to 4) {
+ inputData.addData((i, "a"))
+ stream.processAllAvailable()
+ Thread.sleep(500L)
+ }
+
+ val table = loadTable("T")
+ val earliestCommitTime = table.snapshotManager.earliestSnapshot.timeMillis
+ val commitTime3 = table.snapshotManager.snapshot(3).timeMillis
+ val commitTime4 = table.snapshotManager.snapshot(4).timeMillis
+
+            // create tag from a timestamp that is earlier than the earliest snapshot commit time.
+ checkAnswer(
+ spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp(
+ |table => 'test.T',
+ | tag => 'test_tag',
+ | timestamp => ${earliestCommitTime - 1})""".stripMargin),
+ Row("test_tag", 1, earliestCommitTime, "null") :: Nil
+ )
+
+            // create tag from a timestamp that equals the snapshot-3 commit time.
+ checkAnswer(
+ spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp(
+ |table => 'test.T',
+ | tag => 'test_tag2',
+ | timestamp => $commitTime3)""".stripMargin),
+ Row("test_tag2", 3, commitTime3, "null") :: Nil
+ )
+
+            // create tag from a timestamp that is later than the snapshot-3 commit time.
+ checkAnswer(
+ spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp(
+ |table => 'test.T',
+ |tag => 'test_tag3',
+ |timestamp => ${commitTime3 + 1})""".stripMargin),
+ Row("test_tag3", 4, commitTime4, "null") :: Nil
+ )
+
+            // create tag from a timestamp that is later than the latest snapshot commit time; this should throw SnapshotNotExistException.
+ assertThrows[SnapshotNotExistException] {
+ spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp(
+ |table => 'test.T',
+ |tag => 'test_tag3',
+ |timestamp => ${Long.MaxValue})""".stripMargin)
+ }
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: Create tags from tags commit-time") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ try {
+ for (i <- 1 to 2) {
+ inputData.addData((i, "a"))
+ stream.processAllAvailable()
+ Thread.sleep(500L)
+ }
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(" +
+ "table => 'test.T', tag => 'test_tag', snapshot => 1)"),
+ Row(true) :: Nil)
+
+ val table = loadTable("T")
+ val latestCommitTime = table.snapshotManager.latestSnapshot().timeMillis
+ val tagsCommitTime = table.tagManager().getOrThrow("test_tag").timeMillis
+ assert(latestCommitTime > tagsCommitTime)
+
+ // make snapshot 1 expire.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 1, retain_min => 1)"),
+ Row(1) :: Nil)
+
+            // create tag from a timestamp that is earlier than the expired snapshot 1.
+ checkAnswer(
+ spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp(
+ |table => 'test.T',
+ | tag => 'test_tag1',
+ | timestamp => ${tagsCommitTime - 1})""".stripMargin),
+ Row("test_tag1", 1, tagsCommitTime, "null") :: Nil
+ )
+
+            // create tag from a timestamp that is later than the expired snapshot 1.
+ checkAnswer(
+ spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp(
+ |table => 'test.T',
+ |tag => 'test_tag2',
+ |timestamp => ${tagsCommitTime + 1})""".stripMargin),
+ Row("test_tag2", 2, latestCommitTime, "null") :: Nil
+ )
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala
new file mode 100644
index 000000000000..c7cdc0f517a7
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala
@@ -0,0 +1,760 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+import org.assertj.core.api.Assertions.assertThatThrownBy
+
+/** IT Case for [[ExpirePartitionsProcedure]]. */
+class ExpirePartitionsProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ test("Paimon Procedure: expire partitions") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1')
+ | PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // snapshot-1
+ inputData.addData(("a", "2024-06-01"))
+ stream.processAllAvailable()
+
+ // This partition never expires.
+ inputData.addData(("Never-expire", "9999-09-09"))
+ stream.processAllAvailable()
+
+ checkAnswer(query(), Row("a", "2024-06-01") :: Row("Never-expire", "9999-09-09") :: Nil)
+ // call expire_partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd')"),
+ Row("pt=2024-06-01") :: Nil
+ )
+
+ checkAnswer(query(), Row("Never-expire", "9999-09-09") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon procedure : expire partitions show a list of expired partitions.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING, hm STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1')
+ | PARTITIONED BY (pt,hm)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt", "hm")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // Show results : There are no expired partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd')"),
+ Row("No expired partitions.") :: Nil
+ )
+
+ // snapshot-1
+ inputData.addData(("a", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+ // snapshot-2
+ inputData.addData(("b", "2024-06-02", "02:00"))
+ stream.processAllAvailable()
+ // snapshot-3, never expires.
+ inputData.addData(("Never-expire", "9999-09-09", "99:99"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("a", "2024-06-01", "01:00") :: Row("b", "2024-06-02", "02:00") :: Row(
+ "Never-expire",
+ "9999-09-09",
+ "99:99") :: Nil)
+
+ // Show a list of expired partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'" +
+ ", expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd')"),
+ Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=02:00") :: Nil
+ )
+
+ checkAnswer(query(), Row("Never-expire", "9999-09-09", "99:99") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions with values-time strategy.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1')
+ | PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // snapshot-1
+ inputData.addData(("HXH", "2024-06-01"))
+ stream.processAllAvailable()
+
+ // Never expire.
+ inputData.addData(("Never-expire", "9999-09-09"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("HXH", "2024-06-01") :: Row("Never-expire", "9999-09-09") :: Nil)
+ // expire
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'," +
+ " expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd'" +
+ ",expire_strategy => 'values-time')"),
+ Row("pt=2024-06-01") :: Nil
+ )
+
+ checkAnswer(query(), Row("Never-expire", "9999-09-09") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions with update-time strategy.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1')
+ | PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // This partition will expire.
+ inputData.addData(("HXH", "9999-09-09"))
+ stream.processAllAvailable()
+ // Waiting for partition 'pt=9999-09-09' to expire.
+ Thread.sleep(2500L)
+ // snapshot-2
+ inputData.addData(("HXH", "2024-06-01"))
+ stream.processAllAvailable()
+
+ // Partitions that are updated within 2 second would be retained.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(" +
+ "table => 'test.T'," +
+ " expiration_time => '2 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("pt=9999-09-09") :: Nil
+ )
+
+ checkAnswer(query(), Row("HXH", "2024-06-01") :: Nil)
+
+ // Waiting for all partitions to expire.
+ Thread.sleep(1500)
+ // All partition will expire.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(" +
+ "table => 'test.T'," +
+ " expiration_time => '1 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("pt=2024-06-01") :: Nil
+ )
+
+ checkAnswer(query(), Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions with update-time strategy in same partition.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING, hm STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1')
+ | PARTITIONED BY (pt,hm)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt", "hm")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // This partition will not expire.
+ inputData.addData(("HXH", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+              // Waiting for partition 'pt=2024-06-01, hm=01:00' to expire.
+ Thread.sleep(2500L)
+ // Updating the same partition data will update partition last update time, then this partition will not expire.
+ inputData.addData(("HXH", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+
+              // The last update time of the 'pt=2024-06-01, hm=01:00' partition is updated so the partition would not expire.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'," +
+ " expiration_time => '2 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("No expired partitions.") :: Nil
+ )
+
+ checkAnswer(query(), Row("HXH", "2024-06-01", "01:00") :: Nil)
+ // Waiting for all partitions to expire.
+ Thread.sleep(1500)
+
+              // The partition 'pt=2024-06-01, hm=01:00' will expire.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'," +
+ " expiration_time => '1 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("pt=2024-06-01, hm=01:00") :: Nil
+ )
+
+ checkAnswer(query(), Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions with non-date format partition.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1')
+ | PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // This partition will expire.
+ inputData.addData(("HXH", "pt-1"))
+ stream.processAllAvailable()
+ Thread.sleep(2500L)
+ // snapshot-2
+ inputData.addData(("HXH", "pt-2"))
+ stream.processAllAvailable()
+
+ // Only update-time strategy support non date format partition to expire.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'," +
+ " expiration_time => '2 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("pt=pt-1") :: Nil
+ )
+
+ checkAnswer(query(), Row("HXH", "pt-2") :: Nil)
+
+ // Waiting for all partitions to expire.
+ Thread.sleep(1500)
+ // call expire_partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'," +
+ " expiration_time => '1 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("pt=pt-2") :: Nil
+ )
+
+ checkAnswer(query(), Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon procedure : expire partitions with specified time-pattern partitions.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING, hm STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1')
+ | PARTITIONED BY (hm, pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt", "hm")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // Show results : There are no expired partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd', timestamp_pattern => '$pt')"),
+ Row("No expired partitions.") :: Nil
+ )
+
+ // snapshot-1
+ inputData.addData(("a", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+ // snapshot-2
+ inputData.addData(("b", "2024-06-02", "02:00"))
+ stream.processAllAvailable()
+ // snapshot-3, never expires.
+ inputData.addData(("Never-expire", "9999-09-09", "99:99"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("a", "2024-06-01", "01:00") :: Row("b", "2024-06-02", "02:00") :: Row(
+ "Never-expire",
+ "9999-09-09",
+ "99:99") :: Nil)
+
+ // Show a list of expired partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'" +
+ ", expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd HH:mm'" +
+ ", timestamp_pattern => '$pt $hm')"),
+ Row("hm=01:00, pt=2024-06-01") :: Row("hm=02:00, pt=2024-06-02") :: Nil
+ )
+
+ checkAnswer(query(), Row("Never-expire", "9999-09-09", "99:99") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon procedure : sorted the expired partitions with max_expires.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING, hm STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1')
+ | PARTITIONED BY (pt,hm)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt", "hm")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // Show results : There are no expired partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd')"),
+ Row("No expired partitions.") :: Nil
+ )
+
+ inputData.addData(("a", "2024-06-02", "02:00"))
+ stream.processAllAvailable()
+ inputData.addData(("b", "2024-06-02", "01:00"))
+ stream.processAllAvailable()
+ inputData.addData(("d", "2024-06-03", "01:00"))
+ stream.processAllAvailable()
+ inputData.addData(("c", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+              // This partition never expires.
+ inputData.addData(("Never-expire", "9999-09-09", "99:99"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("a", "2024-06-02", "02:00") :: Row("b", "2024-06-02", "01:00") :: Row(
+ "d",
+ "2024-06-03",
+ "01:00") :: Row("c", "2024-06-01", "01:00") :: Row(
+ "Never-expire",
+ "9999-09-09",
+ "99:99") :: Nil
+ )
+
+ // sorted result of limited expired partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'" +
+ ", expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd', max_expires => 3)"),
+ Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=01:00") :: Row(
+ "pt=2024-06-02, hm=02:00") :: Nil
+ )
+
+ checkAnswer(
+ query(),
+ Row("d", "2024-06-03", "01:00") :: Row("Never-expire", "9999-09-09", "99:99") :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions with default num") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(
+ s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1', 'partition.expiration-max-num'='2')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // snapshot-1
+ inputData.addData(("a", "2024-06-01"))
+ stream.processAllAvailable()
+
+ // snapshot-2
+ inputData.addData(("b", "2024-06-02"))
+ stream.processAllAvailable()
+
+ // snapshot-3
+ inputData.addData(("c", "2024-06-03"))
+ stream.processAllAvailable()
+
+ // This partition never expires.
+ inputData.addData(("Never-expire", "9999-09-09"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row(
+ "Never-expire",
+ "9999-09-09") :: Nil)
+ // call expire_partitions.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" +
+ ", timestamp_formatter => 'yyyy-MM-dd')"),
+ Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil
+ )
+
+ checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions load table property first") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES (
+ | 'primary-key' = 'k,pt',
+ | 'bucket' = '1',
+ | 'write-only' = 'true',
+ | 'partition.timestamp-formatter' = 'yyyy-MM-dd',
+ | 'partition.expiration-max-num'='2')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // snapshot-1
+ inputData.addData(("a", "2024-06-01"))
+ stream.processAllAvailable()
+
+ // snapshot-2
+ inputData.addData(("b", "2024-06-02"))
+ stream.processAllAvailable()
+
+ // snapshot-3
+ inputData.addData(("c", "2024-06-03"))
+ stream.processAllAvailable()
+
+ // This partition never expires.
+ inputData.addData(("Never-expire", "9999-09-09"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row(
+ "Never-expire",
+ "9999-09-09") :: Nil)
+
+ // 'partition.timestamp-formatter' value using table property.
+ // 'partition.expiration-time' value using procedure parameter.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d')"),
+ Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil
+ )
+
+ checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions add options parameter") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING)
+ |TBLPROPERTIES (
+ | 'primary-key' = 'k,pt',
+ | 'bucket' = '1')
+ |PARTITIONED BY (pt)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // snapshot-1
+ inputData.addData(("a", "2024-06-01"))
+ stream.processAllAvailable()
+
+ // snapshot-2
+ inputData.addData(("b", "2024-06-02"))
+ stream.processAllAvailable()
+
+ // snapshot-3
+ inputData.addData(("c", "2024-06-03"))
+ stream.processAllAvailable()
+
+ // This partition never expires.
+ inputData.addData(("Never-expire", "9999-09-09"))
+ stream.processAllAvailable()
+
+ checkAnswer(
+ query(),
+ Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row(
+ "Never-expire",
+ "9999-09-09") :: Nil)
+
+ // set conf in options.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T', " +
+ "options => 'partition.expiration-time = 1d," +
+ " partition.expiration-max-num = 2," +
+ " partition.expiration-batch-size = 2," +
+ " partition.timestamp-formatter = yyyy-MM-dd')"),
+ Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil
+ )
+
+ checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala
new file mode 100644
index 000000000000..bbaf88568e2d
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+import org.apache.paimon.utils.SnapshotManager
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+import org.assertj.core.api.Assertions.{assertThat, assertThatIllegalArgumentException}
+
+import java.sql.Timestamp
+
+class ExpireSnapshotsProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ test("Paimon Procedure: expire snapshots") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and test `forEachBatch` api
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3',
+ |'write-only' = 'true', 'snapshot.num-retained.min' = '1')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ // expire
+ checkAnswer(
+ spark.sql("CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 2)"),
+ Row(1) :: Nil)
+
+ checkAnswer(
+ spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"),
+ Row(2L) :: Row(3L) :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire snapshots retainMax retainMin value check") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ // define a change-log table and test `forEachBatch` api
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ // expire assert throw exception
+ assertThrows[IllegalArgumentException] {
+ spark.sql(
+ "CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 2, retain_min => 3)")
+ }
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: test parameter order_than with string type") {
+ sql(
+ "CREATE TABLE T (a INT, b STRING) " +
+ "TBLPROPERTIES ( 'num-sorted-run.compaction-trigger' = '999'," +
+ "'write-only' = 'true', 'snapshot.num-retained.min' = '1')")
+ val table = loadTable("T")
+ val snapshotManager = table.snapshotManager
+
+ // generate 5 snapshot
+ for (i <- 1 to 5) {
+ sql(s"INSERT INTO T VALUES ($i, '$i')")
+ }
+ checkSnapshots(snapshotManager, 1, 5)
+
+ val timestamp = new Timestamp(snapshotManager.latestSnapshot().timeMillis)
+ spark.sql(
+ s"CALL paimon.sys.expire_snapshots(table => 'test.T', older_than => '${timestamp.toString}', max_deletes => 2)")
+ checkSnapshots(snapshotManager, 3, 5)
+ }
+
+ test("Paimon Procedure: expire snapshots load table property first") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3',
+ |'snapshot.num-retained.max' = '2',
+ |'snapshot.num-retained.min' = '1',
+ |'write-only' = 'true')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ // expire
+ checkAnswer(
+ spark.sql("CALL paimon.sys.expire_snapshots(table => 'test.T')"),
+ Row(1) :: Nil)
+
+ checkAnswer(
+ spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"),
+ Row(2L) :: Row(3L) :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire snapshots add options parameter") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', 'write-only' = 'true')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_snapshots(table => 'test.T', options => 'snapshot.num-retained.max=2, snapshot.num-retained.min=1')"),
+ Row(1L) :: Nil
+ )
+
+ checkAnswer(
+ spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"),
+ Row(2L) :: Row(3L) :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ def checkSnapshots(sm: SnapshotManager, earliest: Int, latest: Int): Unit = {
+ assertThat(sm.snapshotCount).isEqualTo(latest - earliest + 1)
+ assertThat(sm.earliestSnapshotId).isEqualTo(earliest)
+ assertThat(sm.latestSnapshotId).isEqualTo(latest)
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala
new file mode 100644
index 000000000000..d57846709877
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+class ProcedureTest extends ProcedureTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala
new file mode 100644
index 000000000000..078823c3ef37
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.procedure
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+
+import org.apache.spark.sql.{Dataset, Row}
+import org.apache.spark.sql.execution.streaming.runtime.MemoryStream
+import org.apache.spark.sql.streaming.StreamTest
+
+class RollbackProcedureTest extends PaimonSparkTestBase with StreamTest {
+
+ import testImplicits._
+
+ test("Paimon Procedure: rollback to snapshot and tag") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+          // define a change-log table and test the `foreachBatch` API
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val table = loadTable("T")
+ val location = table.location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_tag', snapshot => 1)"),
+ Row(true) :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+ assertThrows[RuntimeException] {
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T_exception', version => '2')")
+ }
+ // rollback to snapshot
+ checkAnswer(
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '2')"),
+ Row(table.latestSnapshot().get().id, 2) :: Nil)
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // rollback to tag
+ val taggedSnapshotId = table.tagManager().getOrThrow("test_tag").trimToSnapshot().id
+ checkAnswer(
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => 'test_tag')"),
+ Row(table.latestSnapshot().get().id, taggedSnapshotId) :: Nil)
+ checkAnswer(query(), Row(1, "a") :: Nil)
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: rollback to tag check test") {
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', 'file.format'='orc')
+ |""".stripMargin)
+
+ val table = loadTable("T")
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ // snapshot-1
+ spark.sql("insert into T select 1, 'a'")
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ checkAnswer(
+ spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => '20250122', snapshot => 1)"),
+ Row(true) :: Nil)
+
+ // snapshot-2
+ spark.sql("insert into T select 2, 'b'")
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // snapshot-3
+ spark.sql("insert into T select 3, 'c'")
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil)
+
+ // snapshot-4
+ spark.sql("insert into T select 4, 'd'")
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Row(4, "d") :: Nil)
+
+ assertThrows[RuntimeException] {
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T_exception', version => '4')")
+ }
+ // rollback to snapshot
+ checkAnswer(
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '3')"),
+ Row(table.latestSnapshot().get().id, 3) :: Nil)
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil)
+
+ // version/snapshot/tag can only set one of them
+ assertThrows[RuntimeException] {
+ spark.sql(
+ "CALL paimon.sys.rollback(table => 'test.T', version => '20250122', tag => '20250122')")
+ }
+
+ assertThrows[RuntimeException] {
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '20250122', snapshot => 1)")
+ }
+
+ assertThrows[RuntimeException] {
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', tag => '20250122', snapshot => 1)")
+ }
+
+ // rollback to snapshot
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', snapshot => 2)")
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // rollback to tag
+ spark.sql("CALL paimon.sys.rollback(table => 'test.T', tag => '20250122')")
+ checkAnswer(query(), Row(1, "a") :: Nil)
+ }
+
+ test("Paimon Procedure: rollback to timestamp") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+          // define a change-log table and test the `foreachBatch` API
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ('primary-key'='a', 'bucket'='3')
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(Int, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("a", "b")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val table = loadTable("T")
+
+ val query = () => spark.sql("SELECT * FROM T ORDER BY a")
+
+ try {
+ // snapshot-1
+ inputData.addData((1, "a"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Nil)
+
+ // snapshot-2
+ inputData.addData((2, "b"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ val timestamp = System.currentTimeMillis()
+
+ // snapshot-3
+ inputData.addData((2, "b2"))
+ stream.processAllAvailable()
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil)
+
+ // rollback to timestamp
+ checkAnswer(
+ spark.sql(
+ s"CALL paimon.sys.rollback_to_timestamp(table => 'test.T', timestamp => $timestamp)"),
+ Row(table.latestSnapshot().get().id, 2) :: Nil)
+ checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: rollback with cache") {
+ sql("CREATE TABLE T (id INT)")
+ sql("INSERT INTO T VALUES (1), (2), (3), (4)")
+ sql("DELETE FROM T WHERE id = 1")
+ sql("CALL sys.rollback(table => 'T', version => '1')")
+ sql("DELETE FROM T WHERE id = 1")
+ checkAnswer(sql("SELECT * FROM T ORDER BY id"), Seq(Row(2), Row(3), Row(4)))
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala
new file mode 100644
index 000000000000..255906d04bf2
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class AnalyzeTableTest extends AnalyzeTableTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala
new file mode 100644
index 000000000000..b729f57b33e7
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class DDLTest extends DDLTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala
new file mode 100644
index 000000000000..cb139d2a57be
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class DDLWithHiveCatalogTest extends DDLWithHiveCatalogTestBase {}
+
+class DefaultDatabaseTest extends DefaultDatabaseTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala
new file mode 100644
index 000000000000..6170e2fd6c5c
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class DataFrameWriteTest extends DataFrameWriteTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala
new file mode 100644
index 000000000000..b25e41a3fb42
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala
@@ -0,0 +1,701 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.paimon.spark.PaimonSparkTestBase
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.DecimalType
+import org.junit.jupiter.api.Assertions
+
+import java.sql.{Date, Timestamp}
+
+abstract class DataFrameWriteTestBase extends PaimonSparkTestBase {
+
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.sql.catalog.paimon.cache-enabled", "false")
+ }
+
+ import testImplicits._
+
+ test("Paimon dataframe: insert into partitioned table") {
+ for (useV2Write <- Seq("true", "false")) {
+ withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) {
+ withTable("t") {
+ // create table
+ Seq((1, "x1", "p1"), (2, "x2", "p2"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .option("primary-key", "a,pt")
+ .partitionBy("pt")
+ .saveAsTable("t")
+
+ // insert into
+ Seq((3, "x3", "p3"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .mode("append")
+ .insertInto("t")
+ checkAnswer(
+ spark.read.format("paimon").table("t").orderBy("a"),
+ Seq(Row(1, "x1", "p1"), Row(2, "x2", "p2"), Row(3, "x3", "p3"))
+ )
+ checkAnswer(
+ sql("SHOW PARTITIONS t"),
+ Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3"))
+ )
+
+ // dynamic insert overwrite
+ withSparkSQLConf("spark.sql.sources.partitionOverwriteMode" -> "dynamic") {
+ Seq((4, "x4", "p1"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .mode("overwrite")
+ .insertInto("t")
+ }
+ checkAnswer(
+ spark.read.format("paimon").table("t").orderBy("a"),
+ Seq(Row(2, "x2", "p2"), Row(3, "x3", "p3"), Row(4, "x4", "p1"))
+ )
+ checkAnswer(
+ sql("SHOW PARTITIONS t"),
+ Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3"))
+ )
+
+ // insert overwrite
+ Seq((5, "x5", "p1"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .mode("overwrite")
+ .insertInto("t")
+ checkAnswer(
+ spark.read.format("paimon").table("t").orderBy("a"),
+ Seq(Row(5, "x5", "p1"))
+ )
+ checkAnswer(
+ sql("SHOW PARTITIONS t"),
+ Seq(Row("pt=p1"))
+ )
+ }
+ }
+ }
+ }
+
+ test("Paimon dataframe: save as partitioned table") {
+ for (useV2Write <- Seq("true", "false")) {
+ withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) {
+ withTable("t") {
+ // create table
+ Seq((1, "x1", "p1"), (2, "x2", "p2"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .mode("append")
+ .option("primary-key", "a,pt")
+ .partitionBy("pt")
+ .saveAsTable("t")
+
+ // saveAsTable with append mode
+ Seq((3, "x3", "p3"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .mode("append")
+ .saveAsTable("t")
+ checkAnswer(
+ spark.read.format("paimon").table("t").orderBy("a"),
+ Seq(Row(1, "x1", "p1"), Row(2, "x2", "p2"), Row(3, "x3", "p3"))
+ )
+ checkAnswer(
+ sql("SHOW PARTITIONS t"),
+ Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3"))
+ )
+
+ // saveAsTable with overwrite mode will call replace table internal,
+ // so here we set the props and partitions again.
+ Seq((5, "x5", "p1"))
+ .toDF("a", "b", "pt")
+ .write
+ .format("paimon")
+ .option("primary-key", "a,pt")
+ .partitionBy("pt")
+ .mode("overwrite")
+ .saveAsTable("t")
+ checkAnswer(
+ spark.read.format("paimon").table("t").orderBy("a"),
+ Seq(Row(5, "x5", "p1"))
+ )
+ checkAnswer(
+ sql("SHOW PARTITIONS t"),
+ Seq(Row("pt=p1"))
+ )
+ }
+ }
+ }
+ }
+
+ test("Paimon: DataFrameWrite.saveAsTable") {
+ withTable("test_ctas") {
+ Seq((1L, "x1"), (2L, "x2"))
+ .toDF("a", "b")
+ .write
+ .format("paimon")
+ .mode("append")
+ .option("primary-key", "a")
+ .option("bucket", "-1")
+ .option("target-file-size", "256MB")
+ .option("write.merge-schema", "true")
+ .option("write.merge-schema.explicit-cast", "true")
+ .saveAsTable("test_ctas")
+
+ val paimonTable = loadTable("test_ctas")
+ Assertions.assertEquals(1, paimonTable.primaryKeys().size())
+ Assertions.assertEquals("a", paimonTable.primaryKeys().get(0))
+
+ // check all the core options
+ Assertions.assertEquals("-1", paimonTable.options().get("bucket"))
+ Assertions.assertEquals("256MB", paimonTable.options().get("target-file-size"))
+
+ // non-core options should not be here.
+ Assertions.assertFalse(paimonTable.options().containsKey("write.merge-schema"))
+ Assertions.assertFalse(paimonTable.options().containsKey("write.merge-schema.explicit-cast"))
+ }
+ }
+
+ test("Paimon: DataFrameWrite partition table") {
+ withTable("t") {
+ spark.sql(s"""
+ |CREATE TABLE t (a INT, b STRING, dt STRING) PARTITIONED BY(dt)
+ |TBLPROPERTIES ('file.format' = 'avro', 'bucket' = 2, 'bucket-key' = 'b')
+ |""".stripMargin)
+
+ val table = loadTable("t")
+ val location = table.location().toString
+
+ Seq((1, "x1", "a"), (2, "x2", "b"))
+ .toDF("a", "b", "c")
+ .write
+ .format("paimon")
+ .mode("append")
+ .save(location)
+ checkAnswer(sql("SELECT * FROM t"), Row(1, "x1", "a") :: Row(2, "x2", "b") :: Nil)
+ }
+ }
+
+ fileFormats.foreach {
+ fileFormat =>
+ test(s"Paimon: DataFrameWrite.saveAsTable in ByName mode, file.format: $fileFormat") {
+ withTable("t1", "t2") {
+ spark.sql(s"""
+ |CREATE TABLE t1 (col1 STRING, col2 INT, col3 DOUBLE)
+ |TBLPROPERTIES ('file.format' = '$fileFormat')
+ |""".stripMargin)
+
+ spark.sql(s"""
+ |CREATE TABLE t2 (col2 INT, col3 DOUBLE, col1 STRING)
+ |TBLPROPERTIES ('file.format' = '$fileFormat')
+ |""".stripMargin)
+
+ sql(s"""
+ |INSERT INTO TABLE t1 VALUES
+ |("Hello", 1, 1.1),
+ |("World", 2, 2.2),
+ |("Paimon", 3, 3.3);
+ |""".stripMargin)
+
+ spark.table("t1").write.format("paimon").mode("append").saveAsTable("t2")
+ checkAnswer(
+ sql("SELECT * FROM t2 ORDER BY col2"),
+ Row(1, 1.1d, "Hello") :: Row(2, 2.2d, "World") :: Row(3, 3.3d, "Paimon") :: Nil)
+ }
+ }
+ }
+
+ fileFormats.foreach {
+ fileFormat =>
+ test(
+ s"Paimon: DataFrameWrite.saveAsTable with complex data type in ByName mode, file.format: $fileFormat") {
+ withTable("t1", "t2") {
+ spark.sql(
+ s"""
+ |CREATE TABLE t1 (a STRING, b INT, c STRUCT, d ARRAY>>, e ARRAY)
+ |TBLPROPERTIES ('file.format' = '$fileFormat')
+ |""".stripMargin)
+
+ spark.sql(
+ s"""
+ |CREATE TABLE t2 (b INT, c STRUCT, d ARRAY, d1 TIMESTAMP>>, e ARRAY, a STRING)
+ |TBLPROPERTIES ('file.format' = '$fileFormat')
+ |""".stripMargin)
+
+ sql(s"""
+ |INSERT INTO TABLE t1 VALUES
+ |("Hello", 1, struct(1.1, 1000), array(struct(timestamp'2024-01-01 00:00:00', map("k1", "v1")), struct(timestamp'2024-08-01 00:00:00', map("k1", "v11"))), array(123, 345)),
+ |("World", 2, struct(2.2, 2000), array(struct(timestamp'2024-02-01 00:00:00', map("k2", "v2"))), array(234, 456)),
+ |("Paimon", 3, struct(3.3, 3000), null, array(345, 567));
+ |""".stripMargin)
+
+ spark.table("t1").write.format("paimon").mode("append").saveAsTable("t2")
+ checkAnswer(
+ sql("SELECT * FROM t2 ORDER BY b"),
+ Row(
+ 1,
+ Row(1000L, 1.1d),
+ Array(
+ Row(Map("k1" -> "v1"), Timestamp.valueOf("2024-01-01 00:00:00")),
+ Row(Map("k1" -> "v11"), Timestamp.valueOf("2024-08-01 00:00:00"))),
+ Array(123, 345),
+ "Hello"
+ )
+ :: Row(
+ 2,
+ Row(2000L, 2.2d),
+ Array(Row(Map("k2" -> "v2"), Timestamp.valueOf("2024-02-01 00:00:00"))),
+ Array(234, 456),
+ "World")
+ :: Row(3, Row(3000L, 3.3d), null, Array(345, 567), "Paimon") :: Nil
+ )
+ }
+ }
+ }
+
+ withPk.foreach {
+ hasPk =>
+ bucketModes.foreach {
+ bucket =>
+ test(s"Write data into Paimon directly: has-pk: $hasPk, bucket: $bucket") {
+
+ val prop = if (hasPk) {
+ s"'primary-key'='a', 'bucket' = '$bucket' "
+ } else if (bucket != -1) {
+ s"'bucket-key'='a', 'bucket' = '$bucket' "
+ } else {
+ "'write-only'='true'"
+ }
+
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ($prop)
+ |""".stripMargin)
+
+ val paimonTable = loadTable("T")
+ val location = paimonTable.location().toString
+
+ val df1 = Seq((1, "a"), (2, "b")).toDF("a", "b")
+ df1.write.format("paimon").mode("append").save(location)
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(1, "a") :: Row(2, "b") :: Nil)
+
+ val df2 = Seq((1, "a2"), (3, "c")).toDF("a", "b")
+ df2.write.format("paimon").mode("append").save(location)
+ val expected = if (hasPk) {
+ Row(1, "a2") :: Row(2, "b") :: Row(3, "c") :: Nil
+ } else {
+ Row(1, "a") :: Row(1, "a2") :: Row(2, "b") :: Row(3, "c") :: Nil
+ }
+ checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected)
+
+ val df3 = Seq((4, "d"), (5, "e")).toDF("a", "b")
+ df3.write.format("paimon").mode("overwrite").save(location)
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(4, "d") :: Row(5, "e") :: Nil)
+ }
+ }
+ }
+
+ fileFormats.foreach {
+ format =>
+ withPk.foreach {
+ hasPk =>
+ bucketModes.foreach {
+ bucket =>
+ test(
+ s"Schema evolution: write data into Paimon: $hasPk, bucket: $bucket, format: $format") {
+ val _spark = spark
+ import _spark.implicits._
+
+ val prop = if (hasPk) {
+ s"'primary-key'='a', 'bucket' = '$bucket', 'file.format' = '$format'"
+ } else if (bucket != -1) {
+ s"'bucket-key'='a', 'bucket' = '$bucket', 'file.format' = '$format'"
+ } else {
+ s"'write-only'='true', 'file.format' = '$format'"
+ }
+
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ($prop)
+ |""".stripMargin)
+
+ val paimonTable = loadTable("T")
+ val location = paimonTable.location().toString
+
+ val df1 = Seq((1, "a"), (2, "b")).toDF("a", "b")
+ df1.write.format("paimon").mode("append").save(location)
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(1, "a") :: Row(2, "b") :: Nil)
+
+ // Case 1: two additional fields
+ val df2 = Seq((1, "a2", 123L, Map("k" -> 11.1)), (3, "c", 345L, Map("k" -> 33.3)))
+ .toDF("a", "b", "c", "d")
+ df2.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+ val expected2 = if (hasPk) {
+ Row(1, "a2", 123L, Map("k" -> 11.1)) ::
+ Row(2, "b", null, null) :: Row(3, "c", 345L, Map("k" -> 33.3)) :: Nil
+ } else {
+ Row(1, "a", null, null) :: Row(1, "a2", 123L, Map("k" -> 11.1)) :: Row(
+ 2,
+ "b",
+ null,
+ null) :: Row(3, "c", 345L, Map("k" -> 33.3)) :: Nil
+ }
+ checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected2)
+
+ // Case 2: two fields with the evolved types: Int -> Long, Long -> Decimal
+ val df3 = Seq(
+ (2L, "b2", BigDecimal.decimal(234), Map("k" -> 22.2)),
+ (4L, "d", BigDecimal.decimal(456), Map("k" -> 44.4))).toDF("a", "b", "c", "d")
+ df3.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+ val expected3 = if (hasPk) {
+ Row(1L, "a2", BigDecimal.decimal(123), Map("k" -> 11.1)) :: Row(
+ 2L,
+ "b2",
+ BigDecimal.decimal(234),
+ Map("k" -> 22.2)) :: Row(
+ 3L,
+ "c",
+ BigDecimal.decimal(345),
+ Map("k" -> 33.3)) :: Row(
+ 4L,
+ "d",
+ BigDecimal.decimal(456),
+ Map("k" -> 44.4)) :: Nil
+ } else {
+ Row(1L, "a", null, null) :: Row(
+ 1L,
+ "a2",
+ BigDecimal.decimal(123),
+ Map("k" -> 11.1)) :: Row(2L, "b", null, null) :: Row(
+ 2L,
+ "b2",
+ BigDecimal.decimal(234),
+ Map("k" -> 22.2)) :: Row(
+ 3L,
+ "c",
+ BigDecimal.decimal(345),
+ Map("k" -> 33.3)) :: Row(
+ 4L,
+ "d",
+ BigDecimal.decimal(456),
+ Map("k" -> 44.4)) :: Nil
+ }
+ checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected3)
+
+ // Case 3: insert Decimal(20,18) to Decimal(38,18)
+ val df4 = Seq((99L, "df4", BigDecimal.decimal(4.0), Map("4" -> 4.1)))
+ .toDF("a", "b", "c", "d")
+ .selectExpr("a", "b", "cast(c as decimal(20,18)) as c", "d")
+ df4.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+ val expected4 =
+ expected3 ++ Seq(Row(99L, "df4", BigDecimal.decimal(4.0), Map("4" -> 4.1)))
+ checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected4)
+ val decimalType =
+ spark.table("T").schema.apply(2).dataType.asInstanceOf[DecimalType]
+ assert(decimalType.precision == 38)
+ assert(decimalType.scale == 18)
+ }
+ }
+ }
+ }
+
+ withPk.foreach {
+ hasPk =>
+ bucketModes.foreach {
+ bucket =>
+ test(
+ s"Schema evolution: write data into Paimon with allowExplicitCast = true: $hasPk, bucket: $bucket") {
+
+ val prop = if (hasPk) {
+ s"'primary-key'='a', 'bucket' = '$bucket' "
+ } else if (bucket != -1) {
+ s"'bucket-key'='a', 'bucket' = '$bucket' "
+ } else {
+ "'write-only'='true'"
+ }
+
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |TBLPROPERTIES ($prop)
+ |""".stripMargin)
+
+ val paimonTable = loadTable("T")
+ val location = paimonTable.location().toString
+
+ val df1 = Seq((1, "2023-08-01"), (2, "2023-08-02")).toDF("a", "b")
+ df1.write.format("paimon").mode("append").save(location)
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(1, "2023-08-01") :: Row(2, "2023-08-02") :: Nil)
+
+ // Case 1: two additional fields: DoubleType and TimestampType
+ val ts = java.sql.Timestamp.valueOf("2023-08-01 10:00:00.0")
+ val df2 = Seq((1, "2023-08-01", 12.3d, ts), (3, "2023-08-03", 34.5d, ts))
+ .toDF("a", "b", "c", "d")
+ df2.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+ val expected2 = if (hasPk) {
+ Row(1, "2023-08-01", 12.3d, ts) ::
+ Row(2, "2023-08-02", null, null) :: Row(3, "2023-08-03", 34.5d, ts) :: Nil
+ } else {
+ Row(1, "2023-08-01", null, null) :: Row(1, "2023-08-01", 12.3d, ts) :: Row(
+ 2,
+ "2023-08-02",
+ null,
+ null) :: Row(3, "2023-08-03", 34.5d, ts) :: Nil
+ }
+ checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected2)
+
+ // Case 2: a: Int -> Long, b: String -> Date, c: Long -> Int, d: Map -> String
+ val date = java.sql.Date.valueOf("2023-07-31")
+ val df3 = Seq((2L, date, 234, null), (4L, date, 456, "2023-08-01 11:00:00.0")).toDF(
+ "a",
+ "b",
+ "c",
+ "d")
+
+ // throw UnsupportedOperationException if write.merge-schema.explicit-cast = false
+ assertThrows[UnsupportedOperationException] {
+ df3.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+ }
+ // merge schema and write data when write.merge-schema.explicit-cast = true
+ df3.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .option("write.merge-schema.explicit-cast", "true")
+ .save(location)
+ val expected3 = if (hasPk) {
+ Row(1L, Date.valueOf("2023-08-01"), 12, ts.toString) :: Row(
+ 2L,
+ date,
+ 234,
+ null) :: Row(3L, Date.valueOf("2023-08-03"), 34, ts.toString) :: Row(
+ 4L,
+ date,
+ 456,
+ "2023-08-01 11:00:00.0") :: Nil
+ } else {
+ Row(1L, Date.valueOf("2023-08-01"), null, null) :: Row(
+ 1L,
+ Date.valueOf("2023-08-01"),
+ 12,
+ ts.toString) :: Row(2L, date, 234, null) :: Row(
+ 2L,
+ Date.valueOf("2023-08-02"),
+ null,
+ null) :: Row(3L, Date.valueOf("2023-08-03"), 34, ts.toString) :: Row(
+ 4L,
+ date,
+ 456,
+ "2023-08-01 11:00:00.0") :: Nil
+ }
+ checkAnswer(
+ spark.sql("SELECT a, b, c, substring(d, 0, 21) FROM T ORDER BY a, b"),
+ expected3)
+
+ }
+ }
+ }
+
+ withPk.foreach {
+ hasPk =>
+ test(s"Support v2 write with overwrite, hasPk: $hasPk") {
+ withTable("t") {
+ val prop = if (hasPk) {
+ "'primary-key'='c1'"
+ } else {
+ "'write-only'='true'"
+ }
+ spark.sql(s"""
+ |CREATE TABLE t (c1 INT, c2 STRING) PARTITIONED BY(p1 String, p2 string)
+ |TBLPROPERTIES ($prop)
+ |""".stripMargin)
+
+ spark
+ .range(3)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite($"p1" === "a")
+ checkAnswer(
+ spark.sql("SELECT * FROM t ORDER BY c1"),
+ Row(0, "0", "a", "0") :: Row(1, "1", "a", "1") :: Row(2, "2", "a", "2") :: Nil
+ )
+
+ spark
+ .range(7, 10)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite($"p1" === "a")
+ checkAnswer(
+ spark.sql("SELECT * FROM t ORDER BY c1"),
+ Row(7, "7", "a", "7") :: Row(8, "8", "a", "8") :: Row(9, "9", "a", "9") :: Nil
+ )
+
+ spark
+ .range(2)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "9 as p2")
+ .writeTo("t")
+ .overwrite(($"p1" <=> "a").and($"p2" === "9"))
+ checkAnswer(
+ spark.sql("SELECT * FROM t ORDER BY c1"),
+ Row(0, "0", "a", "9") :: Row(1, "1", "a", "9") :: Row(7, "7", "a", "7") ::
+ Row(8, "8", "a", "8") :: Nil
+ )
+
+ // bad case
+ val msg1 = intercept[Exception] {
+ spark
+ .range(2)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite($"p1" =!= "a")
+ }.getMessage
+ assert(msg1.contains("Only support Overwrite filters with Equal and EqualNullSafe"))
+
+ val msg2 = intercept[Exception] {
+ spark
+ .range(2)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite($"p1" === $"c2")
+ }.getMessage
+ if (gteqSpark3_4) {
+ assert(msg2.contains("Table does not support overwrite by expression"))
+ } else {
+ assert(msg2.contains("cannot translate expression to source filter"))
+ }
+
+ val msg3 = intercept[Exception] {
+ spark
+ .range(2)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite($"c1" === ($"c2" + 1))
+ }.getMessage
+ if (gteqSpark4_0) {
+ assert(msg3.contains("Table does not support overwrite by expression"))
+ } else {
+ assert(msg3.contains("cannot translate expression to source filter"))
+ }
+
+ val msg4 = intercept[Exception] {
+ spark
+ .range(2)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite(($"p1" === "a").and($"p1" === "b"))
+ }.getMessage
+ assert(msg4.contains("Only support Overwrite with one filter for each partition column"))
+
+ // Overwrite a partition which is not the specified
+ val msg5 = intercept[Exception] {
+ spark
+ .range(2)
+ .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
+ .writeTo("t")
+ .overwrite($"p1" === "b")
+ }.getMessage
+ assert(msg5.contains("does not belong to this partition"))
+ }
+ }
+ }
+
+ test("Paimon Schema Evolution: some columns is absent in the coming data") {
+
+ spark.sql(s"""
+ |CREATE TABLE T (a INT, b STRING)
+ |""".stripMargin)
+
+ val paimonTable = loadTable("T")
+ val location = paimonTable.location().toString
+
+ val df1 = Seq((1, "2023-08-01"), (2, "2023-08-02")).toDF("a", "b")
+ df1.write.format("paimon").mode("append").save(location)
+ checkAnswer(
+ spark.sql("SELECT * FROM T ORDER BY a, b"),
+ Row(1, "2023-08-01") :: Row(2, "2023-08-02") :: Nil)
+
+ // Case 1: two additional fields: DoubleType and TimestampType
+ val ts = java.sql.Timestamp.valueOf("2023-08-01 10:00:00.0")
+ val df2 = Seq((1, "2023-08-01", 12.3d, ts), (3, "2023-08-03", 34.5d, ts))
+ .toDF("a", "b", "c", "d")
+ df2.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+
+    // Case 2: columns b and d are absent in the coming data
+ val df3 = Seq((4, 45.6d), (5, 56.7d))
+ .toDF("a", "c")
+ df3.write
+ .format("paimon")
+ .mode("append")
+ .option("write.merge-schema", "true")
+ .save(location)
+ val expected3 =
+ Row(1, "2023-08-01", null, null) :: Row(1, "2023-08-01", 12.3d, ts) :: Row(
+ 2,
+ "2023-08-02",
+ null,
+ null) :: Row(3, "2023-08-03", 34.5d, ts) :: Row(4, null, 45.6d, null) :: Row(
+ 5,
+ null,
+ 56.7d,
+ null) :: Nil
+ checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected3)
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala
new file mode 100644
index 000000000000..8d620ece8245
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.spark.SparkConf
+
+class DeleteFromTableTest extends DeleteFromTableTestBase {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "false")
+ }
+}
+
+class V2DeleteFromTableTest extends DeleteFromTableTestBase {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "true")
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala
new file mode 100644
index 000000000000..c6aa77419241
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class DescribeTableTest extends DescribeTableTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala
new file mode 100644
index 000000000000..ba49976ab6c0
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class FormatTableTest extends FormatTableTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala
new file mode 100644
index 000000000000..4f66584c303b
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class InsertOverwriteTableTest extends InsertOverwriteTableTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala
new file mode 100644
index 000000000000..c83ee5493867
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest}
+
+import org.apache.spark.SparkConf
+
+class MergeIntoPrimaryKeyBucketedTableTest
+ extends MergeIntoTableTestBase
+ with MergeIntoPrimaryKeyTableTest
+ with MergeIntoNotMatchedBySourceTest
+ with PaimonPrimaryKeyBucketedTableTest {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "false")
+ }
+}
+
+class MergeIntoPrimaryKeyNonBucketTableTest
+ extends MergeIntoTableTestBase
+ with MergeIntoPrimaryKeyTableTest
+ with MergeIntoNotMatchedBySourceTest
+ with PaimonPrimaryKeyNonBucketTableTest {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "false")
+ }
+}
+
+class MergeIntoAppendBucketedTableTest
+ extends MergeIntoTableTestBase
+ with MergeIntoAppendTableTest
+ with MergeIntoNotMatchedBySourceTest
+ with PaimonAppendBucketedTableTest {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "false")
+ }
+}
+
+class MergeIntoAppendNonBucketedTableTest
+ extends MergeIntoTableTestBase
+ with MergeIntoAppendTableTest
+ with MergeIntoNotMatchedBySourceTest
+ with PaimonAppendNonBucketTableTest {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "false")
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala
new file mode 100644
index 000000000000..635185a9ed0e
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class PaimonCompositePartitionKeyTest extends PaimonCompositePartitionKeyTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala
new file mode 100644
index 000000000000..ec140a89bbd3
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.expressions.{Attribute, GetStructField, NamedExpression, ScalarSubquery}
+import org.apache.spark.sql.paimon.shims.SparkShimLoader
+
+class PaimonOptimizationTest extends PaimonOptimizationTestBase {
+
+ override def extractorExpression(
+ cteIndex: Int,
+ output: Seq[Attribute],
+ fieldIndex: Int): NamedExpression = {
+ GetStructField(
+ ScalarSubquery(
+ SparkShimLoader.shim
+ .createCTERelationRef(cteIndex, resolved = true, output.toSeq, isStreaming = false)),
+ fieldIndex,
+ None)
+ .as("scalarsubquery()")
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala
new file mode 100644
index 000000000000..26677d85c71a
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class PaimonPushDownTest extends PaimonPushDownTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala
new file mode 100644
index 000000000000..f37fbad27033
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class PaimonV1FunctionTest extends PaimonV1FunctionTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala
new file mode 100644
index 000000000000..6ab8a2671b51
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class PaimonViewTest extends PaimonViewTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala
new file mode 100644
index 000000000000..412aa3b30351
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class RewriteUpsertTableTest extends RewriteUpsertTableTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala
new file mode 100644
index 000000000000..da4c9b854df3
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class RowIdPushDownTest extends RowIdPushDownTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala
new file mode 100644
index 000000000000..9f96840a7788
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class RowTrackingTest extends RowTrackingTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala
new file mode 100644
index 000000000000..6601dc2fca37
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class ShowColumnsTest extends PaimonShowColumnsTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala
new file mode 100644
index 000000000000..21c4c8a495ed
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class SparkV2FilterConverterTest extends SparkV2FilterConverterTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala
new file mode 100644
index 000000000000..92309d54167b
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+class TagDdlTest extends PaimonTagDdlTestBase {}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala
new file mode 100644
index 000000000000..3a0f56cd4820
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.spark.SparkConf
+
+class UpdateTableTest extends UpdateTableTestBase {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "false")
+ }
+}
+
+class V2UpdateTableTest extends UpdateTableTestBase {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.write.use-v2-write", "true")
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala
new file mode 100644
index 000000000000..94e9ac683f02
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.spark.SparkConf
+
+class VariantTest extends VariantTestBase {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.variant.inferShreddingSchema", "false")
+ }
+}
+
+class VariantInferShreddingTest extends VariantTestBase {
+ override protected def sparkConf: SparkConf = {
+ super.sparkConf.set("spark.paimon.variant.inferShreddingSchema", "true")
+ }
+}
diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala
new file mode 100644
index 000000000000..7ac3c5df0d00
--- /dev/null
+++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.spark.sql
+
+import org.apache.paimon.spark.PaimonScan
+
+/** Tests for vector search table-valued function with global vector index. */
+class VectorSearchPushDownTest extends BaseVectorSearchPushDownTest {
+ test("vector search with global index") {
+ withTable("T") {
+ spark.sql("""
+ |CREATE TABLE T (id INT, v ARRAY<FLOAT>)
+ |TBLPROPERTIES (
+ | 'bucket' = '-1',
+ | 'global-index.row-count-per-shard' = '10000',
+ | 'row-tracking.enabled' = 'true',
+ | 'data-evolution.enabled' = 'true')
+ |""".stripMargin)
+
+ // Insert 100 rows with predictable vectors
+ val values = (0 until 100)
+ .map(
+ i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))")
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ // Create vector index
+ val output = spark
+ .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')")
+ .collect()
+ .head
+ assert(output.getBoolean(0))
+
+ // Test vector search with table-valued function syntax
+ val result = spark
+ .sql("""
+ |SELECT * FROM vector_search('T', 'v', array(50.0f, 51.0f, 52.0f), 5)
+ |""".stripMargin)
+ .collect()
+
+ // The result should contain 5 rows
+ assert(result.length == 5)
+
+ // Vector (50, 51, 52) should be most similar to the row with id=50
+ assert(result.map(_.getInt(0)).contains(50))
+ }
+ }
+
+ test("vector search pushdown is applied in plan") {
+ withTable("T") {
+ spark.sql("""
+ |CREATE TABLE T (id INT, v ARRAY<FLOAT>)
+ |TBLPROPERTIES (
+ | 'bucket' = '-1',
+ | 'global-index.row-count-per-shard' = '10000',
+ | 'row-tracking.enabled' = 'true',
+ | 'data-evolution.enabled' = 'true')
+ |""".stripMargin)
+
+ val values = (0 until 10)
+ .map(
+ i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))")
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ // Create vector index
+ spark
+ .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')")
+ .collect()
+
+ // Check that vector search is pushed down with table function syntax
+ val df = spark.sql("""
+ |SELECT * FROM vector_search('T', 'v', array(50.0f, 51.0f, 52.0f), 5)
+ |""".stripMargin)
+
+ // Get the scan from the executed plan (physical plan)
+ val executedPlan = df.queryExecution.executedPlan
+ val batchScans = executedPlan.collect {
+ case scan: org.apache.spark.sql.execution.datasources.v2.BatchScanExec => scan
+ }
+
+ assert(batchScans.nonEmpty, "Should have a BatchScanExec in executed plan")
+ val paimonScans = batchScans.filter(_.scan.isInstanceOf[PaimonScan])
+ assert(paimonScans.nonEmpty, "Should have a PaimonScan in executed plan")
+
+ val paimonScan = paimonScans.head.scan.asInstanceOf[PaimonScan]
+ assert(paimonScan.pushedVectorSearch.isDefined, "Vector search should be pushed down")
+ assert(paimonScan.pushedVectorSearch.get.fieldName() == "v", "Field name should be 'v'")
+ assert(paimonScan.pushedVectorSearch.get.limit() == 5, "Limit should be 5")
+ }
+ }
+
+ test("vector search topk returns correct results") {
+ withTable("T") {
+ spark.sql("""
+ |CREATE TABLE T (id INT, v ARRAY<FLOAT>)
+ |TBLPROPERTIES (
+ | 'bucket' = '-1',
+ | 'global-index.row-count-per-shard' = '10000',
+ | 'row-tracking.enabled' = 'true',
+ | 'data-evolution.enabled' = 'true')
+ |""".stripMargin)
+
+ // Insert rows with distinct vectors
+ val values = (1 to 100)
+ .map {
+ i =>
+ val v = math.sqrt(3.0 * i * i)
+ val normalized = i.toFloat / v.toFloat
+ s"($i, array($normalized, $normalized, $normalized))"
+ }
+ .mkString(",")
+ spark.sql(s"INSERT INTO T VALUES $values")
+
+ // Create vector index
+ spark.sql(
+ "CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')")
+
+ // Query for top 10 similar to (1, 1, 1) normalized
+ val result = spark
+ .sql("""
+ |SELECT * FROM vector_search('T', 'v', array(0.577f, 0.577f, 0.577f), 10)
+ |""".stripMargin)
+ .collect()
+
+ assert(result.length == 10)
+ }
+ }
+}
diff --git a/pom.xml b/pom.xml
index 0db5ac8d4560..eb5844e7b559 100644
--- a/pom.xml
+++ b/pom.xml
@@ -89,7 +89,7 @@ under the License.
1.20.1
2.12
2.12.18
- 2.13.16
+ 2.13.17
${scala212.version}
${scala212.version}
1.1.10.8
@@ -424,6 +424,7 @@ under the License.
paimon-spark/paimon-spark4-common
paimon-spark/paimon-spark-4.0
+ paimon-spark/paimon-spark-4.1
17