diff --git a/.github/workflows/utitcase-spark-4.x.yml b/.github/workflows/utitcase-spark-4.x.yml index 56629110f503..993fa97ba2cf 100644 --- a/.github/workflows/utitcase-spark-4.x.yml +++ b/.github/workflows/utitcase-spark-4.x.yml @@ -61,7 +61,7 @@ jobs: jvm_timezone=$(random_timezone) echo "JVM timezone is set to $jvm_timezone" test_modules="" - for suffix in ut 4.0; do + for suffix in ut 4.0 4.1; do test_modules+="org.apache.paimon:paimon-spark-${suffix}_2.13," done test_modules="${test_modules%,}" diff --git a/docs/content/spark/quick-start.md b/docs/content/spark/quick-start.md index 58530ebcb73e..524d82a16352 100644 --- a/docs/content/spark/quick-start.md +++ b/docs/content/spark/quick-start.md @@ -30,7 +30,7 @@ under the License. Paimon supports the following Spark versions with their respective Java and Scala compatibility. We recommend using the latest Spark version for a better experience. -- Spark 4.x (including 4.0) : Pre-built with Java 17 and Scala 2.13 +- Spark 4.x (including 4.1, 4.0) : Pre-built with Java 17 and Scala 2.13 - Spark 3.x (including 3.5, 3.4, 3.3, 3.2) : Pre-built with Java 8 and Scala 2.12/2.13 @@ -40,6 +40,7 @@ Download the jar file with corresponding version. 
| Version | Jar (Scala 2.12) | Jar (Scala 2.13) | |-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Spark 4.1 | - | [paimon-spark-4.1_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-4.1_2.13/{{< version >}}/paimon-spark-4.1_2.13-{{< version >}}.jar) | | Spark 4.0 | - | [paimon-spark-4.0_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-4.0_2.13/{{< version >}}/paimon-spark-4.0_2.13-{{< version >}}.jar) | | Spark 3.5 | [paimon-spark-3.5_2.12-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.5_2.12/{{< version >}}/paimon-spark-3.5_2.12-{{< version >}}.jar) | [paimon-spark-3.5_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.5_2.13/{{< version >}}/paimon-spark-3.5_2.13-{{< version >}}.jar) | | Spark 3.4 | [paimon-spark-3.4_2.12-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.4_2.12/{{< version >}}/paimon-spark-3.4_2.12-{{< version >}}.jar) | [paimon-spark-3.4_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.4_2.13/{{< version >}}/paimon-spark-3.4_2.13-{{< version >}}.jar) | @@ -52,6 +53,7 @@ Download the jar file with corresponding version. 
| Version | Jar (Scala 2.12) | Jar (Scala 2.13) | |-----------|-----------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| Spark 4.1 | - | [paimon-spark-4.1_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-4.1_2.13/{{< version >}}/) | | Spark 4.0 | - | [paimon-spark-4.0_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-4.0_2.13/{{< version >}}/) | | Spark 3.5 | [paimon-spark-3.5_2.12-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.5_2.12/{{< version >}}/) | [paimon-spark-3.5_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.5_2.13/{{< version >}}/) | | Spark 3.4 | [paimon-spark-3.4_2.12-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.4_2.12/{{< version >}}/) | [paimon-spark-3.4_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.4_2.13/{{< version >}}/) | @@ -73,6 +75,9 @@ mvn clean package -DskipTests -pl paimon-spark/paimon-spark-3.5 -am -Pscala-2.13 # build paimon spark 4.0 mvn clean package -DskipTests -pl paimon-spark/paimon-spark-4.0 -am -Pspark4 + +# build paimon spark 4.1 +mvn clean package -DskipTests -pl paimon-spark/paimon-spark-4.1 -am -Pspark4 ``` For Spark 3.5, you can find the bundled jar in `./paimon-spark/paimon-spark-3.5/target/paimon-spark-3.5_2.12-{{< version >}}.jar`. 
diff --git a/paimon-spark/paimon-spark-4.1/pom.xml b/paimon-spark/paimon-spark-4.1/pom.xml new file mode 100644 index 000000000000..74a30570cc5b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/pom.xml @@ -0,0 +1,168 @@ + + + + 4.0.0 + + + org.apache.paimon + paimon-spark + 1.4-SNAPSHOT + + + paimon-spark-4.1_2.13 + Paimon : Spark : 4.1 : 2.13 + + + 4.1.1 + + + + + org.apache.paimon + paimon-format + + + + org.apache.paimon + paimon-spark4-common_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-sql-api_${scala.binary.version} + + + + + + org.apache.paimon + paimon-spark-common_${scala.binary.version} + ${project.version} + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + + + + + + org.apache.paimon + paimon-spark-ut_${scala.binary.version} + ${project.version} + tests + test + + + * + * + + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + tests + test + + + org.apache.spark + spark-connect-shims_${scala.binary.version} + + + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + tests + test + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + tests + test + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + shade-paimon + package + + shade + + + + + * + + com/github/luben/zstd/** + **/*libzstd-jni-*.so + **/*libzstd-jni-*.dll + + + + + + org.apache.paimon:paimon-spark4-common_${scala.binary.version} + + + + + + + + + \ No newline at end of file diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala 
b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala new file mode 100644 index 000000000000..e86195f1af0b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.catalyst.optimizer + +import org.apache.paimon.spark.PaimonScan + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, ExprId, ScalarSubquery, SortOrder} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation + +object MergePaimonScalarSubqueries extends MergePaimonScalarSubqueriesBase { + + override def tryMergeDataSourceV2ScanRelation( + newV2ScanRelation: DataSourceV2ScanRelation, + cachedV2ScanRelation: DataSourceV2ScanRelation) + : Option[(LogicalPlan, AttributeMap[Attribute])] = { + (newV2ScanRelation, cachedV2ScanRelation) match { + case ( + DataSourceV2ScanRelation( + newRelation, + newScan: PaimonScan, + newOutput, + newPartitioning, + newOrdering), + DataSourceV2ScanRelation( + cachedRelation, + cachedScan: PaimonScan, + _, + cachedPartitioning, + cacheOrdering)) => + checkIdenticalPlans(newRelation, cachedRelation).flatMap { + outputMap => + if ( + samePartitioning(newPartitioning, cachedPartitioning, outputMap) && sameOrdering( + newOrdering, + cacheOrdering, + outputMap) + ) { + mergePaimonScan(newScan, cachedScan).map { + mergedScan => + val mergedAttributes = mergedScan + .readSchema() + .map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) + val cachedOutputNameMap = cachedRelation.output.map(a => a.name -> a).toMap + val mergedOutput = + mergedAttributes.map(a => cachedOutputNameMap.getOrElse(a.name, a)) + val newV2ScanRelation = + cachedV2ScanRelation.copy(scan = mergedScan, output = mergedOutput) + + val mergedOutputNameMap = mergedOutput.map(a => a.name -> a).toMap + val newOutputMap = + AttributeMap(newOutput.map(a => a -> mergedOutputNameMap(a.name).toAttribute)) + + newV2ScanRelation -> newOutputMap + } + } else { + None + } + } + + case _ => None + } + } + + private def sameOrdering( + newOrdering: Option[Seq[SortOrder]], + cachedOrdering: 
Option[Seq[SortOrder]], + outputAttrMap: AttributeMap[Attribute]): Boolean = { + val mappedNewOrdering = newOrdering.map(_.map(mapAttributes(_, outputAttrMap))) + mappedNewOrdering.map(_.map(_.canonicalized)) == cachedOrdering.map(_.map(_.canonicalized)) + } + + override protected def createScalarSubquery(plan: LogicalPlan, exprId: ExprId): ScalarSubquery = { + ScalarSubquery(plan, exprId = exprId) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala new file mode 100644 index 000000000000..9fb3a7b54a25 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.CatalogUtils +import org.apache.spark.sql.catalyst.plans.logical.TableSpec +import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH + +trait PaimonStrategyHelper { + + def spark: SparkSession + + protected def makeQualifiedDBObjectPath(location: String): String = { + CatalogUtils.makeQualifiedDBObjectPath( + spark.sharedState.conf.get(WAREHOUSE_PATH), + location, + spark.sharedState.hadoopConf) + } + + protected def qualifyLocInTableSpec(tableSpec: TableSpec): TableSpec = { + tableSpec.copy(location = tableSpec.location.map(makeQualifiedDBObjectPath(_))) + } + +} diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala new file mode 100644 index 000000000000..61e25b7c16a9 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.shim + +import org.apache.paimon.CoreOptions +import org.apache.paimon.iceberg.IcebergOptions +import org.apache.paimon.spark.SparkCatalog +import org.apache.paimon.spark.catalog.FormatTableCatalog + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.ResolvedIdentifier +import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, LogicalPlan, TableSpec} +import org.apache.spark.sql.connector.catalog.StagingTableCatalog +import org.apache.spark.sql.execution.{PaimonStrategyHelper, SparkPlan, SparkStrategy} +import org.apache.spark.sql.execution.datasources.v2.CreateTableAsSelectExec + +import scala.collection.JavaConverters._ + +case class PaimonCreateTableAsSelectStrategy(spark: SparkSession) + extends SparkStrategy + with PaimonStrategyHelper { + + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case CreateTableAsSelect( + ResolvedIdentifier(catalog: SparkCatalog, ident), + parts, + query, + tableSpec: TableSpec, + options, + ifNotExists, + true) => + catalog match { + case _: StagingTableCatalog => + throw new RuntimeException("Paimon can't extend StagingTableCatalog for now.") + case _ => + val coreOptionKeys = CoreOptions.getOptions.asScala.map(_.key()).toSeq + + // Include Iceberg compatibility options in table properties (fix for DataFrame writer options) + val icebergOptionKeys = IcebergOptions.getOptions.asScala.map(_.key()).toSeq + + val allTableOptionKeys = coreOptionKeys ++ icebergOptionKeys + + val (tableOptions, writeOptions) = options.partition { + case (key, _) => allTableOptionKeys.contains(key) + } + val newTableSpec = tableSpec.copy(properties = tableSpec.properties ++ tableOptions) + + val isPartitionedFormatTable = { + catalog match { + case catalog: FormatTableCatalog => + catalog.isFormatTable(newTableSpec.provider.orNull) && parts.nonEmpty + case _ => 
false + } + } + + if (isPartitionedFormatTable) { + throw new UnsupportedOperationException( + "Using CTAS with partitioned format table is not supported yet.") + } + + CreateTableAsSelectExec( + catalog.asTableCatalog, + ident, + parts, + query, + qualifyLocInTableSpec(newTableSpec), + writeOptions, + ifNotExists) :: Nil + } + case _ => Nil + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar b/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar new file mode 100644 index 000000000000..a5bfa456f668 Binary files /dev/null and b/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar differ diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml b/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml new file mode 100644 index 000000000000..bdf2bb090760 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml @@ -0,0 +1,56 @@ + + + + + hive.metastore.integral.jdo.pushdown + true + + + + hive.metastore.schema.verification + false + + + + hive.metastore.client.capability.check + false + + + + datanucleus.schema.autoCreateTables + true + + + + datanucleus.schema.autoCreateAll + true + + + + + datanucleus.connectionPoolingType + DBCP + + + + hive.metastore.uris + thrift://localhost:9090 + Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. + + \ No newline at end of file diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties b/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties new file mode 100644 index 000000000000..6f324f5863ac --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties @@ -0,0 +1,38 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# Set root logger level to OFF to not flood build logs +# set manually to INFO for debugging purposes +rootLogger.level = OFF +rootLogger.appenderRef.test.ref = TestLogger + +appender.testlogger.name = TestLogger +appender.testlogger.type = CONSOLE +appender.testlogger.target = SYSTEM_ERR +appender.testlogger.layout.type = PatternLayout +appender.testlogger.layout.pattern = %-4r [%tid %t] %-5p %c %x - %m%n + +logger.kafka.name = kafka +logger.kafka.level = OFF +logger.kafka2.name = state.change +logger.kafka2.level = OFF + +logger.zookeeper.name = org.apache.zookeeper +logger.zookeeper.level = OFF +logger.I0Itec.name = org.I0Itec +logger.I0Itec.level = OFF diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala new file mode 100644 index 000000000000..9b9393be7118 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class PaimonCDCSourceTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon CDC Source: batch write and streaming read change-log with default scan mode") { + withTempDir { + checkpointDir => + val tableName = "T" + spark.sql(s"DROP TABLE IF EXISTS $tableName") + spark.sql(s""" + |CREATE TABLE $tableName (a INT, b STRING) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='2', + | 'changelog-producer' = 'lookup') + |""".stripMargin) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2_new')") + + val table = loadTable(tableName) + val location = table.location().toString + + val readStream = spark.readStream + .format("paimon") + .option("read.changelog", "true") + .load(location) + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + 
readStream.processAllAvailable() + val expertResult1 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2_new") :: Nil + checkAnswer(currentResult(), expertResult1) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1_new'), (3, 'v_3')") + readStream.processAllAvailable() + val expertResult2 = + Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row( + "+I", + 2, + "v_2_new") :: Row("+I", 3, "v_3") :: Nil + checkAnswer(currentResult(), expertResult2) + } finally { + readStream.stop() + } + } + } + + test("Paimon CDC Source: batch write and streaming read change-log with scan.snapshot-id") { + withTempDir { + checkpointDir => + val tableName = "T" + spark.sql(s"DROP TABLE IF EXISTS $tableName") + spark.sql(s""" + |CREATE TABLE $tableName (a INT, b STRING) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='2', + | 'changelog-producer' = 'lookup') + |""".stripMargin) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2_new')") + + val table = loadTable(tableName) + val location = table.location().toString + + val readStream = spark.readStream + .format("paimon") + .option("read.changelog", "true") + .option("scan.mode", "from-snapshot") + .option("scan.snapshot-id", 1) + .load(location) + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + readStream.processAllAvailable() + val expertResult1 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Row( + "-U", + 2, + "v_2") :: Row("+U", 2, "v_2_new") :: Nil + checkAnswer(currentResult(), expertResult1) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1_new'), (3, 'v_3')") + readStream.processAllAvailable() + val expertResult2 = + Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row( + 
"+I", + 2, + "v_2") :: Row("-U", 2, "v_2") :: Row("+U", 2, "v_2_new") :: Row("+I", 3, "v_3") :: Nil + checkAnswer(currentResult(), expertResult2) + } finally { + readStream.stop() + } + } + } + + test("Paimon CDC Source: streaming write and streaming read change-log") { + withTempDirs { + (checkpointDir1, checkpointDir2) => + val tableName = "T" + spark.sql(s"DROP TABLE IF EXISTS $tableName") + spark.sql(s""" + |CREATE TABLE $tableName (a INT, b STRING) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='2', + | 'changelog-producer' = 'lookup') + |""".stripMargin) + + val table = loadTable(tableName) + val location = table.location().toString + + // streaming write + val inputData = MemoryStream[(Int, String)] + val writeStream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir1.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + // streaming read + val readStream = spark.readStream + .format("paimon") + .option("read.changelog", "true") + .option("scan.mode", "from-snapshot") + .option("scan.snapshot-id", 1) + .load(location) + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir2.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + inputData.addData((1, "v_1")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult1 = Row("+I", 1, "v_1") :: Nil + checkAnswer(currentResult(), expertResult1) + + inputData.addData((2, "v_2")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult2 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Nil + checkAnswer(currentResult(), expertResult2) + + inputData.addData((2, "v_2_new")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult3 = Row("+I", 1, 
"v_1") :: Row("+I", 2, "v_2") :: Row( + "-U", + 2, + "v_2") :: Row("+U", 2, "v_2_new") :: Nil + checkAnswer(currentResult(), expertResult3) + + inputData.addData((1, "v_1_new"), (3, "v_3")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult4 = + Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row( + "+I", + 2, + "v_2") :: Row("-U", 2, "v_2") :: Row("+U", 2, "v_2_new") :: Row("+I", 3, "v_3") :: Nil + checkAnswer(currentResult(), expertResult4) + } finally { + readStream.stop() + } + } + } + + test("Paimon CDC Source: streaming read change-log with audit_log system table") { + withTable("T") { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a','bucket'='2', 'changelog-producer' = 'lookup') + |""".stripMargin) + + val readStream = spark.readStream + .format("paimon") + .table("`T$audit_log`") + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + spark.sql(s"INSERT INTO T VALUES (1, 'v_1')") + readStream.processAllAvailable() + checkAnswer(currentResult(), Row("+I", 1, "v_1") :: Nil) + + spark.sql(s"INSERT INTO T VALUES (2, 'v_2')") + readStream.processAllAvailable() + checkAnswer(currentResult(), Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Nil) + } finally { + readStream.stop() + } + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala new file mode 100644 index 000000000000..9935288db9a7 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark + +import org.apache.paimon.Snapshot.CommitKind._ + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions.{col, mean, window} +import org.apache.spark.sql.streaming.StreamTest + +import java.sql.Date + +class PaimonSinkTest extends PaimonSparkTestBase with StreamTest { + + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.sql.catalog.paimon.cache-enabled", "false") + } + + import testImplicits._ + + test("Paimon Sink: forEachBatch") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], id: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query 
= () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: append mode") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and sink into it in append mode + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: complete mode") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define an append-only table and sink into it in complete mode + spark.sql(s""" + |CREATE TABLE T (city String, population Long) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData.toDS + .toDF("uid", "city") + .groupBy("city") + .count() + .toDF("city", "population") + .writeStream + .outputMode("complete") + 
.option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY city") + + try { + inputData.addData((1, "HZ"), (2, "BJ"), (3, "BJ")) + stream.processAllAvailable() + checkAnswer(query(), Row("BJ", 2L) :: Row("HZ", 1L) :: Nil) + + inputData.addData((4, "SH"), (5, "BJ"), (6, "HZ")) + stream.processAllAvailable() + checkAnswer(query(), Row("BJ", 3L) :: Row("HZ", 2L) :: Row("SH", 1L) :: Nil) + + inputData.addData((7, "HZ"), (8, "SH")) + stream.processAllAvailable() + checkAnswer(query(), Row("BJ", 3L) :: Row("HZ", 3L) :: Row("SH", 2L) :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: update mode") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and sink into it in update mode + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + intercept[RuntimeException] { + inputData + .toDF() + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .outputMode("update") + .format("paimon") + .start(location) + } + } + } + } + + test("Paimon Sink: aggregation and watermark") { + withTempDir { + checkpointDir => + // define an append-only table and sink into it with aggregation and watermark in append mode + spark.sql(s""" + |CREATE TABLE T (start Timestamp, stockId INT, avg_price DOUBLE) + |TBLPROPERTIES ('bucket'='3', 'bucket-key'='stockId') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Long, Int, Double)] + val data = inputData.toDS + .toDF("time", "stockId", "price") + .selectExpr("CAST(time AS timestamp) AS timestamp", "stockId", "price") + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds"), col("stockId")) + 
.agg(mean("price").as("avg_price")) + .select("window.start", "stockId", "avg_price") + + val stream = + data.writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => + spark.sql( + "SELECT CAST(start as BIGINT) AS start, stockId, avg_price FROM T ORDER BY start, stockId") + + try { + inputData.addData((101L, 1, 1.0d), (102, 1, 2.0d), (104, 2, 20.0d)) + stream.processAllAvailable() + inputData.addData((105L, 2, 40.0d), (107, 2, 60.0d), (115, 3, 300.0d)) + stream.processAllAvailable() + inputData.addData((200L, 99, 99.9d)) + stream.processAllAvailable() + checkAnswer( + query(), + Row(100L, 1, 1.5d) :: Row(100L, 2, 20.0d) :: Row(105L, 2, 50.0d) :: Row( + 115L, + 3, + 300.0d) :: Nil) + } finally { + if (stream != null) { + stream.stop() + } + } + } + } + + test("Paimon Sink: enable schema evolution") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and sink into it with schema evolution in append mode + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val date = Date.valueOf("2023-08-10") + spark.sql("INSERT INTO T VALUES (1, '2023-08-09'), (2, '2023-08-09')") + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "2023-08-09") :: Row(2, "2023-08-09") :: Nil) + + val inputData = MemoryStream[(Long, Date, Int)] + val stream = inputData + .toDS() + .toDF("a", "b", "c") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .option("write.merge-schema", "true") + .option("write.merge-schema.explicit-cast", "true") + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1L, date, 123), (3L, date, 456)) + stream.processAllAvailable() + + checkAnswer( + query(), + Row(1L, date, 123) :: Row(2L, 
Date.valueOf("2023-08-09"), null) :: Row( + 3L, + date, + 456) :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon SinK: set full-compaction.delta-commits with batch write") { + for (useV2Write <- Seq("true", "false")) { + withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) { + withTable("t") { + sql(""" + |CREATE TABLE t ( + | a INT, + | b INT + |) TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='1', + | 'full-compaction.delta-commits'='1' + |) + |""".stripMargin) + + sql("INSERT INTO t VALUES (1, 1)") + sql("INSERT INTO t VALUES (2, 2)") + checkAnswer(sql("SELECT * FROM t ORDER BY a"), Seq(Row(1, 1), Row(2, 2))) + assert(loadTable("t").snapshotManager().latestSnapshot().commitKind == COMPACT) + } + } + } + } + + test("Paimon SinK: set full-compaction.delta-commits with streaming write") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b INT) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='1', + | 'full-compaction.delta-commits'='2' + |) + |""".stripMargin) + val table = loadTable("T") + val location = table.location().toString + + val inputData = MemoryStream[(Int, Int)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1, 1)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1))) + assert(table.snapshotManager().latestSnapshot().commitKind == APPEND) + + inputData.addData((2, 1)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1), Row(2, 1))) + assert(table.snapshotManager().latestSnapshot().commitKind == COMPACT) + + inputData.addData((2, 2)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1), Row(2, 2))) + assert(table.snapshotManager().latestSnapshot().commitKind == APPEND) + + 
inputData.addData((3, 1)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1), Row(2, 2), Row(3, 1))) + assert(table.snapshotManager().latestSnapshot().commitKind == COMPACT) + } finally { + stream.stop() + } + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala new file mode 100644 index 000000000000..3208609835f1 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark + +import org.apache.paimon.catalog.{Catalog, Identifier} +import org.apache.paimon.data.GenericRow +import org.apache.paimon.fs.FileIO +import org.apache.paimon.fs.local.LocalFileIO +import org.apache.paimon.spark.catalog.WithPaimonCatalog +import org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions +import org.apache.paimon.spark.sql.{SparkVersionSupport, WithTableOptions} +import org.apache.paimon.table.FileStoreTable + +import org.apache.spark.SparkConf +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.connector.catalog.{Identifier => SparkIdentifier} +import org.apache.spark.sql.connector.read.Scan +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.paimon.Utils +import org.apache.spark.sql.test.SharedSparkSession + +import java.io.File +import java.util.{TimeZone, UUID} + +import scala.util.Random + +class PaimonSparkTestBase + extends QueryTest + with SharedSparkSession + with WithTableOptions + with SparkVersionSupport { + + protected lazy val commitUser: String = UUID.randomUUID.toString + + protected lazy val fileIO: FileIO = LocalFileIO.create + + protected lazy val tempDBDir: File = Utils.createTempDir + + protected def paimonCatalog: Catalog = { + spark.sessionState.catalogManager.currentCatalog.asInstanceOf[WithPaimonCatalog].paimonCatalog() + } + + protected val dbName0: String = "test" + + protected val tableName0: String = "T" + + /** Add paimon ([[SparkCatalog]] in fileSystem) catalog */ + override protected def sparkConf: SparkConf = { + val serializer = if (Random.nextBoolean()) { + "org.apache.spark.serializer.KryoSerializer" + } else { + "org.apache.spark.serializer.JavaSerializer" + } + super.sparkConf + .set("spark.sql.warehouse.dir", tempDBDir.getCanonicalPath) + .set("spark.sql.catalog.paimon", classOf[SparkCatalog].getName) + 
.set("spark.sql.catalog.paimon.warehouse", tempDBDir.getCanonicalPath) + .set("spark.sql.extensions", classOf[PaimonSparkSessionExtensions].getName) + .set("spark.serializer", serializer) + } + + override protected def beforeAll(): Unit = { + super.beforeAll() + spark.sql(s"USE paimon") + spark.sql(s"CREATE DATABASE IF NOT EXISTS paimon.$dbName0") + spark.sql(s"USE paimon.$dbName0") + } + + override protected def afterAll(): Unit = { + try { + spark.sql(s"USE paimon") + spark.sql(s"DROP TABLE IF EXISTS $dbName0.$tableName0") + spark.sql("USE default") + spark.sql(s"DROP DATABASE paimon.$dbName0 CASCADE") + } finally { + super.afterAll() + } + } + + /** Default is paimon catalog */ + override protected def beforeEach(): Unit = { + super.beforeAll() + spark.sql(s"USE paimon") + spark.sql(s"USE paimon.$dbName0") + spark.sql(s"DROP TABLE IF EXISTS $tableName0") + } + + protected def withTempDirs(f: (File, File) => Unit): Unit = { + withTempDir(file1 => withTempDir(file2 => f(file1, file2))) + } + + protected def withTimeZone(timeZone: String)(f: => Unit): Unit = { + withSparkSQLConf("spark.sql.session.timeZone" -> timeZone) { + val originTimeZone = TimeZone.getDefault + try { + TimeZone.setDefault(TimeZone.getTimeZone(timeZone)) + f + } finally { + TimeZone.setDefault(originTimeZone) + } + } + } + + // Since SPARK-46227 has changed the definition of withSQLConf that resulted in + // incompatibility between the Spark3.x and Spark4.x, So Paimon declare a separate method + // to provide the same function. 
+ protected def withSparkSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + withSparkSQLConf0(pairs: _*)(f) + } + + private def withSparkSQLConf0(pairs: (String, String)*)(f: => Unit): Unit = { + val conf = SQLConf.get + val (keys, values) = pairs.unzip + val currentValues = keys.map { + key => + if (conf.contains(key)) { + Some(conf.getConfString(key)) + } else { + None + } + } + (keys, values).zipped.foreach { + (k, v) => + if (SQLConf.isStaticConfigKey(k)) { + throw new RuntimeException(s"Cannot modify the value of a static config: $k") + } + conf.setConfString(k, v) + } + try f + finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => conf.setConfString(key, value) + case (key, None) => conf.unsetConf(key) + } + } + } + + def loadTable(tableName: String): FileStoreTable = { + loadTable(dbName0, tableName) + } + + def loadTable(dbName: String, tableName: String): FileStoreTable = { + paimonCatalog.getTable(Identifier.create(dbName, tableName)).asInstanceOf[FileStoreTable] + } + + protected def createRelationV2(tableName: String): DataSourceV2Relation = { + val sparkTable = SparkTable(loadTable(tableName)) + DataSourceV2Relation.create( + sparkTable, + Some(spark.sessionState.catalogManager.currentCatalog), + Some(SparkIdentifier.of(Array(this.dbName0), tableName)) + ) + } + + def getScan(sqlText: String): Scan = { + sql(sqlText).queryExecution.optimizedPlan + .collectFirst { case relation: DataSourceV2ScanRelation => relation } + .get + .scan + } + + protected def getPaimonScan(sqlText: String): PaimonScan = { + getScan(sqlText).asInstanceOf[PaimonScan] + } + + protected def getFormatTableScan(sqlText: String): PaimonFormatTableScan = { + getScan(sqlText).asInstanceOf[PaimonFormatTableScan] + } + + object GenericRow { + def of(values: Any*): GenericRow = { + val row = new GenericRow(values.length) + values.zipWithIndex.foreach { + case (value, index) => + row.setField(index, value) + } + row + } + } +} diff --git 
a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala new file mode 100644 index 000000000000..df1df747897d --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class AlterBranchProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + test("Paimon Procedure: alter schema structure and test $branch syntax.") { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + val table = loadTable("T") + val branchManager = table.branchManager() + + // create branch with tag + checkAnswer( + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 's_2', snapshot => 2)"), + Row(true) :: Nil) + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.T', branch => 'snapshot_branch', tag => 's_2')"), + Row(true) :: Nil) + assert(branchManager.branchExists("snapshot_branch")) + + spark.sql("INSERT INTO T VALUES (1, 'APPLE'), (2,'DOG'), (2, 
'horse')") + spark.sql("ALTER TABLE `T$branch_snapshot_branch` ADD COLUMNS(c INT)") + spark.sql( + "INSERT INTO `T$branch_snapshot_branch` VALUES " + "(1,'cherry', 100), (2,'bird', 200), (3, 'wolf', 400)") + + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "APPLE") :: Row(2, "horse") :: Nil) + checkAnswer( + spark.sql("SELECT * FROM `T$branch_snapshot_branch` ORDER BY a, b,c"), + Row(1, "cherry", 100) :: Row(2, "bird", 200) :: Row(3, "wolf", 400) :: Nil) + assert(branchManager.branchExists("snapshot_branch")) + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala new file mode 100644 index 000000000000..111e604b1ef0 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class BranchProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + test("Paimon Procedure: create, query, write and delete branch") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // create tags + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_tag', snapshot => 2)"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag") :: Nil) + + // create branch with tag + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.T', branch => 'test_branch', tag => 'test_tag')"), + Row(true) :: Nil) + val table = loadTable("T") + val branchManager 
= table.branchManager() + assert(branchManager.branchExists("test_branch")) + + // query from branch + checkAnswer( + spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a"), + Row(1, "a") :: Row(2, "b") :: Nil + ) + checkAnswer( + spark.read.format("paimon").option("branch", "test_branch").table("T").orderBy("a"), + Row(1, "a") :: Row(2, "b") :: Nil + ) + + // update branch + spark.sql("INSERT INTO `T$branch_test_branch` VALUES (3, 'c')") + checkAnswer( + spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a"), + Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil + ) + // create tags + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(table => 'test.`T$branch_test_branch`', tag => 'test_tag2', snapshot => 3)"), + Row(true) :: Nil) + + // create branch from another branch. + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.`T$branch_test_branch`', branch => 'test_branch2', tag => 'test_tag2')"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT * FROM `T$branch_test_branch2` ORDER BY a"), + Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil + ) + + // create empty branch + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.T', branch => 'empty_branch')"), + Row(true) :: Nil) + assert(branchManager.branchExists("empty_branch")) + checkAnswer( + spark.sql("SELECT * FROM `T$branch_empty_branch` ORDER BY a"), + Nil + ) + + // delete branch + checkAnswer( + spark.sql( + "CALL paimon.sys.delete_branch(table => 'test.T', branch => 'test_branch')"), + Row(true) :: Nil) + assert(!branchManager.branchExists("test_branch")) + intercept[Exception] { + spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a") + } + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Branch: read with scan.fallback-branch") { + withTable("T") { + sql(""" + |CREATE TABLE T ( + | dt STRING NOT NULL, + | name STRING NOT NULL, + | amount BIGINT + |) PARTITIONED BY (dt) + |""".stripMargin) + + sql("ALTER TABLE T SET 
TBLPROPERTIES ('k1' = 'v1')") + sql("ALTER TABLE T SET TBLPROPERTIES ('k2' = 'v2')") + + sql("CALL sys.create_branch('test.T', 'test')") + sql("ALTER TABLE T SET TBLPROPERTIES ('scan.fallback-branch' = 'test')") + + sql( + "INSERT INTO `T$branch_test` VALUES ('20240725', 'apple', 4), ('20240725', 'peach', 10), ('20240726', 'cherry', 3), ('20240726', 'pear', 6)") + sql("INSERT INTO T VALUES ('20240725', 'apple', 5), ('20240725', 'banana', 7)") + + checkAnswer( + sql("SELECT * FROM T ORDER BY amount"), + Seq( + Row("20240726", "cherry", 3), + Row("20240725", "apple", 5), + Row("20240726", "pear", 6), + Row("20240725", "banana", 7)) + ) + + sql("ALTER TABLE T UNSET TBLPROPERTIES ('scan.fallback-branch')") + checkAnswer( + sql("SELECT * FROM T ORDER BY amount"), + Seq(Row("20240725", "apple", 5), Row("20240725", "banana", 7))) + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala new file mode 100644 index 000000000000..322d50a62127 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +class CompactProcedureTest extends CompactProcedureTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala new file mode 100644 index 000000000000..19f6bc25280e --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala @@ -0,0 +1,1324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.Snapshot.CommitKind +import org.apache.paimon.fs.Path +import org.apache.paimon.spark.PaimonSparkTestBase +import org.apache.paimon.spark.utils.SparkProcedureUtils +import org.apache.paimon.table.FileStoreTable +import org.apache.paimon.table.source.DataSplit + +import org.apache.spark.scheduler.{SparkListener, SparkListenerStageSubmitted} +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.assertj.core.api.Assertions +import org.scalatest.time.Span + +import java.util + +import scala.collection.JavaConverters._ +import scala.util.Random + +/** Test compact procedure. See [[CompactProcedure]]. */ +abstract class CompactProcedureTestBase extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + // ----------------------- Minor Compact ----------------------- + + test("Paimon Procedure: compact aware bucket pk table with minor compact strategy") { + withTable("T") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.APPEND)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(2) + + spark.sql( + "CALL sys.compact(table => 'T', compact_strategy => 'minor'," + + "options => 'num-sorted-run.compaction-trigger=3')") + + // Due to the limitation of parameter 'num-sorted-run.compaction-trigger' = 3, so compact is not + // performed. 
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.APPEND)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(2) + + // Make par-p1 has 3 datafile and par-p2 has 2 datafile, so par-p2 will not be picked out to + // compact. + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1')") + + spark.sql( + "CALL sys.compact(table => 'T', compact_strategy => 'minor'," + + "options => 'num-sorted-run.compaction-trigger=3')") + + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + + val splits = table.newSnapshotReader.read.dataSplits + splits.forEach( + split => { + Assertions + .assertThat(split.dataFiles.size) + .isEqualTo(if (split.partition().getString(0).toString == "p2") 2 else 1) + }) + } + } + + // ----------------------- Sort Compact ----------------------- + + test("Paimon Procedure: sort compact") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b INT) + |TBLPROPERTIES ('bucket'='-1') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // test zorder sort + inputData.addData((0, 0)) + inputData.addData((0, 1)) + inputData.addData((0, 2)) + inputData.addData((1, 0)) + inputData.addData((1, 1)) + inputData.addData((1, 2)) + inputData.addData((2, 0)) + inputData.addData((2, 1)) + inputData.addData((2, 2)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b)) + } + } + 
Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', order_strategy => 'zorder', order_by => 'a,b')"), + Row(true) :: Nil) + + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0)) + result2.add(1, Row(0, 1)) + result2.add(2, Row(1, 0)) + result2.add(3, Row(1, 1)) + result2.add(4, Row(0, 2)) + result2.add(5, Row(1, 2)) + result2.add(6, Row(2, 0)) + result2.add(7, Row(2, 1)) + result2.add(8, Row(2, 2)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + // test hilbert sort + val result3 = new util.ArrayList[Row]() + result3.add(0, Row(0, 0)) + result3.add(1, Row(0, 1)) + result3.add(2, Row(1, 1)) + result3.add(3, Row(1, 0)) + result3.add(4, Row(2, 0)) + result3.add(5, Row(2, 1)) + result3.add(6, Row(2, 2)) + result3.add(7, Row(1, 2)) + result3.add(8, Row(0, 2)) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', order_strategy => 'hilbert', order_by => 'a,b')"), + Row(true) :: Nil) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // test order sort + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', order_strategy => 'order', order_by => 'a,b')"), + Row(true) :: Nil) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: sort compact with partition") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (p INT, a INT, b INT) + |TBLPROPERTIES ('bucket'='-1') + |PARTITIONED BY (p) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, Int)] + val stream = inputData + .toDS() + .toDF("p", "a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + 
batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query0 = () => spark.sql("SELECT * FROM T WHERE p=0") + val query1 = () => spark.sql("SELECT * FROM T WHERE p=1") + + try { + // test zorder sort + inputData.addData((0, 0, 0)) + inputData.addData((0, 0, 1)) + inputData.addData((0, 0, 2)) + inputData.addData((0, 1, 0)) + inputData.addData((0, 1, 1)) + inputData.addData((0, 1, 2)) + inputData.addData((0, 2, 0)) + inputData.addData((0, 2, 1)) + inputData.addData((0, 2, 2)) + + inputData.addData((1, 0, 0)) + inputData.addData((1, 0, 1)) + inputData.addData((1, 0, 2)) + inputData.addData((1, 1, 0)) + inputData.addData((1, 1, 1)) + inputData.addData((1, 1, 2)) + inputData.addData((1, 2, 0)) + inputData.addData((1, 2, 1)) + inputData.addData((1, 2, 2)) + stream.processAllAvailable() + + val result0 = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result0.add(Row(0, a, b)) + } + } + val result1 = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result1.add(Row(1, a, b)) + } + } + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result0) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'zorder', order_by => 'a,b')"), + Row(true) :: Nil) + + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0, 0)) + result2.add(1, Row(0, 0, 1)) + result2.add(2, Row(0, 1, 0)) + result2.add(3, Row(0, 1, 1)) + result2.add(4, Row(0, 0, 2)) + result2.add(5, Row(0, 1, 2)) + result2.add(6, Row(0, 2, 0)) + result2.add(7, Row(0, 2, 1)) + result2.add(8, Row(0, 2, 2)) + + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result2) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + + // test hilbert sort + val result3 = new util.ArrayList[Row]() + result3.add(0, Row(0, 0, 0)) + result3.add(1, 
Row(0, 0, 1)) + result3.add(2, Row(0, 1, 1)) + result3.add(3, Row(0, 1, 0)) + result3.add(4, Row(0, 2, 0)) + result3.add(5, Row(0, 2, 1)) + result3.add(6, Row(0, 2, 2)) + result3.add(7, Row(0, 1, 2)) + result3.add(8, Row(0, 0, 2)) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'hilbert', order_by => 'a,b')"), + Row(true) :: Nil) + + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result3) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + + // test order sort + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'order', order_by => 'a,b')"), + Row(true) :: Nil) + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result0) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: sort compact with multi-partitions") { + Seq("order", "zorder").foreach { + orderStrategy => + { + withTable("T") { + spark.sql(s""" + |CREATE TABLE T (id INT, pt STRING) + |PARTITIONED BY (pt) + |""".stripMargin) + + spark.sql(s"""INSERT INTO T VALUES + |(1, 'p1'), (3, 'p1'), + |(1, 'p2'), (4, 'p2'), + |(3, 'p3'), (2, 'p3'), + |(1, 'p4'), (2, 'p4') + |""".stripMargin) + + spark.sql(s"""INSERT INTO T VALUES + |(4, 'p1'), (2, 'p1'), + |(2, 'p2'), (3, 'p2'), + |(1, 'p3'), (4, 'p3'), + |(3, 'p4'), (4, 'p4') + |""".stripMargin) + + checkAnswer( + spark.sql( + s"CALL sys.compact(table => 'T', order_strategy => '$orderStrategy', order_by => 'id')"), + Seq(true).toDF()) + + val result = List(Row(1), Row(2), Row(3), Row(4)).asJava + Seq("p1", "p2", "p3", "p4").foreach { + pt => + Assertions + .assertThat(spark.sql(s"SELECT id FROM T WHERE pt='$pt'").collect()) + .containsExactlyElementsOf(result) + } + } + } + } + } + + test("Paimon Procedure: sort compact with partition filter") { + withTable("t") { + sql("CREATE TABLE t 
(a INT, pt INT) PARTITIONED BY (pt)") + sql("INSERT INTO t VALUES (1, 1)") + sql("INSERT INTO t VALUES (2, 1)") + sql( + "CALL sys.compact(table => 't', order_strategy => 'order', where => 'pt = 1', order_by => 'a')") + val table = loadTable("t") + assert(table.latestSnapshot().get().commitKind.equals(CommitKind.OVERWRITE)) + checkAnswer(sql("SELECT * FROM t ORDER BY a"), Seq(Row(1, 1), Row(2, 1))) + } + } + + test("Paimon Procedure: compact for pk") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b INT) + |TBLPROPERTIES ('primary-key'='a,b', 'bucket'='1') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + inputData.addData((0, 0)) + inputData.addData((0, 1)) + inputData.addData((0, 2)) + inputData.addData((1, 0)) + inputData.addData((1, 1)) + inputData.addData((1, 2)) + inputData.addData((2, 0)) + inputData.addData((2, 1)) + inputData.addData((2, 2)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b)) + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: compact aware bucket pk table") { + Seq(1, -1).foreach( + bucket => { + withTable("T") { + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='id, pt', 
'bucket'='$bucket', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + + spark.sql("CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(3) + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + // compact condition no longer met + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + checkAnswer( + spark.sql(s"SELECT * FROM T ORDER BY id"), + Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") :: Nil) + } + }) + } + + test("Paimon Procedure: compact aware bucket pk table with many small files") { + Seq(3, -1).foreach( + bucket => { + withTable("T") { + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='$bucket', 'write-only'='true', + |'source.split.target-size'='128m','source.split.open-file-cost'='32m') -- simulate multiple splits in a single bucket + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + val count = 100 + for (i <- 0 until count) { + spark.sql(s"INSERT INTO T VALUES ($i, 'a', 'p${i % 2}')") + } + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + checkAnswer(spark.sql(s"SELECT COUNT(*) FROM T"), Row(count) :: Nil) + } + }) + } + + test("Paimon Procedure: compact unaware bucket append table") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true', 
'compaction.min.file-num'='2') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + spark.sql(s"INSERT INTO T VALUES (5, 'e', 'p1'), (6, 'f', 'p2')") + + spark.sql("CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + // compact condition no longer met + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + checkAnswer( + spark.sql(s"SELECT * FROM T ORDER BY id"), + Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") :: Row( + 5, + "e", + "p1") :: Row(6, "f", "p2") :: Nil) + } + + test("Paimon Procedure: compact unaware bucket append table with many small files") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + val count = 100 + for (i <- 0 until count) { + spark.sql(s"INSERT INTO T VALUES ($i, 'a', 'p${i % 2}')") + } + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + checkAnswer(spark.sql(s"SELECT COUNT(*) FROM T"), Row(count) :: Nil) + } + + test("Paimon Procedure: compact with wrong usage") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + assert(intercept[IllegalArgumentException] { + spark.sql( + "CALL sys.compact(table 
=> 'T', partitions => 'pt = \"p1\"', where => 'pt = \"p1\"')") + }.getMessage.contains("partitions and where cannot be used together")) + + assert(intercept[IllegalArgumentException] { + spark.sql("CALL sys.compact(table => 'T', partitions => 'id = 1')") + }.getMessage.contains("Only partition predicate is supported")) + + assert(intercept[IllegalArgumentException] { + spark.sql("CALL sys.compact(table => 'T', where => 'id > 1 AND pt = \"p1\"')") + }.getMessage.contains("Only partition predicate is supported")) + + assert(intercept[IllegalArgumentException] { + spark.sql("CALL sys.compact(table => 'T', order_strategy => 'sort', order_by => 'pt')") + }.getMessage.contains("order_by should not contain partition cols")) + + assert(intercept[IllegalArgumentException] { + spark.sql( + "CALL sys.compact(table => 'T', order_strategy => 'sort', order_by => 'id', partition_idle_time =>'5s')") + }.getMessage.contains("sort compact do not support 'partition_idle_time'")) + } + + test("Paimon Procedure: compact with where") { + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT) + |TBLPROPERTIES ('bucket'='1', 'bucket-key'='id', 'write-only'='true', 'compaction.min.file-num'='1') + |PARTITIONED BY (dt, hh) + |""".stripMargin) + + val table = loadTable("T") + val fileIO = table.fileIO() + + spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)") + spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)") + + spark.sql("CALL sys.compact(table => 'T', where => 'dt = \"2024-01-01\" and hh >= 1')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-01/hh=0/bucket-0")).length) + .isEqualTo(2) + 
Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-01/hh=1/bucket-0")).length) + .isEqualTo(3) + Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-02/hh=0/bucket-0")).length) + .isEqualTo(2) + Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-02/hh=1/bucket-0")).length) + .isEqualTo(2) + } + + test("Paimon test: toWhere method in CompactProcedure") { + val conditions = "f0=0,f1=0,f2=0;f0=1,f1=1,f2=1;f0=1,f1=2,f2=2;f3=3" + + val where = SparkProcedureUtils.toWhere(conditions) + val whereExpected = + "(f0=0 AND f1=0 AND f2=0) OR (f0=1 AND f1=1 AND f2=1) OR (f0=1 AND f1=2 AND f2=2) OR (f3=3)" + + Assertions.assertThat(where).isEqualTo(whereExpected) + } + + test("Paimon Procedure: compact unaware bucket append table with option") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + spark.sql(s"INSERT INTO T VALUES (5, 'e', 'p1'), (6, 'f', 'p2')") + + spark.sql( + "CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"', options => 'compaction.min.file-num=2')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + spark.sql("CALL sys.compact(table => 'T', options => 'compaction.min.file-num=2')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + // compact condition no longer met + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + checkAnswer( + spark.sql(s"SELECT * FROM T ORDER BY id"), + Row(1, "a", "p1") :: Row(2, "b", "p2") :: 
Row(3, "c", "p1") :: Row(4, "d", "p2") :: + Row(5, "e", "p1") :: Row(6, "f", "p2") :: Nil) + } + + test("Paimon Procedure: compact with partition_idle_time for pk table") { + Seq(1, -1).foreach( + bucket => { + withTable("T") { + val dynamicBucketArgs = if (bucket == -1) " ,'dynamic-bucket.initial-buckets'='1'" else "" + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT) + |TBLPROPERTIES ('primary-key'='id, dt, hh', 'bucket'='$bucket', 'write-only'='true'$dynamicBucketArgs) + |PARTITIONED BY (dt, hh) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)") + spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)") + + Thread.sleep(10000); + spark.sql(s"INSERT INTO T VALUES (9, '9', '2024-01-01', 0), (10, '10', '2024-01-02', 0)") + + spark.sql("CALL sys.compact(table => 'T', partition_idle_time => '10s')") + val dataSplits = table.newSnapshotReader.read.dataSplits.asScala.toList + Assertions + .assertThat(dataSplits.size) + .isEqualTo(4) + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + for (dataSplit: DataSplit <- dataSplits) { + if (dataSplit.partition().getInt(1) == 0) { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(3) + } else { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(1) + } + } + } + }) + + } + + test("Paimon Procedure: compact with partition_idle_time for unaware bucket append table") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true', 'compaction.min.file-num'='2') + |PARTITIONED BY (dt, hh) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T 
VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)") + spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)") + + Thread.sleep(10000); + spark.sql(s"INSERT INTO T VALUES (9, '9', '2024-01-01', 0), (10, '10', '2024-01-02', 0)") + + spark.sql("CALL sys.compact(table => 'T', partition_idle_time => '10s')") + val dataSplits = table.newSnapshotReader.read.dataSplits.asScala.toList + Assertions + .assertThat(dataSplits.size) + .isEqualTo(4) + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + for (dataSplit: DataSplit <- dataSplits) { + if (dataSplit.partition().getInt(1) == 0) { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(3) + } else { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(1) + } + } + } + + test("Paimon Procedure: test aware-bucket compaction read parallelism") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING) + |TBLPROPERTIES ('primary-key'='id', 'bucket'='3', 'write-only'='true') + |""".stripMargin) + + val table = loadTable("T") + for (i <- 1 to 10) { + sql(s"INSERT INTO T VALUES ($i, '$i')") + } + assertResult(10)(table.snapshotManager().snapshotCount()) + + val buckets = table.newSnapshotReader().bucketEntries().asScala.map(_.bucket()).distinct.size + assertResult(3)(buckets) + + val taskBuffer = scala.collection.mutable.ListBuffer.empty[Int] + val listener = new SparkListener { + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + taskBuffer += stageSubmitted.stageInfo.numTasks + } + } + + try { + spark.sparkContext.addSparkListener(listener) + + // spark.default.parallelism cannot be change in spark session + // sparkParallelism is 2, bucket is 3, use 2 as the read parallelism + 
spark.conf.set("spark.sql.shuffle.partitions", 2) + spark.sql("CALL sys.compact(table => 'T')") + + // sparkParallelism is 5, bucket is 3, use 3 as the read parallelism + spark.conf.set("spark.sql.shuffle.partitions", 5) + spark.sql("CALL sys.compact(table => 'T')") + + assertResult(Seq(2, 3))(taskBuffer) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + + test("Paimon Procedure: test unaware-bucket compaction read parallelism") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |""".stripMargin) + + val table = loadTable("T") + for (i <- 1 to 12) { + sql(s"INSERT INTO T VALUES ($i, '$i')") + } + assertResult(12)(table.snapshotManager().snapshotCount()) + + val buckets = table.newSnapshotReader().bucketEntries().asScala.map(_.bucket()).distinct.size + // only has bucket-0 + assertResult(1)(buckets) + + val taskBuffer = scala.collection.mutable.ListBuffer.empty[Int] + val listener = new SparkListener { + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + taskBuffer += stageSubmitted.stageInfo.numTasks + } + } + + try { + spark.sparkContext.addSparkListener(listener) + + // spark.default.parallelism cannot be change in spark session + // sparkParallelism is 2, task groups is 6, use 2 as the read parallelism + spark.conf.set("spark.sql.shuffle.partitions", 2) + spark.sql( + "CALL sys.compact(table => 'T', options => 'source.split.open-file-cost=3200M, compaction.min.file-num=2')") + + // sparkParallelism is 5, task groups is 1, use 1 as the read parallelism + spark.conf.set("spark.sql.shuffle.partitions", 5) + spark.sql( + "CALL sys.compact(table => 'T', options => 'source.split.open-file-cost=3200M, compaction.min.file-num=2')") + + assertResult(Seq(2, 3))(taskBuffer) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + + test("Paimon Procedure: type cast in where") { + withTable("t") { + sql(""" + |CREATE TABLE t (id 
INT, value STRING, day_part LONG) + |TBLPROPERTIES ('compaction.min.file-num'='2') + |PARTITIONED BY (day_part) + |""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a', 20250810)") + sql("INSERT INTO t VALUES (2, 'b', 20250810)") + sql("INSERT INTO t VALUES (3, 'c', 20250811)") + + sql("CALL sys.compact(table => 't', where => 'day_part < 20250811 and day_part > 20250809')") + val table = loadTable("t") + assert(table.snapshotManager().latestSnapshot().commitKind().equals(CommitKind.COMPACT)) + } + } + + test("Paimon Procedure: cluster for unpartitioned table") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b INT, c STRING) + |TBLPROPERTIES ('bucket'='-1','num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b", "c") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + val random = new Random() + val randomStr = random.nextString(40) + // first write + inputData.addData((0, 0, randomStr)) + inputData.addData((0, 1, randomStr)) + inputData.addData((0, 2, randomStr)) + inputData.addData((1, 0, randomStr)) + inputData.addData((1, 1, randomStr)) + inputData.addData((1, 2, randomStr)) + inputData.addData((2, 0, randomStr)) + inputData.addData((2, 1, randomStr)) + inputData.addData((2, 2, randomStr)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b, randomStr)) + } + } + 
Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + // first cluster, the outputLevel should be 5 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + + // first cluster result + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0, randomStr)) + result2.add(1, Row(0, 1, randomStr)) + result2.add(2, Row(1, 0, randomStr)) + result2.add(3, Row(1, 1, randomStr)) + result2.add(4, Row(0, 2, randomStr)) + result2.add(5, Row(1, 2, randomStr)) + result2.add(6, Row(2, 0, randomStr)) + result2.add(7, Row(2, 1, randomStr)) + result2.add(8, Row(2, 2, randomStr)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + var clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + + // second write + inputData.addData((0, 3, null), (1, 3, null), (2, 3, null)) + inputData.addData((3, 0, null), (3, 1, null), (3, 2, null), (3, 3, null)) + stream.processAllAvailable() + + val result3 = new util.ArrayList[Row]() + result3.addAll(result2) + for (a <- 0 until 3) { + result3.add(Row(a, 3, null)) + } + for (b <- 0 until 4) { + result3.add(Row(3, b, null)) + } + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // second cluster, the outputLevel should be 4 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + // second cluster result, level-5 and level-4 are individually ordered + val result4 = new util.ArrayList[Row]() + result4.addAll(result2) + result4.add(Row(0, 3, null)) + result4.add(Row(1, 3, null)) + result4.add(Row(3, 0, null)) + result4.add(Row(3, 1, null)) + result4.add(Row(2, 3, null)) + result4.add(Row(3, 2, null)) + 
result4.add(Row(3, 3, null)) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(2) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(1).level()).isEqualTo(4) + + // full cluster + checkAnswer( + spark.sql("CALL paimon.sys.compact(table => 'T', compact_strategy => 'full')"), + Row(true) :: Nil) + val result5 = new util.ArrayList[Row]() + result5.add(Row(0, 0, randomStr)) + result5.add(Row(0, 1, randomStr)) + result5.add(Row(1, 0, randomStr)) + result5.add(Row(1, 1, randomStr)) + result5.add(Row(0, 2, randomStr)) + result5.add(Row(0, 3, null)) + result5.add(Row(1, 2, randomStr)) + result5.add(Row(1, 3, null)) + result5.add(Row(2, 0, randomStr)) + result5.add(Row(2, 1, randomStr)) + result5.add(Row(3, 0, null)) + result5.add(Row(3, 1, null)) + result5.add(Row(2, 2, randomStr)) + result5.add(Row(2, 3, null)) + result5.add(Row(3, 2, null)) + result5.add(Row(3, 3, null)) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result5) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: cluster for partitioned table") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b INT, c STRING, pt INT) + |PARTITIONED BY (pt) + |TBLPROPERTIES ('bucket'='-1', 
'num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, String, Int)] + val stream = inputData + .toDS() + .toDF("a", "b", "c", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY pt") + + try { + val random = new Random() + val randomStr = random.nextString(50) + // first write + for (pt <- 0 until 2) { + val c = if (pt == 0) randomStr else null + inputData.addData((0, 0, c, pt)) + inputData.addData((0, 1, c, pt)) + inputData.addData((0, 2, c, pt)) + inputData.addData((1, 0, c, pt)) + inputData.addData((1, 1, c, pt)) + inputData.addData((1, 2, c, pt)) + inputData.addData((2, 0, c, pt)) + inputData.addData((2, 1, c, pt)) + inputData.addData((2, 2, c, pt)) + } + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (pt <- 0 until 2) { + for (a <- 0 until 3) { + for (b <- 0 until 3) { + val c = if (pt == 0) randomStr else null + result.add(Row(a, b, c, pt)) + } + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + // first cluster, the outputLevel should be 5 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + + // first cluster result + val result2 = new util.ArrayList[Row]() + for (pt <- 0 until 2) { + val c = if (pt == 0) randomStr else null + result2.add(Row(0, 0, c, pt)) + result2.add(Row(0, 1, c, pt)) + result2.add(Row(1, 0, c, pt)) + result2.add(Row(1, 1, c, pt)) + result2.add(Row(0, 2, c, pt)) + result2.add(Row(1, 2, c, pt)) + result2.add(Row(2, 0, c, pt)) + result2.add(Row(2, 1, c, pt)) + result2.add(Row(2, 2, c, pt)) + } + + 
Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + var clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(2) + dataSplits.forEach( + dataSplit => { + Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5) + }) + + // second write + for (pt <- 0 until 2) { + inputData.addData((0, 3, null, pt), (1, 3, null, pt), (2, 3, null, pt)) + inputData.addData( + (3, 0, null, pt), + (3, 1, null, pt), + (3, 2, null, pt), + (3, 3, null, pt)) + } + stream.processAllAvailable() + + val result3 = new util.ArrayList[Row]() + for (pt <- 0 until 2) { + val c = if (pt == 0) randomStr else null + result3.add(Row(0, 0, c, pt)) + result3.add(Row(0, 1, c, pt)) + result3.add(Row(1, 0, c, pt)) + result3.add(Row(1, 1, c, pt)) + result3.add(Row(0, 2, c, pt)) + result3.add(Row(1, 2, c, pt)) + result3.add(Row(2, 0, c, pt)) + result3.add(Row(2, 1, c, pt)) + result3.add(Row(2, 2, c, pt)) + for (a <- 0 until 3) { + result3.add(Row(a, 3, null, pt)) + } + for (b <- 0 until 4) { + result3.add(Row(3, b, null, pt)) + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // second cluster + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + val result4 = new util.ArrayList[Row]() + // for partition-0: only file in level-0 will be picked for clustering, outputLevel is 4 + result4.add(Row(0, 0, randomStr, 0)) + result4.add(Row(0, 1, randomStr, 0)) + result4.add(Row(1, 0, randomStr, 0)) + result4.add(Row(1, 1, randomStr, 0)) + result4.add(Row(0, 2, randomStr, 0)) + result4.add(Row(1, 2, randomStr, 0)) + result4.add(Row(2, 0, randomStr, 0)) + result4.add(Row(2, 1, randomStr, 0)) + result4.add(Row(2, 2, randomStr, 0)) + result4.add(Row(0, 3, null, 0)) + result4.add(Row(1, 3, null, 0)) + 
result4.add(Row(3, 0, null, 0)) + result4.add(Row(3, 1, null, 0)) + result4.add(Row(2, 3, null, 0)) + result4.add(Row(3, 2, null, 0)) + result4.add(Row(3, 3, null, 0)) + // for partition-1:all files will be picked for clustering, outputLevel is 5 + result4.add(Row(0, 0, null, 1)) + result4.add(Row(0, 1, null, 1)) + result4.add(Row(1, 0, null, 1)) + result4.add(Row(1, 1, null, 1)) + result4.add(Row(0, 2, null, 1)) + result4.add(Row(0, 3, null, 1)) + result4.add(Row(1, 2, null, 1)) + result4.add(Row(1, 3, null, 1)) + result4.add(Row(2, 0, null, 1)) + result4.add(Row(2, 1, null, 1)) + result4.add(Row(3, 0, null, 1)) + result4.add(Row(3, 1, null, 1)) + result4.add(Row(2, 2, null, 1)) + result4.add(Row(2, 3, null, 1)) + result4.add(Row(3, 2, null, 1)) + result4.add(Row(3, 3, null, 1)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(2) + dataSplits.forEach( + dataSplit => { + if (dataSplit.partition().getInt(0) == 1) { + // partition-1 + Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5) + } else { + // partition-0 + Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(2) + Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5) + Assertions.assertThat(dataSplit.dataFiles().get(1).level()).isEqualTo(4) + } + }) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: cluster for partitioned table with partition filter") { + sql( + """ + |CREATE TABLE T (a INT, b INT, pt INT) + |PARTITIONED BY (pt) + |TBLPROPERTIES ( + | 'bucket'='-1', 'num-levels'='6', 'num-sorted-run.compaction-trigger'='2', + | 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true' + |) + |""".stripMargin) + + 
sql("INSERT INTO T VALUES (0, 0, 0), (0, 0, 1)") + sql("INSERT INTO T VALUES (0, 1, 0), (0, 1, 1)") + sql("INSERT INTO T VALUES (0, 2, 0), (0, 2, 1)") + sql("INSERT INTO T VALUES (1, 0, 0), (1, 0, 1)") + sql("INSERT INTO T VALUES (1, 1, 0), (1, 1, 1)") + sql("INSERT INTO T VALUES (1, 2, 0), (1, 2, 1)") + sql("INSERT INTO T VALUES (2, 0, 0), (2, 0, 1)") + sql("INSERT INTO T VALUES (2, 1, 0), (2, 1, 1)") + sql("INSERT INTO T VALUES (2, 2, 0), (2, 2, 1)") + + sql("CALL sys.compact(table => 'T', where => 'pt = 0')") + checkAnswer( + sql("select distinct partition, level from `T$files` order by partition"), + Seq(Row("{0}", 5), Row("{1}", 0)) + ) + + sql("CALL sys.compact(table => 'T', where => 'pt = 1')") + checkAnswer( + sql("select distinct partition, level from `T$files` order by partition"), + Seq(Row("{0}", 5), Row("{1}", 5)) + ) + } + + test("Paimon Procedure: cluster with deletion vectors") { + failAfter(Span(5, org.scalatest.time.Minutes)) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b INT, c STRING) + |TBLPROPERTIES ('bucket'='-1', 'deletion-vectors.enabled'='true','num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b", "c") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + val random = new Random() + val randomStr = random.nextString(40) + // first write + inputData.addData((0, 0, randomStr)) + inputData.addData((0, 1, randomStr)) + inputData.addData((0, 2, randomStr)) + inputData.addData((1, 0, randomStr)) + inputData.addData((1, 1, 
randomStr)) + inputData.addData((1, 2, randomStr)) + inputData.addData((2, 0, randomStr)) + inputData.addData((2, 1, randomStr)) + inputData.addData((2, 2, randomStr)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b, randomStr)) + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + // first cluster, the outputLevel should be 5 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + + // first cluster result + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0, randomStr)) + result2.add(1, Row(0, 1, randomStr)) + result2.add(2, Row(1, 0, randomStr)) + result2.add(3, Row(1, 1, randomStr)) + result2.add(4, Row(0, 2, randomStr)) + result2.add(5, Row(1, 2, randomStr)) + result2.add(6, Row(2, 0, randomStr)) + result2.add(7, Row(2, 1, randomStr)) + result2.add(8, Row(2, 2, randomStr)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + var clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + + // second write + inputData.addData((0, 3, null), (1, 3, null), (2, 3, null)) + inputData.addData((3, 0, null), (3, 1, null), (3, 2, null), (3, 3, null)) + stream.processAllAvailable() + + // delete (0,0), which is in level-5 file + spark.sql("DELETE FROM T WHERE a=0 and b=0;").collect() + // delete (0,3), which is in level-0 file + spark.sql("DELETE FROM T WHERE a=0 and b=3;").collect() + + val result3 = new util.ArrayList[Row]() + result3.addAll(result2.subList(1, result2.size())) + for (a <- 1 until 3) { + result3.add(Row(a, 3, null)) + } + for (b <- 0 until 4) { + 
result3.add(Row(3, b, null)) + } + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // second cluster, the outputLevel should be 4. dv index for level-0 will be updated + // and dv index for level-5 will be retained + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + // second cluster result, level-5 and level-4 are individually ordered + val result4 = new util.ArrayList[Row]() + result4.addAll(result2.subList(1, result2.size())) + result4.add(Row(1, 3, null)) + result4.add(Row(3, 0, null)) + result4.add(Row(3, 1, null)) + result4.add(Row(2, 3, null)) + result4.add(Row(3, 2, null)) + result4.add(Row(3, 3, null)) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(2) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(0)).isNotNull + Assertions.assertThat(dataSplits.get(0).dataFiles().get(1).level()).isEqualTo(4) + Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(1)).isNull() + + // full cluster + checkAnswer( + spark.sql("CALL paimon.sys.compact(table => 'T', compact_strategy => 'full')"), + Row(true) :: Nil) + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(0)).isNull() + + } finally { + stream.stop() + } + } + } + } + + def checkSnapshot(table: FileStoreTable): Unit = { + Assertions + 
.assertThat(table.latestSnapshot().get().commitKind().toString) + .isEqualTo(CommitKind.COMPACT.toString) + } + + def lastSnapshotCommand(table: FileStoreTable): CommitKind = { + table.snapshotManager().latestSnapshot().commitKind() + } + + def lastSnapshotId(table: FileStoreTable): Long = { + table.snapshotManager().latestSnapshotId() + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala new file mode 100644 index 000000000000..605f80e27ad3 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class CreateAndDeleteTagProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: create and delete tag") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', time_retained => '5 d', snapshot => 2)"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag") :: Nil) + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag')"), + Row(true) :: Nil) + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + checkAnswer( + 
spark.sql( + "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_latestSnapshot_tag')"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_latestSnapshot_tag") :: Nil) + checkAnswer( + spark.sql( + "CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_latestSnapshot_tag')"), + Row(true) :: Nil) + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + + // create test_tag_1 and test_tag_2 + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag_1', snapshot => 1)"), + Row(true) :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag_2', snapshot => 2)"), + Row(true) :: Nil) + + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag_1") :: Row("test_tag_2") :: Nil) + + // test rename_tag + checkAnswer( + spark.sql( + "CALL paimon.sys.rename_tag(table => 'test.T', tag => 'test_tag_1', target_tag => 'test_tag_3')"), + Row(true) :: Nil + ) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag_2") :: Row("test_tag_3") :: Nil) + + // delete test_tag_2 and test_tag_3 + checkAnswer( + spark.sql( + "CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag_2,test_tag_3')"), + Row(true) :: Nil) + + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: create same tag with same snapshot") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + 
.option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', snapshot => 1)"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT count(*) FROM paimon.test.`T$tags` where tag_name = 'test_tag'"), + Row(1) :: Nil) + + // throw exception "Tag test_tag already exists" + assertThrows[IllegalArgumentException] { + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', time_retained => '5 d', snapshot => 1)") + } + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: delete tag not failed if tag not exists") { + spark.sql("CREATE TABLE T (id STRING, name STRING) USING PAIMON") + + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag')"), + Row(true) :: Nil) + } + + test("Paimon Procedure: delete multiple tags") { + spark.sql("CREATE TABLE T (id INT, name STRING) USING PAIMON") + spark.sql("insert into T values (1, 'a')") + + // create four tags + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-1')") + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-2')") + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-3')") + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-4')") + checkAnswer(spark.sql("SELECT count(*) FROM paimon.test.`T$tags`"), Row(4) :: Nil) + + // multiple tags with no space + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'tag-1,tag-2')"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + 
Row("tag-3") :: Row("tag-4") :: Nil) + + // multiple tags with space + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'tag-3, tag-4')"), + Row(true) :: Nil) + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala new file mode 100644 index 000000000000..b9283d996cc6 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.utils.Range + +import scala.collection.JavaConverters._ +import scala.collection.immutable + +class CreateGlobalVectorIndexProcedureTest extends CreateGlobalIndexProcedureTest { + test("create lucene-vector-knn global index") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY<FLOAT>) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + val values = (0 until 100) + .map( + i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + val output = + spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + .head + + assert(output.getBoolean(0)) + + val table = loadTable("T") + val indexEntries = table + .store() + .newIndexFileHandler() + .scanEntries() + .asScala + .filter(_.indexFile().indexType() == "lucene-vector-knn") + + assert(indexEntries.nonEmpty) + val totalRowCount = indexEntries.map(_.indexFile().rowCount()).sum + assert(totalRowCount == 100L) + } + } + + test("create lucene-vector-knn global index with partition") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY<FLOAT>, pt STRING) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + | PARTITIONED BY (pt) + |""".stripMargin) + + var values = (0 until 65000) + .map( + i => + s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p0')") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + values = (0 until 35000) + .map( + i => + s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p1')")
+ .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + values = (0 until 22222) + .map( + i => + s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p0')") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + val output = + spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + .head + + assert(output.getBoolean(0)) + + val table = loadTable("T") + val indexEntries = table + .store() + .newIndexFileHandler() + .scanEntries() + .asScala + .filter(_.indexFile().indexType() == "lucene-vector-knn") + + assert(indexEntries.nonEmpty) + val totalRowCount = indexEntries.map(_.indexFile().rowCount()).sum + assert(totalRowCount == 122222L) + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala new file mode 100644 index 000000000000..b4f7d63086ae --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase +import org.apache.paimon.utils.SnapshotNotExistException + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class CreateTagFromTimestampProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: Create tags from snapshots commit-time ") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + try { + + for (i <- 1 to 4) { + inputData.addData((i, "a")) + stream.processAllAvailable() + Thread.sleep(500L) + } + + val table = loadTable("T") + val earliestCommitTime = table.snapshotManager.earliestSnapshot.timeMillis + val commitTime3 = table.snapshotManager.snapshot(3).timeMillis + val commitTime4 = table.snapshotManager.snapshot(4).timeMillis + + // create tag from timestamp that earlier than the earliest snapshot commit time. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + | tag => 'test_tag', + | timestamp => ${earliestCommitTime - 1})""".stripMargin), + Row("test_tag", 1, earliestCommitTime, "null") :: Nil + ) + + // create tag from timestamp that equals to snapshot-3 commit time. 
+ checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + | tag => 'test_tag2', + | timestamp => $commitTime3)""".stripMargin), + Row("test_tag2", 3, commitTime3, "null") :: Nil + ) + + // create tag from timestamp that later than snapshot-3 commit time. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + |tag => 'test_tag3', + |timestamp => ${commitTime3 + 1})""".stripMargin), + Row("test_tag3", 4, commitTime4, "null") :: Nil + ) + + // create tag from timestamp that later than the latest snapshot commit time and throw SnapshotNotExistException. + assertThrows[SnapshotNotExistException] { + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + |tag => 'test_tag3', + |timestamp => ${Long.MaxValue})""".stripMargin) + } + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: Create tags from tags commit-time") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + try { + for (i <- 1 to 2) { + inputData.addData((i, "a")) + stream.processAllAvailable() + Thread.sleep(500L) + } + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', snapshot => 1)"), + Row(true) :: Nil) + + val table = loadTable("T") + val latestCommitTime = table.snapshotManager.latestSnapshot().timeMillis + val tagsCommitTime = table.tagManager().getOrThrow("test_tag").timeMillis + assert(latestCommitTime > tagsCommitTime) + + 
// make snapshot 1 expire. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 1, retain_min => 1)"), + Row(1) :: Nil) + + // create tag from timestamp that earlier than the expired snapshot 1. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + | tag => 'test_tag1', + | timestamp => ${tagsCommitTime - 1})""".stripMargin), + Row("test_tag1", 1, tagsCommitTime, "null") :: Nil + ) + + // create tag from timestamp that later than the expired snapshot 1. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + |tag => 'test_tag2', + |timestamp => ${tagsCommitTime + 1})""".stripMargin), + Row("test_tag2", 2, latestCommitTime, "null") :: Nil + ) + + } finally { + stream.stop() + } + } + } + } + +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala new file mode 100644 index 000000000000..c7cdc0f517a7 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala @@ -0,0 +1,760 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.assertj.core.api.Assertions.assertThatThrownBy + +/** IT Case for [[ExpirePartitionsProcedure]]. */ +class ExpirePartitionsProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: expire partitions") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // This partition never expires. + inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer(query(), Row("a", "2024-06-01") :: Row("Never-expire", "9999-09-09") :: Nil) + // call expire_partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("pt=2024-06-01") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon procedure : expire partitions show a list of expired partitions.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (pt,hm) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // Show results : There are no expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("No expired partitions.") :: Nil + ) + + // snapshot-1 + inputData.addData(("a", "2024-06-01", "01:00")) + stream.processAllAvailable() + // snapshot-2 + inputData.addData(("b", "2024-06-02", "02:00")) + stream.processAllAvailable() + // snapshot-3, never expires. + inputData.addData(("Never-expire", "9999-09-09", "99:99")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01", "01:00") :: Row("b", "2024-06-02", "02:00") :: Row( + "Never-expire", + "9999-09-09", + "99:99") :: Nil) + + // Show a list of expired partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'" + + ", expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=02:00") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09", "99:99") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with values-time strategy.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("HXH", "2024-06-01")) + stream.processAllAvailable() + + // Never expire. 
+ inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("HXH", "2024-06-01") :: Row("Never-expire", "9999-09-09") :: Nil) + // expire + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd'" + + ",expire_strategy => 'values-time')"), + Row("pt=2024-06-01") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with update-time strategy.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // This partition will expire. + inputData.addData(("HXH", "9999-09-09")) + stream.processAllAvailable() + // Waiting for partition 'pt=9999-09-09' to expire. + Thread.sleep(2500L) + // snapshot-2 + inputData.addData(("HXH", "2024-06-01")) + stream.processAllAvailable() + + // Partitions that are updated within 2 second would be retained. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(" + + "table => 'test.T'," + + " expiration_time => '2 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=9999-09-09") :: Nil + ) + + checkAnswer(query(), Row("HXH", "2024-06-01") :: Nil) + + // Waiting for all partitions to expire. + Thread.sleep(1500) + // All partition will expire. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(" + + "table => 'test.T'," + + " expiration_time => '1 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=2024-06-01") :: Nil + ) + + checkAnswer(query(), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with update-time strategy in same partition.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (pt,hm) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // This partition will not expire. + inputData.addData(("HXH", "2024-06-01", "01:00")) + stream.processAllAvailable() + // Waiting long enough for partition 'pt=2024-06-01, hm=01:00' to become eligible for expiration. + Thread.sleep(2500L) + // Updating the same partition data will update partition last update time, then this partition will not expire. + inputData.addData(("HXH", "2024-06-01", "01:00")) + stream.processAllAvailable() + + // The last update time of the 'pt=2024-06-01, hm=01:00' partition is updated so the partition would not expire. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '2 s'" + + ",expire_strategy => 'update-time')"), + Row("No expired partitions.") :: Nil + ) + + checkAnswer(query(), Row("HXH", "2024-06-01", "01:00") :: Nil) + // Waiting for all partitions to expire. + Thread.sleep(1500) + + // The partition 'pt=2024-06-01, hm=01:00' will expire.
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '1 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=2024-06-01, hm=01:00") :: Nil + ) + + checkAnswer(query(), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with non-date format partition.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // This partition will expire. + inputData.addData(("HXH", "pt-1")) + stream.processAllAvailable() + Thread.sleep(2500L) + // snapshot-2 + inputData.addData(("HXH", "pt-2")) + stream.processAllAvailable() + + // Only update-time strategy support non date format partition to expire. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '2 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=pt-1") :: Nil + ) + + checkAnswer(query(), Row("HXH", "pt-2") :: Nil) + + // Waiting for all partitions to expire. + Thread.sleep(1500) + // call expire_partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '1 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=pt-2") :: Nil + ) + + checkAnswer(query(), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon procedure : expire partitions with specified time-pattern partitions.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (hm, pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // Show results : There are no expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd', timestamp_pattern => '$pt')"), + Row("No expired partitions.") :: Nil + ) + + // snapshot-1 + inputData.addData(("a", "2024-06-01", "01:00")) + stream.processAllAvailable() + // snapshot-2 + inputData.addData(("b", "2024-06-02", "02:00")) + stream.processAllAvailable() + // snapshot-3, never expires. + inputData.addData(("Never-expire", "9999-09-09", "99:99")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01", "01:00") :: Row("b", "2024-06-02", "02:00") :: Row( + "Never-expire", + "9999-09-09", + "99:99") :: Nil) + + // Show a list of expired partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'" + + ", expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd HH:mm'" + + ", timestamp_pattern => '$pt $hm')"), + Row("hm=01:00, pt=2024-06-01") :: Row("hm=02:00, pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09", "99:99") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon procedure : sorted the expired partitions with max_expires.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (pt,hm) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // Show results : There are no expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("No expired partitions.") :: Nil + ) + + inputData.addData(("a", "2024-06-02", "02:00")) + stream.processAllAvailable() + inputData.addData(("b", "2024-06-02", "01:00")) + stream.processAllAvailable() + inputData.addData(("d", "2024-06-03", "01:00")) + stream.processAllAvailable() + inputData.addData(("c", "2024-06-01", "01:00")) + stream.processAllAvailable() + // this snapshot never expires. 
+ inputData.addData(("Never-expire", "9999-09-09", "99:99")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-02", "02:00") :: Row("b", "2024-06-02", "01:00") :: Row( + "d", + "2024-06-03", + "01:00") :: Row("c", "2024-06-01", "01:00") :: Row( + "Never-expire", + "9999-09-09", + "99:99") :: Nil + ) + + // sorted result of limited expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'" + + ", expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd', max_expires => 3)"), + Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=01:00") :: Row( + "pt=2024-06-02, hm=02:00") :: Nil + ) + + checkAnswer( + query(), + Row("d", "2024-06-03", "01:00") :: Row("Never-expire", "9999-09-09", "99:99") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with default num") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1', 'partition.expiration-max-num'='2') + |PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // snapshot-2 + inputData.addData(("b", "2024-06-02")) + stream.processAllAvailable() + + // snapshot-3 + inputData.addData(("c", "2024-06-03")) + stream.processAllAvailable() + + // This partition never expires. 
+ inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row( + "Never-expire", + "9999-09-09") :: Nil) + // call expire_partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions load table property first") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ( + | 'primary-key' = 'k,pt', + | 'bucket' = '1', + | 'write-only' = 'true', + | 'partition.timestamp-formatter' = 'yyyy-MM-dd', + | 'partition.expiration-max-num'='2') + |PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // snapshot-2 + inputData.addData(("b", "2024-06-02")) + stream.processAllAvailable() + + // snapshot-3 + inputData.addData(("c", "2024-06-03")) + stream.processAllAvailable() + + // This partition never expires. 
+ inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row( + "Never-expire", + "9999-09-09") :: Nil) + + // 'partition.timestamp-formatter' value using table property. + // 'partition.expiration-time' value using procedure parameter. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d')"), + Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions add options parameter") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ( + | 'primary-key' = 'k,pt', + | 'bucket' = '1') + |PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // snapshot-2 + inputData.addData(("b", "2024-06-02")) + stream.processAllAvailable() + + // snapshot-3 + inputData.addData(("c", "2024-06-03")) + stream.processAllAvailable() + + // This partition never expires. + inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row( + "Never-expire", + "9999-09-09") :: Nil) + + // set conf in options. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', " + + "options => 'partition.expiration-time = 1d," + + " partition.expiration-max-num = 2," + + " partition.expiration-batch-size = 2," + + " partition.timestamp-formatter = yyyy-MM-dd')"), + Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala new file mode 100644 index 000000000000..bbaf88568e2d --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase +import org.apache.paimon.utils.SnapshotManager + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.assertj.core.api.Assertions.{assertThat, assertThatIllegalArgumentException} + +import java.sql.Timestamp + +class ExpireSnapshotsProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: expire snapshots") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', + |'write-only' = 'true', 'snapshot.num-retained.min' = '1') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // expire + checkAnswer( + spark.sql("CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 2)"), + Row(1) :: Nil) + + checkAnswer( + spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"), + Row(2L) :: Row(3L) :: Nil) + } finally { + 
stream.stop() + } + } + } + } + + test("Paimon Procedure: expire snapshots retainMax retainMin value check") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // expire assert throw exception + assertThrows[IllegalArgumentException] { + spark.sql( + "CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 2, retain_min => 3)") + } + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: test parameter order_than with string type") { + sql( + "CREATE TABLE T (a INT, b STRING) " + + "TBLPROPERTIES ( 'num-sorted-run.compaction-trigger' = '999'," + + "'write-only' = 'true', 'snapshot.num-retained.min' = '1')") + val table = loadTable("T") + val snapshotManager = table.snapshotManager + + // generate 5 snapshot + for (i <- 1 to 5) { + sql(s"INSERT INTO T VALUES ($i, '$i')") + } + checkSnapshots(snapshotManager, 1, 5) + + val timestamp = new Timestamp(snapshotManager.latestSnapshot().timeMillis) + spark.sql( + s"CALL 
paimon.sys.expire_snapshots(table => 'test.T', older_than => '${timestamp.toString}', max_deletes => 2)") + checkSnapshots(snapshotManager, 3, 5) + } + + test("Paimon Procedure: expire snapshots load table property first") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', + |'snapshot.num-retained.max' = '2', + |'snapshot.num-retained.min' = '1', + |'write-only' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // expire + checkAnswer( + spark.sql("CALL paimon.sys.expire_snapshots(table => 'test.T')"), + Row(1) :: Nil) + + checkAnswer( + spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"), + Row(2L) :: Row(3L) :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire snapshots add options parameter") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', 'write-only' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, 
String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_snapshots(table => 'test.T', options => 'snapshot.num-retained.max=2, snapshot.num-retained.min=1')"), + Row(1L) :: Nil + ) + + checkAnswer( + spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"), + Row(2L) :: Row(3L) :: Nil) + } finally { + stream.stop() + } + } + } + } + + def checkSnapshots(sm: SnapshotManager, earliest: Int, latest: Int): Unit = { + assertThat(sm.snapshotCount).isEqualTo(latest - earliest + 1) + assertThat(sm.earliestSnapshotId).isEqualTo(earliest) + assertThat(sm.latestSnapshotId).isEqualTo(latest) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala new file mode 100644 index 000000000000..d57846709877 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +class ProcedureTest extends ProcedureTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala new file mode 100644 index 000000000000..078823c3ef37 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class RollbackProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: rollback to snapshot and tag") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val table = loadTable("T") + val location = table.location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_tag', snapshot => 1)"), + Row(true) :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T_exception', version => '2')") + } + // rollback to snapshot + checkAnswer( + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '2')"), + Row(table.latestSnapshot().get().id, 2) :: Nil) + checkAnswer(query(), Row(1, "a") :: 
Row(2, "b") :: Nil) + + // rollback to tag + val taggedSnapshotId = table.tagManager().getOrThrow("test_tag").trimToSnapshot().id + checkAnswer( + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => 'test_tag')"), + Row(table.latestSnapshot().get().id, taggedSnapshotId) :: Nil) + checkAnswer(query(), Row(1, "a") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: rollback to tag check test") { + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', 'file.format'='orc') + |""".stripMargin) + + val table = loadTable("T") + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + // snapshot-1 + spark.sql("insert into T select 1, 'a'") + checkAnswer(query(), Row(1, "a") :: Nil) + + checkAnswer( + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => '20250122', snapshot => 1)"), + Row(true) :: Nil) + + // snapshot-2 + spark.sql("insert into T select 2, 'b'") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + spark.sql("insert into T select 3, 'c'") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil) + + // snapshot-4 + spark.sql("insert into T select 4, 'd'") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Row(4, "d") :: Nil) + + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T_exception', version => '4')") + } + // rollback to snapshot + checkAnswer( + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '3')"), + Row(table.latestSnapshot().get().id, 3) :: Nil) + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil) + + // version/snapshot/tag can only set one of them + assertThrows[RuntimeException] { + spark.sql( + "CALL paimon.sys.rollback(table => 'test.T', version => '20250122', tag => '20250122')") + } + + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '20250122', 
snapshot => 1)") + } + + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T', tag => '20250122', snapshot => 1)") + } + + // rollback to snapshot + spark.sql("CALL paimon.sys.rollback(table => 'test.T', snapshot => 2)") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // rollback to tag + spark.sql("CALL paimon.sys.rollback(table => 'test.T', tag => '20250122')") + checkAnswer(query(), Row(1, "a") :: Nil) + } + + test("Paimon Procedure: rollback to timestamp") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val table = loadTable("T") + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + val timestamp = System.currentTimeMillis() + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // rollback to timestamp + checkAnswer( + spark.sql( + s"CALL paimon.sys.rollback_to_timestamp(table => 'test.T', timestamp => $timestamp)"), + Row(table.latestSnapshot().get().id, 2) :: Nil) + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon 
Procedure: rollback with cache") { + sql("CREATE TABLE T (id INT)") + sql("INSERT INTO T VALUES (1), (2), (3), (4)") + sql("DELETE FROM T WHERE id = 1") + sql("CALL sys.rollback(table => 'T', version => '1')") + sql("DELETE FROM T WHERE id = 1") + checkAnswer(sql("SELECT * FROM T ORDER BY id"), Seq(Row(2), Row(3), Row(4))) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala new file mode 100644 index 000000000000..255906d04bf2 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class AnalyzeTableTest extends AnalyzeTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala new file mode 100644 index 000000000000..b729f57b33e7 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class DDLTest extends DDLTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala new file mode 100644 index 000000000000..cb139d2a57be --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class DDLWithHiveCatalogTest extends DDLWithHiveCatalogTestBase {} + +class DefaultDatabaseTest extends DefaultDatabaseTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala new file mode 100644 index 000000000000..6170e2fd6c5c --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class DataFrameWriteTest extends DataFrameWriteTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala new file mode 100644 index 000000000000..b25e41a3fb42 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala @@ -0,0 +1,701 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.SparkConf +import org.apache.spark.sql.Row +import org.apache.spark.sql.types.DecimalType +import org.junit.jupiter.api.Assertions + +import java.sql.{Date, Timestamp} + +abstract class DataFrameWriteTestBase extends PaimonSparkTestBase { + + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.sql.catalog.paimon.cache-enabled", "false") + } + + import testImplicits._ + + test("Paimon dataframe: insert into partitioned table") { + for (useV2Write <- Seq("true", "false")) { + withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) { + withTable("t") { + // create table + Seq((1, "x1", "p1"), (2, "x2", "p2")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .option("primary-key", "a,pt") + .partitionBy("pt") + .saveAsTable("t") + + // insert into + Seq((3, "x3", "p3")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .mode("append") + .insertInto("t") + checkAnswer( + spark.read.format("paimon").table("t").orderBy("a"), + Seq(Row(1, "x1", "p1"), Row(2, "x2", "p2"), Row(3, "x3", "p3")) + ) + checkAnswer( + sql("SHOW PARTITIONS t"), + Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3")) + ) + + // dynamic insert overwrite + withSparkSQLConf("spark.sql.sources.partitionOverwriteMode" -> "dynamic") { + Seq((4, "x4", "p1")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .mode("overwrite") + .insertInto("t") + } + checkAnswer( + spark.read.format("paimon").table("t").orderBy("a"), + Seq(Row(2, "x2", "p2"), Row(3, "x3", "p3"), Row(4, "x4", "p1")) + ) + checkAnswer( + sql("SHOW PARTITIONS t"), + Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3")) + ) + + // insert overwrite + Seq((5, "x5", "p1")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .mode("overwrite") + .insertInto("t") + checkAnswer( + spark.read.format("paimon").table("t").orderBy("a"), + Seq(Row(5, "x5", "p1")) + ) + checkAnswer( + sql("SHOW 
PARTITIONS t"), + Seq(Row("pt=p1")) + ) + } + } + } + } + + test("Paimon dataframe: save as partitioned table") { + for (useV2Write <- Seq("true", "false")) { + withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) { + withTable("t") { + // create table + Seq((1, "x1", "p1"), (2, "x2", "p2")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .mode("append") + .option("primary-key", "a,pt") + .partitionBy("pt") + .saveAsTable("t") + + // saveAsTable with append mode + Seq((3, "x3", "p3")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .mode("append") + .saveAsTable("t") + checkAnswer( + spark.read.format("paimon").table("t").orderBy("a"), + Seq(Row(1, "x1", "p1"), Row(2, "x2", "p2"), Row(3, "x3", "p3")) + ) + checkAnswer( + sql("SHOW PARTITIONS t"), + Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3")) + ) + + // saveAsTable with overwrite mode will call replace table internal, + // so here we set the props and partitions again. + Seq((5, "x5", "p1")) + .toDF("a", "b", "pt") + .write + .format("paimon") + .option("primary-key", "a,pt") + .partitionBy("pt") + .mode("overwrite") + .saveAsTable("t") + checkAnswer( + spark.read.format("paimon").table("t").orderBy("a"), + Seq(Row(5, "x5", "p1")) + ) + checkAnswer( + sql("SHOW PARTITIONS t"), + Seq(Row("pt=p1")) + ) + } + } + } + } + + test("Paimon: DataFrameWrite.saveAsTable") { + withTable("test_ctas") { + Seq((1L, "x1"), (2L, "x2")) + .toDF("a", "b") + .write + .format("paimon") + .mode("append") + .option("primary-key", "a") + .option("bucket", "-1") + .option("target-file-size", "256MB") + .option("write.merge-schema", "true") + .option("write.merge-schema.explicit-cast", "true") + .saveAsTable("test_ctas") + + val paimonTable = loadTable("test_ctas") + Assertions.assertEquals(1, paimonTable.primaryKeys().size()) + Assertions.assertEquals("a", paimonTable.primaryKeys().get(0)) + + // check all the core options + Assertions.assertEquals("-1", paimonTable.options().get("bucket")) + 
Assertions.assertEquals("256MB", paimonTable.options().get("target-file-size")) + + // non-core options should not be here. + Assertions.assertFalse(paimonTable.options().containsKey("write.merge-schema")) + Assertions.assertFalse(paimonTable.options().containsKey("write.merge-schema.explicit-cast")) + } + } + + test("Paimon: DataFrameWrite partition table") { + withTable("t") { + spark.sql(s""" + |CREATE TABLE t (a INT, b STRING, dt STRING) PARTITIONED BY(dt) + |TBLPROPERTIES ('file.format' = 'avro', 'bucket' = 2, 'bucket-key' = 'b') + |""".stripMargin) + + val table = loadTable("t") + val location = table.location().toString + + Seq((1, "x1", "a"), (2, "x2", "b")) + .toDF("a", "b", "c") + .write + .format("paimon") + .mode("append") + .save(location) + checkAnswer(sql("SELECT * FROM t"), Row(1, "x1", "a") :: Row(2, "x2", "b") :: Nil) + } + } + + fileFormats.foreach { + fileFormat => + test(s"Paimon: DataFrameWrite.saveAsTable in ByName mode, file.format: $fileFormat") { + withTable("t1", "t2") { + spark.sql(s""" + |CREATE TABLE t1 (col1 STRING, col2 INT, col3 DOUBLE) + |TBLPROPERTIES ('file.format' = '$fileFormat') + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE t2 (col2 INT, col3 DOUBLE, col1 STRING) + |TBLPROPERTIES ('file.format' = '$fileFormat') + |""".stripMargin) + + sql(s""" + |INSERT INTO TABLE t1 VALUES + |("Hello", 1, 1.1), + |("World", 2, 2.2), + |("Paimon", 3, 3.3); + |""".stripMargin) + + spark.table("t1").write.format("paimon").mode("append").saveAsTable("t2") + checkAnswer( + sql("SELECT * FROM t2 ORDER BY col2"), + Row(1, 1.1d, "Hello") :: Row(2, 2.2d, "World") :: Row(3, 3.3d, "Paimon") :: Nil) + } + } + } + + fileFormats.foreach { + fileFormat => + test( + s"Paimon: DataFrameWrite.saveAsTable with complex data type in ByName mode, file.format: $fileFormat") { + withTable("t1", "t2") { + spark.sql( + s""" + |CREATE TABLE t1 (a STRING, b INT, c STRUCT, d ARRAY>>, e ARRAY) + |TBLPROPERTIES ('file.format' = '$fileFormat') + |""".stripMargin) + 
+ spark.sql( + s""" + |CREATE TABLE t2 (b INT, c STRUCT, d ARRAY, d1 TIMESTAMP>>, e ARRAY, a STRING) + |TBLPROPERTIES ('file.format' = '$fileFormat') + |""".stripMargin) + + sql(s""" + |INSERT INTO TABLE t1 VALUES + |("Hello", 1, struct(1.1, 1000), array(struct(timestamp'2024-01-01 00:00:00', map("k1", "v1")), struct(timestamp'2024-08-01 00:00:00', map("k1", "v11"))), array(123, 345)), + |("World", 2, struct(2.2, 2000), array(struct(timestamp'2024-02-01 00:00:00', map("k2", "v2"))), array(234, 456)), + |("Paimon", 3, struct(3.3, 3000), null, array(345, 567)); + |""".stripMargin) + + spark.table("t1").write.format("paimon").mode("append").saveAsTable("t2") + checkAnswer( + sql("SELECT * FROM t2 ORDER BY b"), + Row( + 1, + Row(1000L, 1.1d), + Array( + Row(Map("k1" -> "v1"), Timestamp.valueOf("2024-01-01 00:00:00")), + Row(Map("k1" -> "v11"), Timestamp.valueOf("2024-08-01 00:00:00"))), + Array(123, 345), + "Hello" + ) + :: Row( + 2, + Row(2000L, 2.2d), + Array(Row(Map("k2" -> "v2"), Timestamp.valueOf("2024-02-01 00:00:00"))), + Array(234, 456), + "World") + :: Row(3, Row(3000L, 3.3d), null, Array(345, 567), "Paimon") :: Nil + ) + } + } + } + + withPk.foreach { + hasPk => + bucketModes.foreach { + bucket => + test(s"Write data into Paimon directly: has-pk: $hasPk, bucket: $bucket") { + + val prop = if (hasPk) { + s"'primary-key'='a', 'bucket' = '$bucket' " + } else if (bucket != -1) { + s"'bucket-key'='a', 'bucket' = '$bucket' " + } else { + "'write-only'='true'" + } + + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ($prop) + |""".stripMargin) + + val paimonTable = loadTable("T") + val location = paimonTable.location().toString + + val df1 = Seq((1, "a"), (2, "b")).toDF("a", "b") + df1.write.format("paimon").mode("append").save(location) + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "a") :: Row(2, "b") :: Nil) + + val df2 = Seq((1, "a2"), (3, "c")).toDF("a", "b") + df2.write.format("paimon").mode("append").save(location) + 
val expected = if (hasPk) { + Row(1, "a2") :: Row(2, "b") :: Row(3, "c") :: Nil + } else { + Row(1, "a") :: Row(1, "a2") :: Row(2, "b") :: Row(3, "c") :: Nil + } + checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected) + + val df3 = Seq((4, "d"), (5, "e")).toDF("a", "b") + df3.write.format("paimon").mode("overwrite").save(location) + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(4, "d") :: Row(5, "e") :: Nil) + } + } + } + + fileFormats.foreach { + format => + withPk.foreach { + hasPk => + bucketModes.foreach { + bucket => + test( + s"Schema evolution: write data into Paimon: $hasPk, bucket: $bucket, format: $format") { + val _spark = spark + import _spark.implicits._ + + val prop = if (hasPk) { + s"'primary-key'='a', 'bucket' = '$bucket', 'file.format' = '$format'" + } else if (bucket != -1) { + s"'bucket-key'='a', 'bucket' = '$bucket', 'file.format' = '$format'" + } else { + s"'write-only'='true', 'file.format' = '$format'" + } + + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ($prop) + |""".stripMargin) + + val paimonTable = loadTable("T") + val location = paimonTable.location().toString + + val df1 = Seq((1, "a"), (2, "b")).toDF("a", "b") + df1.write.format("paimon").mode("append").save(location) + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "a") :: Row(2, "b") :: Nil) + + // Case 1: two additional fields + val df2 = Seq((1, "a2", 123L, Map("k" -> 11.1)), (3, "c", 345L, Map("k" -> 33.3))) + .toDF("a", "b", "c", "d") + df2.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + val expected2 = if (hasPk) { + Row(1, "a2", 123L, Map("k" -> 11.1)) :: + Row(2, "b", null, null) :: Row(3, "c", 345L, Map("k" -> 33.3)) :: Nil + } else { + Row(1, "a", null, null) :: Row(1, "a2", 123L, Map("k" -> 11.1)) :: Row( + 2, + "b", + null, + null) :: Row(3, "c", 345L, Map("k" -> 33.3)) :: Nil + } + checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), 
expected2) + + // Case 2: two fields with the evolved types: Int -> Long, Long -> Decimal + val df3 = Seq( + (2L, "b2", BigDecimal.decimal(234), Map("k" -> 22.2)), + (4L, "d", BigDecimal.decimal(456), Map("k" -> 44.4))).toDF("a", "b", "c", "d") + df3.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + val expected3 = if (hasPk) { + Row(1L, "a2", BigDecimal.decimal(123), Map("k" -> 11.1)) :: Row( + 2L, + "b2", + BigDecimal.decimal(234), + Map("k" -> 22.2)) :: Row( + 3L, + "c", + BigDecimal.decimal(345), + Map("k" -> 33.3)) :: Row( + 4L, + "d", + BigDecimal.decimal(456), + Map("k" -> 44.4)) :: Nil + } else { + Row(1L, "a", null, null) :: Row( + 1L, + "a2", + BigDecimal.decimal(123), + Map("k" -> 11.1)) :: Row(2L, "b", null, null) :: Row( + 2L, + "b2", + BigDecimal.decimal(234), + Map("k" -> 22.2)) :: Row( + 3L, + "c", + BigDecimal.decimal(345), + Map("k" -> 33.3)) :: Row( + 4L, + "d", + BigDecimal.decimal(456), + Map("k" -> 44.4)) :: Nil + } + checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected3) + + // Case 3: insert Decimal(20,18) to Decimal(38,18) + val df4 = Seq((99L, "df4", BigDecimal.decimal(4.0), Map("4" -> 4.1))) + .toDF("a", "b", "c", "d") + .selectExpr("a", "b", "cast(c as decimal(20,18)) as c", "d") + df4.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + val expected4 = + expected3 ++ Seq(Row(99L, "df4", BigDecimal.decimal(4.0), Map("4" -> 4.1))) + checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected4) + val decimalType = + spark.table("T").schema.apply(2).dataType.asInstanceOf[DecimalType] + assert(decimalType.precision == 38) + assert(decimalType.scale == 18) + } + } + } + } + + withPk.foreach { + hasPk => + bucketModes.foreach { + bucket => + test( + s"Schema evolution: write data into Paimon with allowExplicitCast = true: $hasPk, bucket: $bucket") { + + val prop = if (hasPk) { + s"'primary-key'='a', 'bucket' = '$bucket' " + 
} else if (bucket != -1) { + s"'bucket-key'='a', 'bucket' = '$bucket' " + } else { + "'write-only'='true'" + } + + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ($prop) + |""".stripMargin) + + val paimonTable = loadTable("T") + val location = paimonTable.location().toString + + val df1 = Seq((1, "2023-08-01"), (2, "2023-08-02")).toDF("a", "b") + df1.write.format("paimon").mode("append").save(location) + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "2023-08-01") :: Row(2, "2023-08-02") :: Nil) + + // Case 1: two additional fields: DoubleType and TimestampType + val ts = java.sql.Timestamp.valueOf("2023-08-01 10:00:00.0") + val df2 = Seq((1, "2023-08-01", 12.3d, ts), (3, "2023-08-03", 34.5d, ts)) + .toDF("a", "b", "c", "d") + df2.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + val expected2 = if (hasPk) { + Row(1, "2023-08-01", 12.3d, ts) :: + Row(2, "2023-08-02", null, null) :: Row(3, "2023-08-03", 34.5d, ts) :: Nil + } else { + Row(1, "2023-08-01", null, null) :: Row(1, "2023-08-01", 12.3d, ts) :: Row( + 2, + "2023-08-02", + null, + null) :: Row(3, "2023-08-03", 34.5d, ts) :: Nil + } + checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected2) + + // Case 2: a: Int -> Long, b: String -> Date, c: Long -> Int, d: Map -> String + val date = java.sql.Date.valueOf("2023-07-31") + val df3 = Seq((2L, date, 234, null), (4L, date, 456, "2023-08-01 11:00:00.0")).toDF( + "a", + "b", + "c", + "d") + + // throw UnsupportedOperationException if write.merge-schema.explicit-cast = false + assertThrows[UnsupportedOperationException] { + df3.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + } + // merge schema and write data when write.merge-schema.explicit-cast = true + df3.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .option("write.merge-schema.explicit-cast", "true") + 
.save(location) + val expected3 = if (hasPk) { + Row(1L, Date.valueOf("2023-08-01"), 12, ts.toString) :: Row( + 2L, + date, + 234, + null) :: Row(3L, Date.valueOf("2023-08-03"), 34, ts.toString) :: Row( + 4L, + date, + 456, + "2023-08-01 11:00:00.0") :: Nil + } else { + Row(1L, Date.valueOf("2023-08-01"), null, null) :: Row( + 1L, + Date.valueOf("2023-08-01"), + 12, + ts.toString) :: Row(2L, date, 234, null) :: Row( + 2L, + Date.valueOf("2023-08-02"), + null, + null) :: Row(3L, Date.valueOf("2023-08-03"), 34, ts.toString) :: Row( + 4L, + date, + 456, + "2023-08-01 11:00:00.0") :: Nil + } + checkAnswer( + spark.sql("SELECT a, b, c, substring(d, 0, 21) FROM T ORDER BY a, b"), + expected3) + + } + } + } + + withPk.foreach { + hasPk => + test(s"Support v2 write with overwrite, hasPk: $hasPk") { + withTable("t") { + val prop = if (hasPk) { + "'primary-key'='c1'" + } else { + "'write-only'='true'" + } + spark.sql(s""" + |CREATE TABLE t (c1 INT, c2 STRING) PARTITIONED BY(p1 String, p2 string) + |TBLPROPERTIES ($prop) + |""".stripMargin) + + spark + .range(3) + .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite($"p1" === "a") + checkAnswer( + spark.sql("SELECT * FROM t ORDER BY c1"), + Row(0, "0", "a", "0") :: Row(1, "1", "a", "1") :: Row(2, "2", "a", "2") :: Nil + ) + + spark + .range(7, 10) + .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite($"p1" === "a") + checkAnswer( + spark.sql("SELECT * FROM t ORDER BY c1"), + Row(7, "7", "a", "7") :: Row(8, "8", "a", "8") :: Row(9, "9", "a", "9") :: Nil + ) + + spark + .range(2) + .selectExpr("id as c1", "id as c2", "'a' as p1", "9 as p2") + .writeTo("t") + .overwrite(($"p1" <=> "a").and($"p2" === "9")) + checkAnswer( + spark.sql("SELECT * FROM t ORDER BY c1"), + Row(0, "0", "a", "9") :: Row(1, "1", "a", "9") :: Row(7, "7", "a", "7") :: + Row(8, "8", "a", "8") :: Nil + ) + + // bad case + val msg1 = intercept[Exception] { + spark + .range(2) + 
.selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite($"p1" =!= "a") + }.getMessage + assert(msg1.contains("Only support Overwrite filters with Equal and EqualNullSafe")) + + val msg2 = intercept[Exception] { + spark + .range(2) + .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite($"p1" === $"c2") + }.getMessage + if (gteqSpark3_4) { + assert(msg2.contains("Table does not support overwrite by expression")) + } else { + assert(msg2.contains("cannot translate expression to source filter")) + } + + val msg3 = intercept[Exception] { + spark + .range(2) + .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite($"c1" === ($"c2" + 1)) + }.getMessage + if (gteqSpark4_0) { + assert(msg3.contains("Table does not support overwrite by expression")) + } else { + assert(msg3.contains("cannot translate expression to source filter")) + } + + val msg4 = intercept[Exception] { + spark + .range(2) + .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite(($"p1" === "a").and($"p1" === "b")) + }.getMessage + assert(msg4.contains("Only support Overwrite with one filter for each partition column")) + + // Overwrite a partition which is not the specified + val msg5 = intercept[Exception] { + spark + .range(2) + .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2") + .writeTo("t") + .overwrite($"p1" === "b") + }.getMessage + assert(msg5.contains("does not belong to this partition")) + } + } + } + + test("Paimon Schema Evolution: some columns is absent in the coming data") { + + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |""".stripMargin) + + val paimonTable = loadTable("T") + val location = paimonTable.location().toString + + val df1 = Seq((1, "2023-08-01"), (2, "2023-08-02")).toDF("a", "b") + df1.write.format("paimon").mode("append").save(location) + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "2023-08-01") 
:: Row(2, "2023-08-02") :: Nil) + + // Case 1: two additional fields: DoubleType and TimestampType + val ts = java.sql.Timestamp.valueOf("2023-08-01 10:00:00.0") + val df2 = Seq((1, "2023-08-01", 12.3d, ts), (3, "2023-08-03", 34.5d, ts)) + .toDF("a", "b", "c", "d") + df2.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + + // Case 2: colum b and d are absent in the coming data + val df3 = Seq((4, 45.6d), (5, 56.7d)) + .toDF("a", "c") + df3.write + .format("paimon") + .mode("append") + .option("write.merge-schema", "true") + .save(location) + val expected3 = + Row(1, "2023-08-01", null, null) :: Row(1, "2023-08-01", 12.3d, ts) :: Row( + 2, + "2023-08-02", + null, + null) :: Row(3, "2023-08-03", 34.5d, ts) :: Row(4, null, 45.6d, null) :: Row( + 5, + null, + 56.7d, + null) :: Nil + checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected3) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala new file mode 100644 index 000000000000..8d620ece8245 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.SparkConf + +class DeleteFromTableTest extends DeleteFromTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class V2DeleteFromTableTest extends DeleteFromTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "true") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala new file mode 100644 index 000000000000..c6aa77419241 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class DescribeTableTest extends DescribeTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala new file mode 100644 index 000000000000..ba49976ab6c0 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class FormatTableTest extends FormatTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala new file mode 100644 index 000000000000..4f66584c303b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class InsertOverwriteTableTest extends InsertOverwriteTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala new file mode 100644 index 000000000000..c83ee5493867 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest} + +import org.apache.spark.SparkConf + +class MergeIntoPrimaryKeyBucketedTableTest + extends MergeIntoTableTestBase + with MergeIntoPrimaryKeyTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonPrimaryKeyBucketedTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class MergeIntoPrimaryKeyNonBucketTableTest + extends MergeIntoTableTestBase + with MergeIntoPrimaryKeyTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonPrimaryKeyNonBucketTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class MergeIntoAppendBucketedTableTest + extends MergeIntoTableTestBase + with MergeIntoAppendTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonAppendBucketedTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class MergeIntoAppendNonBucketedTableTest + extends MergeIntoTableTestBase + with 
MergeIntoAppendTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonAppendNonBucketTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala new file mode 100644 index 000000000000..635185a9ed0e --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class PaimonCompositePartitionKeyTest extends PaimonCompositePartitionKeyTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala new file mode 100644 index 000000000000..ec140a89bbd3 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.{Attribute, GetStructField, NamedExpression, ScalarSubquery} +import org.apache.spark.sql.paimon.shims.SparkShimLoader + +class PaimonOptimizationTest extends PaimonOptimizationTestBase { + + override def extractorExpression( + cteIndex: Int, + output: Seq[Attribute], + fieldIndex: Int): NamedExpression = { + GetStructField( + ScalarSubquery( + SparkShimLoader.shim + .createCTERelationRef(cteIndex, resolved = true, output.toSeq, isStreaming = false)), + fieldIndex, + None) + .as("scalarsubquery()") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala new file mode 100644 index 000000000000..26677d85c71a --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class PaimonPushDownTest extends PaimonPushDownTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala new file mode 100644 index 000000000000..f37fbad27033 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class PaimonV1FunctionTest extends PaimonV1FunctionTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala new file mode 100644 index 000000000000..6ab8a2671b51 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class PaimonViewTest extends PaimonViewTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala new file mode 100644 index 000000000000..412aa3b30351 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class RewriteUpsertTableTest extends RewriteUpsertTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala new file mode 100644 index 000000000000..da4c9b854df3 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class RowIdPushDownTest extends RowIdPushDownTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala new file mode 100644 index 000000000000..9f96840a7788 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class RowTrackingTest extends RowTrackingTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala new file mode 100644 index 000000000000..6601dc2fca37 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class ShowColumnsTest extends PaimonShowColumnsTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala new file mode 100644 index 000000000000..21c4c8a495ed --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class SparkV2FilterConverterTest extends SparkV2FilterConverterTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala new file mode 100644 index 000000000000..92309d54167b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class TagDdlTest extends PaimonTagDdlTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala new file mode 100644 index 000000000000..3a0f56cd4820 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.SparkConf + +class UpdateTableTest extends UpdateTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class V2UpdateTableTest extends UpdateTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "true") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala new file mode 100644 index 000000000000..94e9ac683f02 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.SparkConf + +class VariantTest extends VariantTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.variant.inferShreddingSchema", "false") + } +} + +class VariantInferShreddingTest extends VariantTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.variant.inferShreddingSchema", "true") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala new file mode 100644 index 000000000000..7ac3c5df0d00 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +import org.apache.paimon.spark.PaimonScan + +/** Tests for vector search table-valued function with global vector index. 
*/ +class VectorSearchPushDownTest extends BaseVectorSearchPushDownTest { + test("vector search with global index") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + // Insert 100 rows with predictable vectors + val values = (0 until 100) + .map( + i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + // Create vector index + val output = spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + .head + assert(output.getBoolean(0)) + + // Test vector search with table-valued function syntax + val result = spark + .sql(""" + |SELECT * FROM vector_search('T', 'v', array(50.0f, 51.0f, 52.0f), 5) + |""".stripMargin) + .collect() + + // The result should contain 5 rows + assert(result.length == 5) + + // Vector (50, 51, 52) should be most similar to the row with id=50 + assert(result.map(_.getInt(0)).contains(50)) + } + } + + test("vector search pushdown is applied in plan") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + val values = (0 until 10) + .map( + i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + // Create vector index + spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + + // Check that vector search is pushed down with table 
function syntax + val df = spark.sql(""" + |SELECT * FROM vector_search('T', 'v', array(50.0f, 51.0f, 52.0f), 5) + |""".stripMargin) + + // Get the scan from the executed plan (physical plan) + val executedPlan = df.queryExecution.executedPlan + val batchScans = executedPlan.collect { + case scan: org.apache.spark.sql.execution.datasources.v2.BatchScanExec => scan + } + + assert(batchScans.nonEmpty, "Should have a BatchScanExec in executed plan") + val paimonScans = batchScans.filter(_.scan.isInstanceOf[PaimonScan]) + assert(paimonScans.nonEmpty, "Should have a PaimonScan in executed plan") + + val paimonScan = paimonScans.head.scan.asInstanceOf[PaimonScan] + assert(paimonScan.pushedVectorSearch.isDefined, "Vector search should be pushed down") + assert(paimonScan.pushedVectorSearch.get.fieldName() == "v", "Field name should be 'v'") + assert(paimonScan.pushedVectorSearch.get.limit() == 5, "Limit should be 5") + } + } + + test("vector search topk returns correct results") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + // Insert rows with distinct vectors + val values = (1 to 100) + .map { + i => + val v = math.sqrt(3.0 * i * i) + val normalized = i.toFloat / v.toFloat + s"($i, array($normalized, $normalized, $normalized))" + } + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + // Create vector index + spark.sql( + "CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + + // Query for top 10 similar to (1, 1, 1) normalized + val result = spark + .sql(""" + |SELECT * FROM vector_search('T', 'v', array(0.577f, 0.577f, 0.577f), 10) + |""".stripMargin) + .collect() + + assert(result.length == 10) + } + } +} diff --git a/pom.xml b/pom.xml index 
0db5ac8d4560..eb5844e7b559 100644 --- a/pom.xml +++ b/pom.xml @@ -89,7 +89,7 @@ under the License. 1.20.1 2.12 2.12.18 - 2.13.16 + 2.13.17 ${scala212.version} ${scala212.version} 1.1.10.8 @@ -424,6 +424,7 @@ under the License. paimon-spark/paimon-spark4-common paimon-spark/paimon-spark-4.0 + paimon-spark/paimon-spark-4.1 17