From aeae9acad7cbded69b6965e157a2937fede99f93 Mon Sep 17 00:00:00 2001 From: Muhammad Junaid Muzammil <4795269+junmuz@users.noreply.github.com> Date: Wed, 25 Feb 2026 08:50:10 -0800 Subject: [PATCH 1/3] Initial implementation of the Spark 4.1 functionality --- .github/workflows/utitcase-spark-4.x.yml | 2 +- docs/content/spark/quick-start.md | 7 +- paimon-spark/paimon-spark-4.1/pom.xml | 162 ++++++++++++++++++ .../MergePaimonScalarSubqueries.scala | 92 ++++++++++ .../resources/function/hive-test-udfs.jar | Bin 0 -> 35660 bytes .../src/test/resources/hive-site.xml | 56 ++++++ .../src/test/resources/log4j2-test.properties | 38 ++++ .../procedure/CompactProcedureTest.scala | 21 +++ ...CreateGlobalVectorIndexProcedureTest.scala | 120 +++++++++++++ .../spark/procedure/ProcedureTest.scala | 21 +++ .../paimon/spark/sql/AnalyzeTableTest.scala | 21 +++ .../org/apache/paimon/spark/sql/DDLTest.scala | 21 +++ .../spark/sql/DDLWithHiveCatalogTest.scala | 23 +++ .../paimon/spark/sql/DataFrameWriteTest.scala | 21 +++ .../spark/sql/DeleteFromTableTest.scala | 33 ++++ .../paimon/spark/sql/DescribeTableTest.scala | 21 +++ .../paimon/spark/sql/FormatTableTest.scala | 21 +++ .../spark/sql/InsertOverwriteTableTest.scala | 21 +++ .../paimon/spark/sql/MergeIntoTableTest.scala | 63 +++++++ .../sql/PaimonCompositePartitionKeyTest.scala | 21 +++ .../spark/sql/PaimonOptimizationTest.scala | 39 +++++ .../paimon/spark/sql/PaimonPushDownTest.scala | 21 +++ .../spark/sql/PaimonV1FunctionTest.scala | 21 +++ .../paimon/spark/sql/PaimonViewTest.scala | 21 +++ .../spark/sql/RewriteUpsertTableTest.scala | 21 +++ .../paimon/spark/sql/RowIdPushDownTest.scala | 21 +++ .../paimon/spark/sql/RowTrackingTest.scala | 21 +++ .../paimon/spark/sql/ShowColumnsTest.scala | 21 +++ .../sql/SparkV2FilterConverterTest.scala | 21 +++ .../apache/paimon/spark/sql/TagDdlTest.scala | 21 +++ .../paimon/spark/sql/UpdateTableTest.scala | 33 ++++ .../apache/paimon/spark/sql/VariantTest.scala | 33 ++++ 
.../spark/sql/VectorSearchPushDownTest.scala | 145 ++++++++++++++++ pom.xml | 1 + 34 files changed, 1223 insertions(+), 2 deletions(-) create mode 100644 paimon-spark/paimon-spark-4.1/pom.xml create mode 100644 paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar create mode 100644 paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml create mode 100644 paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala create mode 100644 
paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala diff --git a/.github/workflows/utitcase-spark-4.x.yml b/.github/workflows/utitcase-spark-4.x.yml index 56629110f503..993fa97ba2cf 100644 --- a/.github/workflows/utitcase-spark-4.x.yml +++ b/.github/workflows/utitcase-spark-4.x.yml @@ -61,7 +61,7 @@ jobs: 
jvm_timezone=$(random_timezone) echo "JVM timezone is set to $jvm_timezone" test_modules="" - for suffix in ut 4.0; do + for suffix in ut 4.0 4.1; do test_modules+="org.apache.paimon:paimon-spark-${suffix}_2.13," done test_modules="${test_modules%,}" diff --git a/docs/content/spark/quick-start.md b/docs/content/spark/quick-start.md index 58530ebcb73e..524d82a16352 100644 --- a/docs/content/spark/quick-start.md +++ b/docs/content/spark/quick-start.md @@ -30,7 +30,7 @@ under the License. Paimon supports the following Spark versions with their respective Java and Scala compatibility. We recommend using the latest Spark version for a better experience. -- Spark 4.x (including 4.0) : Pre-built with Java 17 and Scala 2.13 +- Spark 4.x (including 4.1, 4.0) : Pre-built with Java 17 and Scala 2.13 - Spark 3.x (including 3.5, 3.4, 3.3, 3.2) : Pre-built with Java 8 and Scala 2.12/2.13 @@ -40,6 +40,7 @@ Download the jar file with corresponding version. | Version | Jar (Scala 2.12) | Jar (Scala 2.13) | |-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Spark 4.1 | - | [paimon-spark-4.1_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-4.1_2.13/{{< version >}}/paimon-spark-4.1_2.13-{{< version >}}.jar) | | Spark 4.0 | - | [paimon-spark-4.0_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-4.0_2.13/{{< version >}}/paimon-spark-4.0_2.13-{{< version >}}.jar) | | Spark 3.5 | [paimon-spark-3.5_2.12-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.5_2.12/{{< version >}}/paimon-spark-3.5_2.12-{{< version >}}.jar) | 
[paimon-spark-3.5_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.5_2.13/{{< version >}}/paimon-spark-3.5_2.13-{{< version >}}.jar) | | Spark 3.4 | [paimon-spark-3.4_2.12-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.4_2.12/{{< version >}}/paimon-spark-3.4_2.12-{{< version >}}.jar) | [paimon-spark-3.4_2.13-{{< version >}}.jar](https://repo.maven.apache.org/maven2/org/apache/paimon/paimon-spark-3.4_2.13/{{< version >}}/paimon-spark-3.4_2.13-{{< version >}}.jar) | @@ -52,6 +53,7 @@ Download the jar file with corresponding version. | Version | Jar (Scala 2.12) | Jar (Scala 2.13) | |-----------|-----------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| Spark 4.1 | - | [paimon-spark-4.1_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-4.1_2.13/{{< version >}}/) | | Spark 4.0 | - | [paimon-spark-4.0_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-4.0_2.13/{{< version >}}/) | | Spark 3.5 | [paimon-spark-3.5_2.12-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.5_2.12/{{< version >}}/) | [paimon-spark-3.5_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.5_2.13/{{< version >}}/) | | Spark 3.4 | [paimon-spark-3.4_2.12-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.4_2.12/{{< version >}}/) | [paimon-spark-3.4_2.13-{{< version >}}.jar](https://repository.apache.org/snapshots/org/apache/paimon/paimon-spark-3.4_2.13/{{< version >}}/) | @@ -73,6 +75,9 @@ mvn clean package -DskipTests -pl paimon-spark/paimon-spark-3.5 -am 
-Pscala-2.13 # build paimon spark 4.0 mvn clean package -DskipTests -pl paimon-spark/paimon-spark-4.0 -am -Pspark4 + +# build paimon spark 4.1 +mvn clean package -DskipTests -pl paimon-spark/paimon-spark-4.1 -am -Pspark4 ``` For Spark 3.5, you can find the bundled jar in `./paimon-spark/paimon-spark-3.5/target/paimon-spark-3.5_2.12-{{< version >}}.jar`. diff --git a/paimon-spark/paimon-spark-4.1/pom.xml b/paimon-spark/paimon-spark-4.1/pom.xml new file mode 100644 index 000000000000..91aa2c76eac4 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/pom.xml @@ -0,0 +1,162 @@ + + + + 4.0.0 + + + org.apache.paimon + paimon-spark + 1.4-SNAPSHOT + + + paimon-spark-4.1_2.13 + Paimon : Spark : 4.1 : 2.13 + + + 4.1.1 + + + + + org.apache.paimon + paimon-format + + + + org.apache.paimon + paimon-spark4-common_${scala.binary.version} + ${project.version} + + + + org.apache.paimon + paimon-spark-common_${scala.binary.version} + ${project.version} + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + + + + org.apache.spark + spark-hive_${scala.binary.version} + ${spark.version} + + + + + + org.apache.paimon + paimon-spark-ut_${scala.binary.version} + ${project.version} + tests + test + + + * + * + + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${spark.version} + tests + test + + + org.apache.spark + spark-connect-shims_${scala.binary.version} + + + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${spark.version} + tests + test + + + + org.apache.spark + spark-core_${scala.binary.version} + ${spark.version} + tests + test + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + shade-paimon + package + + shade + + + + + * + + com/github/luben/zstd/** + **/*libzstd-jni-*.so + **/*libzstd-jni-*.dll + + + + + + 
org.apache.paimon:paimon-spark4-common_${scala.binary.version} + + + + + + + + + \ No newline at end of file diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala new file mode 100644 index 000000000000..e86195f1af0b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/paimon/spark/catalyst/optimizer/MergePaimonScalarSubqueries.scala @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.catalyst.optimizer + +import org.apache.paimon.spark.PaimonScan + +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, ExprId, ScalarSubquery, SortOrder} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation + +object MergePaimonScalarSubqueries extends MergePaimonScalarSubqueriesBase { + + override def tryMergeDataSourceV2ScanRelation( + newV2ScanRelation: DataSourceV2ScanRelation, + cachedV2ScanRelation: DataSourceV2ScanRelation) + : Option[(LogicalPlan, AttributeMap[Attribute])] = { + (newV2ScanRelation, cachedV2ScanRelation) match { + case ( + DataSourceV2ScanRelation( + newRelation, + newScan: PaimonScan, + newOutput, + newPartitioning, + newOrdering), + DataSourceV2ScanRelation( + cachedRelation, + cachedScan: PaimonScan, + _, + cachedPartitioning, + cacheOrdering)) => + checkIdenticalPlans(newRelation, cachedRelation).flatMap { + outputMap => + if ( + samePartitioning(newPartitioning, cachedPartitioning, outputMap) && sameOrdering( + newOrdering, + cacheOrdering, + outputMap) + ) { + mergePaimonScan(newScan, cachedScan).map { + mergedScan => + val mergedAttributes = mergedScan + .readSchema() + .map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) + val cachedOutputNameMap = cachedRelation.output.map(a => a.name -> a).toMap + val mergedOutput = + mergedAttributes.map(a => cachedOutputNameMap.getOrElse(a.name, a)) + val newV2ScanRelation = + cachedV2ScanRelation.copy(scan = mergedScan, output = mergedOutput) + + val mergedOutputNameMap = mergedOutput.map(a => a.name -> a).toMap + val newOutputMap = + AttributeMap(newOutput.map(a => a -> mergedOutputNameMap(a.name).toAttribute)) + + newV2ScanRelation -> newOutputMap + } + } else { + None + } + } + + case _ => None + } + } + + private def sameOrdering( + newOrdering: Option[Seq[SortOrder]], + cachedOrdering: 
Option[Seq[SortOrder]], + outputAttrMap: AttributeMap[Attribute]): Boolean = { + val mappedNewOrdering = newOrdering.map(_.map(mapAttributes(_, outputAttrMap))) + mappedNewOrdering.map(_.map(_.canonicalized)) == cachedOrdering.map(_.map(_.canonicalized)) + } + + override protected def createScalarSubquery(plan: LogicalPlan, exprId: ExprId): ScalarSubquery = { + ScalarSubquery(plan, exprId = exprId) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar b/paimon-spark/paimon-spark-4.1/src/test/resources/function/hive-test-udfs.jar new file mode 100644 index 0000000000000000000000000000000000000000..a5bfa456f6686da0ca010061cc3d690bfcd2ac96 GIT binary patch literal 35660 zcmb@t1yrP2vNcLkI7#8|?oQ$E?(XjHP8II%E`@91?(XjH?hXZyo}Ty5?Y{rK?s@mJ zSh+GGbD#4;?AQ_e#3w8983YRGw=bK*G?{-q`R4-&5EhV)y$P+7A|%jff@@Ii`D0&g zCs$~oPat=nfPg@dKHew$Z$AzF&rj>w>KT|C{Y?-9A0OHOx&?$kf|%+V+Su6sj>Z15 znSZmP-!=vTWc-1Zdnp)I{M%m9fPgUnuUIsK3OWjoHugsUxJsT6P=o(|m1oIq;yt`r zo$pD)ZK}}_Uy!lBU}>_~nmfVNIUyh^oXXab5H0eGipSOqkR#!Cf=|vxJ;>sAquQ|? 
z{R${L+nklZ03W!BeHqlESrq~L7SEv!&kG6!wHga-kb~xppD!89KB~RQthEtd$`dPsZ>DoRpOb-GeDrP?09%@Y?FL$0(cvu{Cj4o9LD z2H)DxT;;L*l8Uvuww=wv;M43F!-}VNOp{H!9!`f-Z3Qik%eFyVP7buWffLqBj?BG> zVef&OZea>NBp&))Wvs%q)Gpx&m7-}yhhN^sila4W8Et%-cMlKHZuOW0;86J|8bO6g z8EGpmFF^x^2T3aHCj3=5r99gG8qY?KcbyJi7Z}304ZL`8qlm2KQ>Tt;_;3)$(T3i- zrZ8N)B*`>6dy44DW0M%}gHm)^{)tma<+I~ z2$YbTe?Q*6>EOjnHVD?7qBKf=%_3B@P{XQdhw6NY09ucux2Z-tkbn3GNT2Y>`%tp} z6^<|7e~cv5|LcF>)Xe#xq3L$?Z^P+#b{zZ(5cl5$8rWDn+MDVB9;b4D1#J5#+7bOV zu#=&q@$cwFe??2dqCb+F>TjYOx$0TjTK@Bo)A{dL`JHROT?0Hp;4BOX5RfJ$5Rkxs zxQd98wUNDyNYn%;JgQIWFyuINriw0>^sA8@9Q}^oxuO(G~LNT z-J;NEW2E#d-bb5W(hR__K~n@3@KJl#C$6vwvGucYyJb-PqIs_5O7?Z#eY6N0oqjaQ zuWJ7i%*DmOrl*u|($;ym!gjS&C1=%syJXq97g5{BwOb9P9U{M_sa8&Nc6v1hY@D}f7o1P=;uS9B zuF0~rRG+HAG+l|jFRGKzadD)dLWxGkfEGb#EA&Ha%-O8z#|W8^Aj^UutD}&d`Gbhg zNf;#ci0wHJel-mRLoCvv?*(`$DRnoUbNUF6>#R+;Q=;H*Aj1(E6vvK2>QKe`Kmm+- zo7Q>XE$e$#V^j4b-3gfXI8FmgQm8Xrsc;#A#u>zlOkmfS`ZGFLtfd&ZH#B&De9JRU zCcQ1{a5U9=TTEi;&uE5dhvDnpaDmjTT`EjbTH?GkvA@WnjaI)_>FXg|fVJ1#!k(Zo zR}IoG8cC( z)T7-g$Y;-4t;ytTE^wLD-ebYY)_V-(yWO!KHvk@F<{u+u=gp|$dzQP(4NyWgvpjaI z>qR1+C(-AnSuI7FFC6DY)l}K%9&ws27yQ*#Rx9QCTWEL#@mA`2Vv*-UvOH(rg815i z@+&FE`0`~Ikh3WI@1XW+MA*C%`TQf&cE5E_wM|E`a7z=X!=y947WzGR^BEd4{Fgk+ z+T^SK+v0z!;4&06Y!LX6-oO&&k8*28>(tNc@2YES4Q_?}=*>cj1mzLyTDPk>0_dy4 zJn(<1J?BQi{+2$kM)_sG_xDQOYqql;q_G{0zCJ#kaeO0njTb<1aai=t?{ogbu{gJ_ zjq*_A+F5n0=|9tnbhmAQ(^T94bPc2fA2L=90 z>1-dI9-)=CmBz+3=5#o_8Hj?m6D?Vnj7Iy2j`LSs%l?2OuM-aWgh2B>sP1_=E_h@a zcrUEaXE-ZIbRCb05Sm6 zEMTW0 z%0wF-2X-Jb3%xhv9GbYZRhQ$>2x51_X9>Tf%;?dW0Wrqul(tIUtRYdofL(cD>uw^m z$j{LOO4GOIvdKgpxrazt20Gn$`tuMy~Hgg zLtqG))1eKGLtJQyX{16V;$ynqW8FTph@I5fM7vP-T3H4*-a!Axuiurv#mh%FioW^S z;v+h}KBANNKa0-)BlNC>KW}u*2COO?hf}$gmELlOq67>{A(yy@>2c~5F%s3+k)jurQp9_-Ws^%uF?lr)Dc4cH zHq#9s=Zy%i%E9*-E2Y&FHg}(i77}unw@8cJaey$tc>Hw^f>>#d_WFZL2ZvXjxQX|IY(DXOn|Yw zhxLJ_{;qeFjoMEU?SxD&zUPK^tlBFc6hHo}X4>sQ0cG0B*>D$QuFlpz2||cB(s)@w z1V+3bUPA?dh-4VGP7!fEweC!a+gH$ud-)mtusz&$HD)!bJFlt4^v5NRn_4!wSSD5*m&FL9|I 
zsW7@58?0ZUkt0A!RuJ=@M+tCZB4Xf{XPmLw1{YJS8Q3&q9R1cm3>jDP1vsq>a#XI& zGc^;^JT6yEo2PHM$ry7aIXlaI0SzsISn7qyOa`1ix;e=xi|*bIXmBzUAF@OZflV`e zW)@EPnqUZoM$g?aS5!1is@!ORhwy4EDP;FsHFNd^EIwf~msUTZEefBbQ>Tk+$cTMR z59ve` zwEmTh{60s$%f9isl%RRsC|Um=mwb7dgif7J8+=M!=#8VOS`eG#7k zgRp@3mq0LN0gOO9`MzMyWQNW7P&s4+ig7wJ>BLI9DuBRjiv94Lfck2FpTAP|>j+@W z&^)u7WDT@ylEEsWSF&#i@`7YyY1b^v`DQiqAd`5(sc;*$oH&q~-^4QMMPX#M!gk{h z?baec_L9V^y_Sf~(0Ta|iiGyYQ!ZJkeY4r9#*`YG@gmW4VmKuM)O!Mna+Q3gyVdDV z+_;39^w#v9_LXTbu^C-)1Qq8&Y*hD}g^g+f;JkiMrDHB#SY*75|@XX;?xHxd?3{FK2Vj;ppiqA7y8 z>~uSrgAR#JZ8~R%a-f*LVD||X{n=Rsk9>#S-A)!%De+e&Oio;kxKvsa)V`&)+-@;T zkB{h8CrQ=dLXn2QE#G4o*AI@=#ayE%9ij%pN*kh-p}`Gt20|N)XI%46`oSbY7G(VA zL(L<%fZeIsh}f+0^!_jE`9loV^ah_tiwJJkLL~vEhT;h3S>Y4&qlEYJ{;@TtKwE-IhHt+Nze3I0rt%x`eEOpT z&YUAMUVivW3m*!|_dg@nU(+aGQOjnA4v9;9B_Of8nV!4fq#vpl0s$X=vR8i(6$TL@-V0hkctr3#DqwSXAmcJ z!FRfAN((t&>_zF}`wrV#^^kR40|W2_#u@9V)1LCDlp3~`r`y^)Hzu1_9-Fi1y3EmS ze)Xd8UK~M%Z&A(L5W1&xFD@oq&rWGb{+w{2)73ty3$uc>fchgW4e3PUll2y}9;6FO z3HBWF~>YB_Y5G*2RZbm0}AHz>%#Vt5({4#`jGHv2AL~X)W=|;Xq-aFJkhMEwp zcX7{04o~~Y;d1}8Q2Q;1i&_6?8C*jKQ4T3!!7S(tbv_;*nJXWN6}TZ%>qiEM5K91t zLqAz@}d@#I6bfGmZVLbKL@@pmd*A879`m{sE zp{%t;X`$x|QaGsu@kExL!E)*_$RoFfk+f>68qg7{!Ss{FG2b?;6t#w359j`}-CfF= z<(R%|gOt)Jobj>weh?WzRyJvIDJS${y^Y&+VNA@U4U*-;ZgXMg8D+x|NGWnfr|ga5 z_j@B9W1T{yCT%m)giz-YeTnHCt0M@&0OFV!=a=t^xqEw%kSut>F-8sBM&)c8Qy$Xs zobBWdMo5K|$U+_F%kW{jpEKlEyREQYavQ@ufc?g9KQt`#Q~FF-N#^T-!Dc2|QURfa zn*)7~nUNcTfe)PP4xELN#V8>^oz2r++JFW+M{a(;j&5TSjim%mS`tkkkzI6_mo5QZ4`C6i8>%o_+cAJgo z)*0SzQ+ec!mwjoTP&(F0gI^VyoV?G;5TwonrF5)!2h7d$UL776xr~0P!o~a*>3;>e z?BR%+2W~2-J6)&ch#n5EbjrvNu<#g#Rjzwh!@mr#Ul7~CyJB0S)!j5wYG~-MI4PNO z)6BjWiu?)!rjG?{4{o82BC5pwV$SXOGi*`i@I?F#t{fMmI>%i7T-D6W2@-T3-qfSa zAee`S0iY#oyjmZbW1 z3MvpJ8iZ)7;pn3z@+e-R{@$_4LwA*dio*mi%5*ga?h~3?6td8ZHc^|#FKZrNFviBn zap1Y(GM?ggx3dHdl-CF+fJJH$yrmc~DV|yjMIA_D_yawDzZQTdZztSW6$%g5m$^>~ z;8LhinRE-m6v{z;$~{SRoS8f53(YbLUfgjW(lOF?F?ASnlzfL0U`^1*`rmS5wO}AB$sT!Wk3yRS 
z+BfY8eKV@W+;lN>MorxCoD-rn$Gd6JYUZ#zGSPn~qHISf!u4#3EO7^KKR~kO^fikCn zWHLg%feNI6gl7tA73>)#R%`)iIb<#V&}18FQaG-ZF2Dqll~jd`wQQTSXY}2Q#EEZs zhWZ@oHtmU6PgV!~VYAT&zM9UwY6T)9O-6;%ywN$tLq9Vx`wZSQLj9D=U(>T-xU zFBnjp$?yhN-WEY>w2VWNiCxpr*(3496WbUoys8S!*{6Z-A~KI$P_34MVH2y=2G!5>?F# z-eYXO1pW001+zitd!YoA@imjzPxCHT8H1)=OrV3R0#^iBi^W%)*dkT29!v>=r@~=) zefZtkYY0e#pai1NfdVFIOSNq=Yz|(oMEE)HX&?zN`m!#*a17+cr)D9n@ zD3spGcTEo!&R2LYd9FK&fLz` z7%E7SJ4}@hBeNN;6XX%YRx9j(>o`h3I*xuT?P7?Mlnnbe-=Pss6(r-B1FZ&2lyV`x z{ngkDHMN@50~e{Ey&2WiqiHFWafn?hRYC4Pq@su|isRYp?U#)*_QJ6oF^c|4r~(Ch zsh>uvC$5d4kB}n1H39ru-be**(kC1q3f!8wBb2ANREc95Mz}V%nR{#*>MV4Z@dSF| zwZGFDz(|5wjhDN#(VK}?M_LhLsBVU74&c-t9VQn1x$hX&8jgw1V|TaZ1-;|E@o5kh zh->~w-*I>9J2=Jv;_^W?#OguNsx*i#-%E>$q)cahM61*nA#slDIg!|u@K)tm^ryj@ z_eqdQe(!I$?eMv{QYr1j(O67RWP4_x4Ohs;<4kr9`q`6z22RGQn}IKZbQ|jMby+ZN z!Q9LL0Pj2O^}d$0{`ti*q-0NPRkV!2lB`s=UZ70?nem8&TJR2BzNGy^~qgwQM*pXF6LGto4S+BnB$&D}8L5*qD7#{PcrSEO1Mx5RR2&x5yt--|4fT@} z1yNn(oX4-o1BctLc@tC1&WDxNO^f0N>YrvGsLk_W>ZY321J^s|9(QCg1X$%}>=9lD zEfkCNqQwn5{&B+oFOsb?NPM06AldT&PDig`YGePO)r0@)=$C0WAfUX%rmb_rsB>lM z@Ks|mph%R$*QHi|YOS8lpCxW4@N`r9d}7fwB`{^Wnuh+(G_EHH5dfH{j*hZ%-K0OH zv%TybB-sElsD=^*7L(-$i2yI?jls7( zI1PMA4L>lXN^zQLnScu?ttYdbD~1SsbE=_XR(BR_jB*Kt`gFxJeskkc9#d&>ahpPxE-fv z&Spu;{uyk+n!6x`a=^g^^_b%Ze0gUZMw1oWd5$Z0 z*5FK4ZgE;L04U5e(@h9UO6&n<(q6$SV z@U`U}0-Z4qT}<9Z$*$|E%HR+XbqT8650d!EK64YkCcX}HYe75@^%o0nG9O7JatjQy z`A!gqfbv|KVy=r($t2mP$m`_P_;w2NWK_ySujn%eQv^iT1r%ArdgO8cxHqBI}!z-lGT3Ej7xndd1-7Q?vA1#+09le{Y%oXcQ* zZalJR3b?5iA*U1ng(a~2r%p@6Ogrwk+A#NnTk`+&+ECKwKdTJmBz|kP2HqPooDUtz zARwUHql3=Ah5O2cl7Qg+_)(ywI!}xDKzvpMVKu8$7^vP^|^gLo>Q~|$Ge3%~AVtql30sw2i zzG7c;DC38Eji~&tUL5(G_~WmxI*fa zy{5KKFNtNRLIk7_A3&MkgsXv0Ys)zWJaZyj$4P5TX35h{O}1Ba{sx+a_Ua#Y*`XV8 zjj57<%EM}BgT1j^%*NL~C>Qa1Iveej^puK>5YFWe#H@l*`LBNFpxpptFK+Y(Vx9w$)gp4P)dImtnObg!209Aqu`G9N!)A^q!>Ul(C!ahg)`G z^_yF^@ZpwKnmyx+^V06p3c4WSyBC$->I4`Xm_@~Au}fQUsF^LJpV1p2jxO>Z8howw zn+NHxP0Y#mCE@J!uD(^LW4DWTCMi?~*UB}0qOtCshi8s=BIA&Yg6lGnzc<9BSm&KO 
zJuWIlLe2GZJ>-z@CndX>vIbb;5p?MRSLTl4`KW#{OWfKhV!LNfbM3<|3peB;eTF7z zC%UIZbJZGKQ8QS$8J^L@${7_oD#M$N%f6vM)|DE;dipV5k@_W}oZt%i*eb;vQ3@Bs zEN`xgWkvj?q%itJwOob!?A)goiX2+mv7Z%DP&F(ZO9S0oq^xRlnAKHACf^${HqHv@ z{lAwa@?m9bN~> z^`EBCNd6MnH!v&W^q{-{u$^k79j&5lvg7spf znL%GahMsWXX9xal1K)%n_#!a;Pa&c4Zo`xN^Y}Tnr7f{^3-U3Z34z$?4ML>4hB^6M z@K~X$Mkl~jDedBxs{~Z-(J|o(4+TGKT%rSEGWJ2G8TDOR;KT?JwckX~9=W%Q@(nhS zKX2XTub^s`gInvs}iKL%TtghVYoGIZJdP^oJgvjO4F}TbVDZ)Bpc-YGr zHIwXGZ}^Y~`c}QNM>I+>=jDhi_9F`|L*zO3GP69k(793l!ZOPOBp2Tu;AFxIMMLH7 zJcD44RD>Je%%fxxuoY#CH@>8Cg+r}+DR00Dx9;{pL)g5}nAF*&O}=4vWc2oSGa9sK z<+o*qLrN6xTUsLAzh~r|4N7w{bmtunLBJMP2Oz7yI(#;99nB6u{YkHZ=V`=SWVd@Q zsWz4&lO5%@yye37d8l74GDGgu_!htOR#kAA&`n$JQmPBLWbJ&vdU@_#cYfIz7UHM^)En#lBE z9=yM!*bKES-eB|{y_ggIlF4G%3As!|Fw{a7D;EKzZEfJi8Z@$6o?Ns4v zJQ4ekd&F<7w3|je>{n)=vfzKlRX$O1lWrq>X7aSF`_TqJ614Z?wrhq}Zq{M6j4J znIIQdmSU)a25_*mTK3G|T5=ccJt3qPbC(!>y-@?RA@j`aa?zBD&4IC_E{#NCP$;oe z{wZk_dICx8rA^QDcG`NI!iaGNM+<%?u1-T#xEC)~;K@_mhJ6wvS>%Nk^9^^(w3q+p zVR9cCFivsg%a7%3i!MTAJxy4O4ZptcZO<2&ebXbeN_S_4tbYL9u(*CyN&Y~f_Y4*Nbq%4`GmWFptUBuzB)^-RJ*ir9tot?$AQBezm- zLEh2Ikb>+anQSS!Xnd@1VMz+chD<>`^1h}vuo3@N>rvi;@bP(!bB z)o2ihcU7#}I);sxqq{3_kOCc~*;nE#)rC`8mITU+G8aYZXoJj2Ob~!=+IiP-!=}i% zoqh3DOXzb<7~)~=)n)R@ly)1(S7JMl{rt=cv61E*z_F`xrVYm3#XK;S+j93K>%%Ql zG}W>5F{F%c+r6CaZ!SQHlWK--Tujeym$9YUvLjyR8C2-%YHQB1U?2q**4?KI4kUcZ zG)s<7!*eFoHmFXzOX;0FMknj(?U1BiMC(802LIYmXmFoE%ong6ky@ zM9)>;TRwZ%6Pm7a=XkK3PLHHd&DHuu4-CDHS=G7Rn@FB8#&Jh)tn%Y3zs1>ca7|p& z!G>vxeXT?{pFYC~Ca#HY*wpUS_J{wp=@*o>&UAg53G^D33X4}2^niOyv&0b_B2Z;>BV@$MT{w~dY=b}x?A58uqVUO5Rxvj?NtCGp_Lveg#a zf2g&=QKaQ-V-K%{E7+VHR+Je`#g|;BBRB^98dKEpfYS>)yYBCp!R$^Q&>{-B#817_ z;goDXG_O&6?Lif{bmGJCO5{VNUkFdNtFyNSHtPpA?KCSS)ZD6rYvd>x%vMdO9 z)Oc{^I=nkj`BJsemMkTUIKgG0FsC^PC%_d>pThOS4OMbb3aJnCF(x*5>V>!a2Z;6S z0axqm1gRBz_DuHBnWe0V_>!4vrrO=o>eZC;z2^k?NK@0da9YhaJ9^E?Q|{nVu2tvF z^|7EV5E6{)Q<2z_;2z|nMN9mQD?!RALL44QeDi#no&dRf)<9L#{zLD?fJMuW(||LRj|8UxJpP`x9?!t|{Qz4B%Xw96Y;CoDd(WIE z>@ME5VR=sJ?2sZsRH~TpO6eNxfF11@_;O>ANDH|R@LF6$um 
zJBGJ2#N8>n{%)|v&n#uz3-@veRbhBX>Mz>_+&>YP&K^Fs?4~|fBRX3-FRt-#QbSdp zfDFmNMhKnm^(dM3M^2@3i;LOkNt3W9Qoo+8FoBf^VE7ZpHmO>>tlJZGjh~f=4_nUw43YO;Io;w=huf=DT*JTRPCBK%%bBdFHwFi_5LkC(aX3e zmX|rTDm!sWQbuNiZS9xS!;yn?$Lr1vEzs#e3;b*zPbh1oa1}T?@`OnCCa4(L4?{X% zra&SW$=ID(KxG$%bP-xaHj0590jW|enh>Bc_lLD+Y?5MY*|t)kxzF$|g4M^6%SWT7 z(O5vgyfVjvpeizEHbn`T*=7TyHkxV4#CqV|qQ}CuT`e2GSxrWGMw(!i*3QbkdVWgB zF(kROK8HZXtf+hF>T!LGQ;ZFTx9%EOLz%Am{=-cJ^n~_k5z6aoIFv zZ}rjVo^EJ+ZUJnyv(#~tGjeOpZlx~tW9Vx54dhh@EB!_pQQC46wFW07E^#X6WYuBi zgG~8E&MjlXk1FulS2n{@N{7U+vl<|vQ9KlApcYyF;^3A;OEU-+C2z{jU0#LN>{8w3n|T8dACY%Jz9*ejY!{#>GA8%vS+{mz%2z!VK57rCi@R{;<|<{*rFUT?M~))b$>yQ z47^#)=}PF0>lMk?SMLEBqpPc+z`{7l-&nY2Y;~F=;N@Kf1`@g|_s}rS@YS4xlOdy` z1&-+0QP3~|5|*?APX*Ld9r-D&h7h#h$ zzr@<;Y9{tjx-zKNn%>q!>MfA9PR|9r9sBp5S_d7$<0#g1{m`h|WLo25Nwlcil{5 z^Xj}c{9LI)l~ZY7QI&31Qv7&68%wQ&F7pMJe%7_i|H9s{Kr&uZr+B?3G{gFRI6_wG zHeL^s0ZUWKLiXy5fEs0Fj*(Dbt>x9P1UE|&u5BPyDiRlt?UKXgaBAH}@l|xPoF5g% zPX<*5Z5lhnFp8dnu~3$T>QX!3DzC_C8gwsSG)kkMgSd~eJh(C`os3Br`HQI{*7%b; z8Z6r}o?TGzM#!MeV)phB4zkRzNCCm4j<5>R*ulzg%az+^j{Z0dj@!0qzf2B8?zi}N zZDZC4irwZS;hB)&d-vg2uUhoJZ(vZIKYB&EO!T=Awa{$; zaItbsQZ7wPtu*+RKwMg=o8!J7mCHR{V`0XHsr)0zw5!H?IKB0O0KMG<#LE;QQqr46 z5`Hbx)~$n8soOx{QyZ3N=6%Z@Dv4sFU!wk2Z$elp_y+PvVYu~KF3b72muMRU2uSIF zUl{&c7V6Z@Tovcgy=9DJ(7wDxze9aO2p$82lufK(A6=EC8ST&D z=t4s6&9ARZ6I$`jwL(U!10{llxs{HjZ$_9k|m;^0RMGBtKsKner-na%zdOvu?fpj{?hjZ@8*TvoBndIBD_kgps zn&zkO$i{%A;Je?^YU9r0px!ARqYk`>oBLHXzl4Z9fzb>Bj#C126&$9Bp^ltw$W zm4xg`9Mp{^YdaZ0&~%{3x!sM=Nf8ufv{Ops38RzbH{fsu^2KB)nDg-mw9Hmf7y~!8 zk!OE^$ra3!r$QG!YUGYTH1t)vZ(OWulCL!=7ck@eG5#7;E9QdY9MgdBN%$A9i1V+$ z>b$(=Fm_f>ykWw3MZ9mFcN85uy!8Ar$#bfdRTVUr&s^$)H}xl(&SR)wL^#h@&q9t1 zWN07!HCzt0M<9dP!%&!A; z6w_-k4s^`=wn#zOq)MTyF3gIgWgxhRQejBV!XN0A_btzGo#fNOag_{evZ9IqZJ=tE z5{t&J@OR(lDivtUX-R=d#5n2FH9UfonV|;C&5);M&NtTj%npmzl&&C&89O%dJ0AfOZ@)M!7c2r|}X zTOW~({j@K0hmr&gP_a8omH>@7838Cq9&RN%`P-he*?EJbW)Wye+olOsxxoeD6# zaBZ=_inip_P6@Zn_}=C%$<3t*jot;b10!i%N*l9tsGZs;dmB-+xN_`lik(WD9q=(gLq_7T|3qia%pkFZXhgW~lUkqukv#$rk8cFr~HXjvU 
zuG<+ib4sRkNv}91h-Srt_H6ykS|SYz>O=}$H74AJzF_8mBU1RTl@!ayADbCGw;z&K z7Q{`bDTrZ%P9 zZMUK0ve3Q9Y~HcQS!9Yt{ghA#Q^hM^%kexpT|SKMbz3^LFH1{YN}|Y4joF(*$h2D; z6z$aDw56_~l_if~v<>S1iXI8aDIViOd3LmreY#p& z1^Y2egrMF^wrv{m)-Eo96Y})4w{af}d3RVPNUI!gn}l`y&@8A)Qf`q{znO85O`pn& zHHUXLt^-pAHz?bfwhGouAl8mLBb&u}G&A0(;h9oqt|`{#}Ma)X)oq=>tse_ zR)ElADL*uu%5Qm-V00jg8&d1WUi@5##Mo4P;fz+RwkfMV5=r$df!pFapwc`-odgtH zjN47*q$^m+32O>%gEO+ZzYbWaID7{O?GB{do98B{kcNtKueB=ykP-E5G0INao4EKc ziED^44ovC_vYT*Bm=0kQQmEllv#J-f{C@Olz4Bz?kf0RT#8jQ+j1 zJ>pb8*tdv_VVPY)zNb}Dn)=Mxwc(c|u5zm-CDsMBrkB|!TFv#|1VyFcW2)VbYl{W{ zKnRFCZ*{v5+Nb^#S|YPw%wE8brraC(IG@aBHpx_%Xj_F;G$;D8TiLXy5_PUK(h3U3 zwx7(8`j1MQnM8I-OMQ1=I?1$Y2w%lasANF)-Pq3*vv9sjt|d`bL&hS9ExSxAkMa>G z0@)G>BsQKJ4R)hn2i37hvuGurU^U?xLQXh;6=AW>imo}wfoRBsMjtq0?i zgjTYou)`teSN95~w(bC49ISahw7a^FXp6<`eZ+Q>~6#y4;5LK{y_ z8olP1PE$?k6rUwxjjA}c$sGboN_Pb*8T>$kEiAU~bXr;%GeLCN{DcIR$Xdv{qYwuw zol~_1alO4m?yK}V8(6K7>>SR(YPp1pmvn}Mw5>eaYn2?dBKMsmWSV<+o5pfrZ85|H zuDfm3DXV-#|F{Y}PIYZT0oOckQt=*R($t~@RZ3N+oU-@@j44h7+N9WL%R{wH*WhQU zR6V*Y`SnB3B1+I;pm;Z1G3pHDNI3h$t97f*+fSlpL^0^aC`-;@NY}wcQ}b6*{zyx* z_zOa!v|(Q-Pv1UgZ{i5wrvv)7_SvO?lzfm@=$dCMbzKYR=u~b=32t9ctZFvoJ{gj) zGD#iW#JJooPcGjUrnx z7aVt%t&vbijRY{Y2V__(cp_o}fEF9=;lQ2ve7G7qOcl~oZE0FioP$%naon9@_~Uz` ziU9?(KDpxuCTh2MWskr{E{O|n>ywKD6rn{3Tu021NFoGmNFu{ZNUPbFzDQt^Eh(ti z_pKB~!(~09vY$+PmsDn^E*%*)mg-vM_haJM!{V3X`GY}fzS~_O^XJn^as*bZvd#!K z+TU+n(LcKwr(h)vC$oZ9Q%NmyMUKqfVs8Q_9tglE2w_dkJo;g1=$N&|^`ixlfNL9i za2|&Cr1dw2qv_*fE}z36qAT%UZ6q=A?1><|+x;cRBU*OFMmd30im9CnScTQj_*jMw z8+hCcmzVrKE-6TEq_{vU6)ITdIi0NK>zl-6A|!BtM=98gCEw`(BkK^#zW z)MY37XC}h}G*-{2t*Iz>Y+#39B2Z8drRr^_Zc!oERwOA=Yo}~z-3>$v7y6j8cpcffm|?KIb~xW>uCTCvO<_DFB$vyPFw7?SLTB7x52Q2` zUq}|@Lbc*v`-mhw0SN83jTK;}TSvk?S_ZyCbp&_@-xb!F;oeoesUri8= zs;}YJme2|k#;Y`X;62ibe2MspT2w7+_f@AxO9Q!8rzUKj3{-l8-9_KvV^|&QPZGRCBn@F&!r=l*Tq?a_*_!C}#jejeQ@D`V84- zov3wDz`N2H&U@&urxB^dLp;gN%0vk#+DNRRt2A#v>S^FTejECnF2%K`wlI07wpNqe z;o@xWwALtmV#$zC3pceIXp(FeHsH}OHE>I1KvG;#*Z+FiUi$1ST~mnF<7|*E_&l`Xouyq2|`1z|%{<;sRZUdL)_o 
zd!TjOj)8eU)STGqbEI!@r>?3%BZtd~7hJEy(*gZSU zV*pF*3(w1{jX1w^B%Sn8?grcSCV1LFO7%8y7?xU;#OVY&;%jaxu1v%F35WdXIK&wYoAn>NBs<9RD(1f*)L~xIw~2oMA7Fb zu`R6aI1)Q$l2arW(nf->FQEC3htLi9hcl$U;qd7W!aa>QUEK_m`Ix<;_)NV^xN<5is zpj8uvJfMr7(86xQ1!Ar)D8}`x|J_El)93P5?+rWuy*_FgR<{^t)BCXi*Jrtk3Q&vv zZG($#P?D@CA1r1DH~t@-)9+I|+dpyC+$=Z5LXz>rI*;9m^A)7CBP=JyQ|cv_g94Ij zW-zAVFy?jJ0$~PC2e(0b76ppbF{Ha^ql5Chg!y{NEtR&+FMlfa#9I~^G5VT8k)O(e#*}^;B$g1%9rpj>CTmMgbF8 z$oRS!Ffg;a^ZZF)v4lf_aIUK-FrmKCMrFYE*!rZ2HzXUiHC#It-N98G5G*-2-4a|Q zci}Y~{dCYfS=$2*8gqWbNHI=dmL2XQm?%-(I|9+2=n3?73iNt6Mqn3uohkz?l3}*V zJF^HMOlcgR5Is!1(FOFH&@B+`vf5vvbl5HOXNB*_~M405m4_eyKfSf4uB8 zy0K|((6!pekduv$)wawfIat6DjvTA5a4q&Y*(y~?jcLBo3OaqWK!a`QDO2wlvL}I< znKMa11yp4BjC{DIKhbqA7mEq<$p04jDL+5KGdG%)O#IhZRRloGxkW??s|<^ux-pQz zKrXP$0`GnRUt(?j_3*Lwi+)Bx>hU9%oHkckFl(u zm_BZ$*y*f@!O}G`BdXsN%N7+2DHG)iARCLGommWRCb$F0L3tI$BL>y}oUg8TI7^as zK_0%Wmr%VXS9Jb_PjJ%lJ@tnOj1K-ITVM#(l#7{qb^MM_b`V7Qxds*VCPnw!F#8_+ z>?SBvnF5Ugo7A~7dj)E~5PPuPE);pqD2z|kApFxVl(HrfV?05xlF4qfW z%hE*fPn|=<7fQ=}4N!ML`5`aq7y%G6X}Vt|jXnQoE3cKKd0f1tT~kj|zICCAx?I2A zqJ!XNDMzHPjw@UTK!ZK6b&3xMd`gHelQOy0Zgm#(k z(?es>*kX7+B0m@_b@!;F@zJsGIljN{glqJ(Y#6r&@(#H&*iQfx zdJuj#G{lvg9;A6rV3XUJ_eL+JQ64Lf4ScS~Ms_1A%x_Nre_FfBs4TawOGt-wcO%{1 z-QC?G(j@}Yosv@00@B^x-3<~F5&}x+_i_@ho^w3+yD%7o_Xlguz4pW2d+legHK#(~ z2)*bl_k=}x4M7?9bxw1y2U*xj>2PNJnL5Npb?)(uAKO?#FFJ#aIbGzkKw96b2gb=) zk8n^qsN4Ujw{epNz8(YY3fND9fVlqKL)lMU)v0MIp@^Z}CV`n^_T*vvyg^_hdtRQ- z2+Wd{?l(|gYh2e$11*}8`Xat<>8%qjWL@Ib2I;{|DP6_5eq>A3T*0_Lp|r1{U(E_7 zH;(tpAwlZF@pG?^_ftw*94@8LKHlAW;Qz}CvEn^7Cbq{ zq_EkX_2|)EQ?N~SM;QDRJ8obt`<=^^bVtKEiZPveZM2HxD=IW5`YkEB7TV-r4OdE+ z#He;Akz*FT+K0_wPGF?r{j@c)h&cml6dn6wyG~+>L8M_)V}I!Flgq&l)6L_s7wDB7 zf>}$w7M`Z6188=Z*hkedPNV}(&0suRyJ`FMD25XY`Sx`&tNTec$}71n&!4Z8kVtHs z7caWb5fMhg)Mstg_2ABnzuMTuJdJo+Z|5OT7b_lpg$Ad4D7~iHP8~jhCswz`6|YWW z5V6`CWV9(0iE_4sH!?qr)Nxj}wBTmppPIH6V5Vv9{cK^ZUp3Iz`_nxCF(Nx|dyR2G z#y%wm)H^2ooV+(ZRv#H$+ffFsLe@{GUwR0TuJghGOA9WVxFr7%A59%1gWed8PM8tyh>4seW!W15|S1TDoEpVA}s5l 
z>7g8dlm0+&AepFUi~AC)+6Tm%9tY*voFjF3Cvj^udjDuhJ79uiEP!TQP5oQavlMyN z{WroiLO?5+Ec>qtKp-j+pYnF#8PpSuhtoF#>9xsFGp2Y5@+BCV46Q=PA7=PO9x`PshFY|R^h}vAjm$KSb3dOUYq$U~(&yBcC zCKTBE-wU;~!6u}spc$D5kt!w2DFmL6SfHA+aI(dA&!7g*ba|`iymVzcw4aVUq!BW7 ziDj4!4TzhC6}gdS zn-k5nG(~|@hmhm*WVd+M=j$EFJd`1jl*=>8ID4{VCN@iOZ9K8vK@ph=5ntTBrMKG= z7*wRF@`h|J2hv^lSSs7oKUxpKZr!P+OrW*|>p}o^5JoFV6p+2O22!q;m&!^~(3!A* zy2lEo!Y*p^VstKGUHMrT6ub8{e{=ww1saVh4|7S#ME%cU5Oy2zBt8^-8S9_2eb z>d9uGB(A(0%ppUPO6bpDdHjf-sG@w^cu0g%o~f3q71sK0<=x5cX|>&=;+WsWDH}a2 zh1$z0F`Hm39b(heWUC5+eCzJ{67@v6h2IHu3L=4Rg`*M|ZyW)7*1QYv*~qx{$6T|q zmqeAw???TgWSj%voI?qngAuEDv){HGw`vMSzeRd3Xeq?C|B0lJ(Q2Z2?Ai z7trBy)V~k3=jNJ9(y`?mwcFXCeDHpm*3zk=^La@qyKc_1bCQQ}-aLM8jRm0QIp#!J zEZ;i)lA<$|IkZA2{Pd&b90i@YT0BP3jzuiUtgEae)BLHB|KKS@^ z^jSq2ITQf|k1BlFhIB}ukRT%3C)}gL>muQO(3y#FWTo~i(_9N&3&y#xK7XF`!(WX& zGG1%LUx{R#4#NYgfIxi8GWD&X9?&-9@^*cf4M?s`Q4A9X_zPIATwa0kiX4=qFvWWf z!C5N}bSRD}1HItpJxwkI4<2QrN(sa8!}p-1%M7JOf~mddU~BBQr5hlxKuJ&6&8kvngG>;)FYlqQ6tc+;BX%Yp8RGA~j=&jA;lk z(khcyibKjB=}A@)h7@D*L<%ijKprdES>qmv=@y^v#wRJP^3-QSFThPb<9(Ge-Cc$w z+1~DH+9*3&jJcy+(VxpOEmcxvkTbxIRlK3BOzeH+*!(UuR(2;9_6Ad;q(WDYLpL(I3<$10kP1!A^|h0npsVMF zp`Tg8cmQc^RLkq&A|K0UC=8)iJ;U`leZ8$1{ddGuJ$n4_LSc|aEa5%Z_WRiK#9LVg zOX=24qgg^ZPLXjMI#^%(HPp7XGFIrAol=@o99bmQSsNhMs6BUgLVWtH>X)98f8HX^ zg&GQm0}d=sNPk-d0^q9O^UBSB<9|QqWB&y|`zNlJrT~d7vI@%GlJjRk=YZ8QkgpV$ zoxJ8XCoBu zyJo|2;&xFIl$x-79+K3voQ=#1f^ zq%QII(8HxSv88@Sx!svfW!VXl^UKPz^ya>3WmgW3>L?-su8Yo$oOFlEs@odYvw%!C zX+p3iX@ZBi$V^i3xU0RMD9-OJ3Yc7az)_CTeO_Cp?Lz_!{g4Bm%ZvABwxuR10-tN( z1Y0<)HHlU?-up52d%U&^U!Y9G-in~S`X()M8BM4l0t?7<(vs$61CZ~YgvZ(idBaI& zh8ie|)6_HWAw`5x_WL2xedhGpk_m7Nt^0M{mJR&p?^t#0srUD~>;DBYI2Othd8FUtZZ>!&SO zG9jt=ID@)#sfDJ_)F?D;)6CA;d0k`Ze&US67h=Yu&g#(U=(nWy*R9RgN zvOc%U#1ft%R851No8uN30WgHSH^_nSX+vYBE^ThqJjAQPVyd^Yg$H9zHiAKg)B;lk zwKl!t`sYN-O~q?97Dn~?&1b!ruuchz2z>`&k$070ShNLXh4A9RvBBWf?D$0#d;>wT z0I4i-vc_P(t5`>aYb4<{i8|cOA+K+HJZ-}Egm;r$X$Yw1F5#d#-KVrk-@75kaxyY^azs`(eqfoq2QAdLn 
zo2gV(S=5%|&?JPxm{&usM-{qJ+bMg8r-l7_EYZKio{G>C^yW;T4&xNjDHtz|lzejj z6i$7&tqHE*W71tD1&dihOKiP0KpfsSn;B_jjidE_w(zFX2(cg&$q5L2`t*BU63Gm@ zILEap)@jC)ebEJ-NnS5T$3t4yMW23aVsYc88hK>L54X>s@b(}m2wd7QlStJagSiEb zTDElLVsZ25Y0c$u`(f3GU+VPpSmpLDNnQ!*Y1XD<@Ujt`3_A$%5pYu;EcV2683(Kh zEkeXeOJ$~o8?s?{FtB=KaEWbgO3dyFV!FZ7n2SLGa#<7dMXv~1dqJKT!GX4kDW%vN zSRVzVBF8<3T%vGJkQ91?A_xw#tMgqsa@Woe*ac~BGOILGXj?#bFj1GbkM^_Pt2eGe ztdpFW`%-ftvE`GeE>=!PKVf91Onr&3h3^a7zg>#r&Fs#9B{eZXpVUw9JlNrC4J*Ak980PA z6XG2`2#T<;%d#o$O!t|r%lNZdu#3&SV~frObA?Y*8KRX#kza^LjE@ipIWNBSdVhHG zpLCQ-4`arU5I{g9s6arn|D!X={r%=oB)7lSP0%7?k$@A^FB?MJOXCpwnIHqouxc2p z+X712yAln_UqDzOtczp4WCmF;{`{tCXuLAKZw;_kno>IR&t_CPEII%RC%kW&czBv@ zOc4yDUxf8@B%J{YCu$C858&@E4-(jbxqY;FnT-ZR37C)A$yY7#7cr0rpr9o1NwPa5 zXbEY$zv*K1iqm`KA@%t5y1itj#Y0{@@$r+Ob6(B#!@n$*NV!1t-0P4#R64q>faYHB z7&(GD`Z^$a4Lt%~u;uQGM+a#^cX^1Uix#OTbsqk-WJLwvd5s0C1ct07?l8dbBcf&G z2g<3@Oy8b)E{O6zJRVf?UL8tZ?j(6eOctR-4(xrq<(Nuk&gZ8th_c7qqTHF;aodSl zm`aBAX%S%->y{*0SPWHw^dUtlD>uGB`{ihOuY{14B^my94Qo9bD?Ac+KvEx>bo;Q1 ztEHQ;I6fn@5~f7n+PR-$_9TY)Faj$p6bqcvgwk6hO;Max+z~>WoVOpzYJG;@Lw(6| zlO|O@GpDSgOU6+MS6U-j_?RUY*J;r;+SyRC&Dz6U-O!Ney@ecYWyOqClMzboUs_~A znq63-wN1Qw#=TqRScj|4broSp8m)p+LDsguGRNYk&v7`kn{5U`0=2(e?JGoGgdwTO zqT*}E*Qnb7sREjdi=kQg@uVphQ=yZP^|nT2+?*KrviueYEnRdvB=^wpzv(w9H{ey97-a z=R4(Ea7w6l@q9ZyN=<@XEIU<_hv)2VwDoei`RXl2> zZ@=LBoMajwUCDt}M#&|@!JZp|R-$mGkN<&;;XTPWLT-EfcFe7?zFg$fMVJ#rp5ze9 z?rfPIMJ0Dc6`T;Vln=dZ>1N@1^ZCIc$~DwQE->0AdmWybug1t6)W$?->w}!H+G%DC zFzN}{2@*FV8gvtUHb^lM_Xr*1H{0XKLY9znjT`-Ipg`{e8Ohe|j(c6jfR1Qs2hM2& zpu$}wlraxhJ!rp#_JB+K8XbzON_7g`-SKw^o{Ix|(7A_EG2Z&|Ilw`RpNEFY9p;CP z*ow4oG2Vt)V9FVax2s^z9Quc0YQLeUA`axhoMGtM7wFqtrdkaMqhp{;IbmzRy;`GQ zKd9|{p^-mHgj0JrNz7e6T5n>WnWVyDLhoO23BUwb1i{Dy`d?5F^3@%4V7gTQLlp+dCOU4)Fj%s8BH`q?!3g6$c6UtN26_Cd) z(uyFtaSwBJiij|y-)?NYqrypewX5Y_(x<;_VnV|xb5c%fJ(6h5FO`dIiLa(f8z8m4 zmyPPt{6`o9{uQZz%%=4DisWK>ZoKU<+}GH+ydia)`E6qXAQp5hEZo)zN(Njp^)Wn!%fedjrPwm zOKs+=xt!-jzGyt1a0#NAYhqPQF6Xh(#1Fm}xEl93R_x+EpPs04bPA4|#Zh8P%IN-j 
z#%pA3$`oI2R_fbwnhad4KxS=}78lI5j8*3Wbgd-<(RRQawf*e;!pZYh?3qMiz$QQ8 z40V68CiBNq*?8dCfaK*B2QRC4ooseqVo10U?+~9_IMY_1`3fd$lFc0AoY3XL$wPtf8#y%RS^rE45;33W_C$E2Gb))7rrt+~hch zgjWj2-rlvxfK!b!AL)!^AQGYJo(JDUjRT2$y(^rMwZ6_>&XboWhPqpoK|c9L$Z5cF zc@a(!geMZ(4Orzklhmm~W=*`vt25o1X+aQi@hv;iFg-*@PRj2Od>IODA0lBK*lLF z-agDwx)t;*!9t}umdPO_Wc)dtegi3ZEr(gvD4#07E%FcTX`H_XXewdhjtOtzt+b7bOI1>Iib*~)Rc{>nb zfIwGYXA0C7l{mQucpEo9E6GwYIH zu@UwXMNn}-BO~0!_WY)B0vzuR1U3nGm2lR<>XmFhJ*FhE3Xf4sRs#Oh(;=fRTMCrn zjQqIQ4zA{7q}9AN>iDvb_2F2T5L?Il+|q;OC2~BTKwaZzFro-1@VvvZ7$e$+aY=Nl zSJXh>N%J}!+}vP&9$?lp=(aRuiMUV}1L^j$eSyJjqDA}cn$&X}+Rt2$C~j-8gcC{k z{BB%=q_%6|Z944m+>euOw-x?_x zhAb6J5Jk~7?$&d#-}thk;r4R>8#>TgnT80|j5c+Hx?P!Q;+vyZJfv-}m28bcg&29y z!pd?vjQQC`^4Mx=F(gyaoQ)6E)&tq3^N7UXyrD!Y>5$@HqwDvbs_d+RPmJWm2ScA# z?{YP0;9x8fyTT2vsFI8;=lWR+Ouic22P6A@VMx&4e0-H@G}wCZA>;U?sOR+O%ciol zyNWMO^CeD}e8%{Qxh9q>)P*=slf563sG&9@`r+uLdzUT}4w> ztg7lJ%r!N`S8Z#-d*M;jt6p(FEoXOD6q(ZMF_UDIv(6?~$x>YIZl5xIo~@qZTdJd8 z$T+vq6uR~Wq9Hopb;Q@#N|@hBmF!QF5mk!0c_ukEFcMPV#ygY=xY{Z1BP5Z#sQU%> zpc8Ymt178`ceF6shV-EH_u`{ZfVC%p`C&%5r%~!T>>_Wk2c4el3=6mgPT;&+2V?hj zxKQZ;8Ak7O2{ASszM_x5ycI7{%I2)4*K|Tma7{C?4(^Eq)$Y;&pPoIWwoEj71uyo^ z$_4R5ZKJM%De8<=+`;{$_iLFx^?%4jKmi^-84+ax8VOm^U#r_=e?|iW*=OkMI6!@+ z2=D}>JTxIAAS)p%qNGeKBl?G^!%Y2m&B*uD|^k8&H;n`48wV*zNeo+eL1L+3Kc_$}tr=BWhI7 zfZ@2$=IEu$=$rJ2TVQ+Y$6r)8Vt*7IBWEq$acVA%&!>GCkC>*2euEifxctcruKR1x z1=Z&84YnnMyn_wFQIEgk+X*zCv5ws@R@WueT~L4ici+qFR93D9ya2ok)1?L!JUxt$ zw1kj|tdhtdQaeA0&61{;!$LQz=UK_DeXf!Pcy21Q1@=~O*{6UaAq(L;YqFR?(o%vN z@X}sdf{>dNo`fdvY-5*8Rhe8CiM_dst*@yEDP1r}buA)v1F1*{PWhbgb-BNN;)`{f zcU;@O*cH5%an7oQI6B##`N&;o&IWsvtX6V*b)!Ab>R?n;g0OR| z-ukxvIny`tB&eDg8DdRRE{0YK2oo)GW3}LMN7NFesxm{z0tD5QX=S5eJ#<|ZUM&kY z^(M@tSA?l~wo)p=Ljr{=eS*y(7t>)2spqnZSiI)0!?8vm>Q~k2jO0+LWL}rLHDLSA zx%5NJL!s@g5`1Fc3!A#jy=hEwpbALEd16&xg@op;<<`4EsKo!mgt%DZI2g$&5Q1bs zE{aqFO4AE-MT4%p6;xNC2*(Bv9JkN{ny?5OA4{>mbz-H)NUg1Wyt3$XBgtL*?5;tN zy=zCEs~Pp@vB+vR1cttXvDG#)COQ*QF}pQEy|0QDYSlnz8iz>FGPrJC2*HUZp=5iu 
z8c}Y8sr5yuxaw54xff@`H-@QHgKlKUj@qs&OGxJGOir<~pt7~iNBtJpo$JLUs*2l^ zn#xH-n$J&94(KenKsOCKAOzM|z)LwVffkM9V2}hpi8|@~Df&{8k2CSa47_Gtm)368 zyiQ2f+0+fZ8zp;MkEIZg`qJGMcn7g^3LX`>NoyYmjJpey9Jp?SWOBMx3yKo*^Aw(# zrj1Hbq5sD5a~ZTdKds8#aa(2y4OE=MhH}h4lqkPUCSP;M4|XnxSxhITJ%{#{gzPyW zyMc?u0=lq#@_yGPC(f0}T(vn0nC^>Kw_q8z1@(LI*SNJtRup`e&o$4>7W?2U;h{hj z#yH#+1H3YAu#onsac%m&NF-?xwmv#n2kkZfBWttkmQQYE7y)wvO`8sky z#!x^@V|inZJN><^^}$`)+p)Ev@urPcIbhYbdIbfv=+*$&DtIOf9mAyv>A3AF=xZ^d z&1$9Jh)I-;P80f|)%tfJO4odm%n{hQB7hdx8{w`$H<45e$h#Bca+YK%7ur4z^9;i4 zezq90=wDa*@w0t`H!%BiFD#(97E``tYKADj7Vc?~U}&tY?jVTELoZoh8mMo?`ic|o zO3e;3$D))<+c-fqq+DL3M1smr#T?UhmR&TS&?hw=h&PsCsa7fWTq`$KqEMQnbBcZ_ zw1kg>SR!X!1kVIp&yFdohn1ZH60VLXMGYqU5IvShE}7vJn;$dtktp^G>-F?{4+nx0 z>jE9tD7kiCj>1zE)+4Ac+l{y4QO2{ZmC&BAB`=7iLVy)Mwoc1>5vOP~PQtw1!&-X!XrDyY`sB^ux8)cUUNx-EDWa57eFVLg;shqz z0Y_)!rxegsEhj>?hhWQ_-avq7aZy$mlq`t8g>vRo>e!d|}$B6M(Sitl}CARIN zy`%DmMNjh!9=Jjl24ZIKo7KA8K<5m>W|vPdJ}p3L||u1jeXN# zIsn#zFyOjK5843p&x>Sr3g>gf)5;6o(BEuR zy2Omjo>~|-yP-mbo!_>6ZRN~fN(9wcgl$*hyXPdUDKn>`#s6+}i~`qEFYO%t3(~y) z1f?D#@U2(0u&UfZ1iBxnUEuBqn!FJbn}U>&hLOdPRo1leESG$zg78t$Mx2}`qgtv0 z)}*)61Y6i=p~|I1HPzj93C8r?q86f`XlzD#yADC))@_^P+7|7VUy|#6;Kj&L|2FDb|n`eEL?or1$OI1kjz$Csj0HVF&zSG zRp^&W(a})QV}v=$eAtl$!`lw9`R56V*jZ*#0{tohkTlAe++;~HOvCW`wnq}OQYVw< zQ=~qs$1OJ3qf-@ALpNXdPhj61*b%%kn!n(QvNyZ>G#4k(`c|5wjQWBK8C6pbiZ>SZ zB$0A^Qlo;1LL%jxsUdeBBIBXJHe^@AEDN&3a=Y2SWE(gm7AbDE(nahtox~h;CnT!} z@?@+)kMXW?hJ5ifTx0B(w97gGh%jA)gMz3&P41)CDn+{*geDc=%%>Kz2kBFtD zS3Y;=e37V=Q`+=fnN@E=`)jQ|m#;yC*-xgL{F_OTp{tug#Xgy3dP4VLtK`&MT7kYu z6_y4<`*P>Q70D0|ixYsMA5hUJJn?M%Z6d_%5&P{dz1)&k3X8(dv!Lqn(ln^nu@x!@ zFzBPLUX||X6wzr;gE}(|cdY%?Iz~f=Ii2YI5p$*N%KUr{Ztk&=PZM>Ck|a-ShI3y^ zc*iUhNT5a(Vw?2|=P*Q&8NHp&$8iEd4s@3qifDma(Pb&)XG6#5%*%{O$;C6sQZ%}f zLaKmy!~T&Sr=r&?F)q9;4M&GGHy$%zM2a=;ViX}8;__s6fO^+Qh=iO!~n~f_6VDyVFj2{ouOJ-Yx^_B;Zt6r zJYuEDfov;#L$8BTb|%*R|<_2#ay>A?-h0##4G+gg+4vP?Z3uSUc7e(j?J&j6KKkPOOJ zI{DHlG)~{EQG7@$$p^L1fcH#C@dU)FE2GUWG>I>KeIVF@&aN0NichPs%bo|Bk9vB* 
ziT+E^#!Vy}{+F(vuB9bIr?f}Wqz74@5dIepkKX|oqbF~Xn*z8uP-PE8}eCu{IakA5lz^C)E=?1Y< z`?}PRM`$NZZjDhmT?D8KpD7yr2I8$=g?*j_X|gOc0@e3)+%Sez2R8S>OR<=+qAgc zvI~)23#Ngxvjh%zc(N{$gQ~^t0NRlR^_O6y@2m-U)WO}SMoW<${baeYtNfCwA=3%* zF&!-z%=Pth>Z#XWxG-RDboY&zt>5?rgA5;IHxUYo^}n#gLp-2qj|Xk`UfFiOcK4)r zlkJCqH}M(3lEJ!+xaAM- z*ZmfIJAB=8z-3dt%c!VZYkgfbp^!r~4D$ke>7|)zZIT=;ZCVrZQ_a;f><-`tmfZPW zvUj+iKABdRx@)lr9Bn}%Gm30k8I?Chb>2+hp1}12+4vpW`=XCv9xiE9OBO+oZadKXM zIJaBfDmakJzzvoovGu7u)wH;E!BWH(L|zQ5zgXElD6JJ2l3Q3b1j ziver+J3&O7m;R-`g9#;?E8K~6%`M}=0!F?^S#w6@okhmX7;{X>DZ^qmm5wsb*hm~T zEod2p#dlcV%xx5(rKHK1GD(ExVlt^=uca|wk?An;8nrt#$Gq4vPnMh-Ek1S+jlsiWI}2YL(iTngih2QZLai*a*hJiL|I|=UGs)^hp~$4 zuzQuLsi~`tG=Yn?&{^B8MsM(aOoy@11PXFHETQ{U@@3T~yPtQNwC&>!IxWTZbCd}S z!SpHc@@_$FUcY?NXwD1G54rXr&`(IXd~d;RC{cws_Y(BX5YNpSS3fKfK0(eXIJhjA zb)%&%mOcwSZ|0BB5;#Ag1#{Y+%;ii+M}Xo#%ED{r>g{)SW(%!bgZJ8sJR*usa#IDe z1p&M|q02kB;3J2lkR7pKGiUOCxp-u6&XRZ~6O*1G; zTz0=q(36*eYdX6)aO&D0Y0dG^tfiZ5fMh9_zrE)Fk=9?x~Q`j=T&MRI} zXqFTp7u|yJjP}s`e0XEyjDqNx81jjB^YuCRsO!=NYiv4$sB$(tk<*Q-&;%m~=A1u& zyb$EsNR#~~UhS2o#|@HlE~#Go=({z%+f*i$nD9y2kGAbnzSrLGG0yMATR@MSj z+Qzj?Apt{2FX@S@o3&=ksb?GESF?9b9>ZPLAn2M9@1qp|gkG#TG1PApmE)oatU&!W z^QdgHG4wXdZOb8o%D1WcIv_JIt>-2xGNBg_S`|0Nb+^zpZ0Dl?V?;ihPcm1b2?LQCwdpXMAf!xnz7oZd4 zKLWY$UJ7t@FHZTx5&!x;elJb=&=C=!kH$|p`hCE^IQ)TD@C zzYF@Gz#hwRJp@LM{BuG7BNUH;{Z7jHIT(*6njXS<10W0i%-sJ4jK30W`Z-vSwQe55 zS^<3cf5Q3;<(r>F_E;+9Au=wU|2ML~5Dxh{WRLX~9wJM}|9>O{vohy zqW?Fr`xnFCDXsh+t{0`h0k`Ok&yaj&w6;6mB|=ivTb+#a{ZcnGaQ z@V|xjMtY^)1y=drfc;Iy;9ovlk7?NtabaowZ*V;%Y5!cT9&@-JV)8cn-(mUz zqwDYB{kmm;&-3{m-u;$(Z36U=^#d?>&-(eB9qm5Pe$4s#)f|BS1K82{Odptg!~gwXqUpJ!f*)k6;>zO^U{0DdDe)xZn_xsDdhgQ*C9$5YTYVWU%f`J3Z2t)z+)&t}k KGk`w@0{TCAtviPR literal 0 HcmV?d00001 diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml b/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml new file mode 100644 index 000000000000..bdf2bb090760 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/resources/hive-site.xml @@ -0,0 +1,56 @@ + + + 
+ + hive.metastore.integral.jdo.pushdown + true + + + + hive.metastore.schema.verification + false + + + + hive.metastore.client.capability.check + false + + + + datanucleus.schema.autoCreateTables + true + + + + datanucleus.schema.autoCreateAll + true + + + + + datanucleus.connectionPoolingType + DBCP + + + + hive.metastore.uris + thrift://localhost:9090 + Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. + + \ No newline at end of file diff --git a/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties b/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties new file mode 100644 index 000000000000..6f324f5863ac --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/resources/log4j2-test.properties @@ -0,0 +1,38 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +# Set root logger level to OFF to not flood build logs +# set manually to INFO for debugging purposes +rootLogger.level = OFF +rootLogger.appenderRef.test.ref = TestLogger + +appender.testlogger.name = TestLogger +appender.testlogger.type = CONSOLE +appender.testlogger.target = SYSTEM_ERR +appender.testlogger.layout.type = PatternLayout +appender.testlogger.layout.pattern = %-4r [%tid %t] %-5p %c %x - %m%n + +logger.kafka.name = kafka +logger.kafka.level = OFF +logger.kafka2.name = state.change +logger.kafka2.level = OFF + +logger.zookeeper.name = org.apache.zookeeper +logger.zookeeper.level = OFF +logger.I0Itec.name = org.I0Itec +logger.I0Itec.level = OFF diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala new file mode 100644 index 000000000000..322d50a62127 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +class CompactProcedureTest extends CompactProcedureTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala new file mode 100644 index 000000000000..b9283d996cc6 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateGlobalVectorIndexProcedureTest.scala @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.utils.Range + +import scala.collection.JavaConverters._ +import scala.collection.immutable + +class CreateGlobalVectorIndexProcedureTest extends CreateGlobalIndexProcedureTest { + test("create lucene-vector-knn global index") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + val values = (0 until 100) + .map( + i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + val output = + spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + .head + + assert(output.getBoolean(0)) + + val table = loadTable("T") + val indexEntries = table + .store() + .newIndexFileHandler() + .scanEntries() + .asScala + .filter(_.indexFile().indexType() == "lucene-vector-knn") + + assert(indexEntries.nonEmpty) + val totalRowCount = indexEntries.map(_.indexFile().rowCount()).sum + assert(totalRowCount == 100L) + } + } + + test("create lucene-vector-knn global index with partition") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY, pt STRING) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + | PARTITIONED BY (pt) + |""".stripMargin) + + var values = (0 until 65000) + .map( + i => + s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p0')") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + values = (0 until 35000) + .map( + i => + s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p1')") 
+ .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + values = (0 until 22222) + .map( + i => + s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)), 'p0')") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + val output = + spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + .head + + assert(output.getBoolean(0)) + + val table = loadTable("T") + val indexEntries = table + .store() + .newIndexFileHandler() + .scanEntries() + .asScala + .filter(_.indexFile().indexType() == "lucene-vector-knn") + + assert(indexEntries.nonEmpty) + val totalRowCount = indexEntries.map(_.indexFile().rowCount()).sum + assert(totalRowCount == 122222L) + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala new file mode 100644 index 000000000000..d57846709877 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ProcedureTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +class ProcedureTest extends ProcedureTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala new file mode 100644 index 000000000000..255906d04bf2 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/AnalyzeTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class AnalyzeTableTest extends AnalyzeTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala new file mode 100644 index 000000000000..b729f57b33e7 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class DDLTest extends DDLTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala new file mode 100644 index 000000000000..cb139d2a57be --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DDLWithHiveCatalogTest.scala @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class DDLWithHiveCatalogTest extends DDLWithHiveCatalogTestBase {} + +class DefaultDatabaseTest extends DefaultDatabaseTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala new file mode 100644 index 000000000000..6170e2fd6c5c --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class DataFrameWriteTest extends DataFrameWriteTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala new file mode 100644 index 000000000000..8d620ece8245 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DeleteFromTableTest.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.SparkConf + +class DeleteFromTableTest extends DeleteFromTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class V2DeleteFromTableTest extends DeleteFromTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "true") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala new file mode 100644 index 000000000000..c6aa77419241 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DescribeTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class DescribeTableTest extends DescribeTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala new file mode 100644 index 000000000000..ba49976ab6c0 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/FormatTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class FormatTableTest extends FormatTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala new file mode 100644 index 000000000000..4f66584c303b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/InsertOverwriteTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class InsertOverwriteTableTest extends InsertOverwriteTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala new file mode 100644 index 000000000000..c83ee5493867 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/MergeIntoTableTest.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +import org.apache.paimon.spark.{PaimonAppendBucketedTableTest, PaimonAppendNonBucketTableTest, PaimonPrimaryKeyBucketedTableTest, PaimonPrimaryKeyNonBucketTableTest} + +import org.apache.spark.SparkConf + +class MergeIntoPrimaryKeyBucketedTableTest + extends MergeIntoTableTestBase + with MergeIntoPrimaryKeyTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonPrimaryKeyBucketedTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class MergeIntoPrimaryKeyNonBucketTableTest + extends MergeIntoTableTestBase + with MergeIntoPrimaryKeyTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonPrimaryKeyNonBucketTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class MergeIntoAppendBucketedTableTest + extends MergeIntoTableTestBase + with MergeIntoAppendTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonAppendBucketedTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class MergeIntoAppendNonBucketedTableTest + extends MergeIntoTableTestBase + with MergeIntoAppendTableTest + with MergeIntoNotMatchedBySourceTest + with PaimonAppendNonBucketTableTest { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala new file mode 100644 index 000000000000..635185a9ed0e --- /dev/null +++ 
b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonCompositePartitionKeyTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class PaimonCompositePartitionKeyTest extends PaimonCompositePartitionKeyTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala new file mode 100644 index 000000000000..ec140a89bbd3 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonOptimizationTest.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.{Attribute, GetStructField, NamedExpression, ScalarSubquery} +import org.apache.spark.sql.paimon.shims.SparkShimLoader + +class PaimonOptimizationTest extends PaimonOptimizationTestBase { + + override def extractorExpression( + cteIndex: Int, + output: Seq[Attribute], + fieldIndex: Int): NamedExpression = { + GetStructField( + ScalarSubquery( + SparkShimLoader.shim + .createCTERelationRef(cteIndex, resolved = true, output.toSeq, isStreaming = false)), + fieldIndex, + None) + .as("scalarsubquery()") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala new file mode 100644 index 000000000000..26677d85c71a --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonPushDownTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class PaimonPushDownTest extends PaimonPushDownTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala new file mode 100644 index 000000000000..f37fbad27033 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonV1FunctionTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class PaimonV1FunctionTest extends PaimonV1FunctionTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala new file mode 100644 index 000000000000..6ab8a2671b51 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/PaimonViewTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class PaimonViewTest extends PaimonViewTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala new file mode 100644 index 000000000000..412aa3b30351 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RewriteUpsertTableTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class RewriteUpsertTableTest extends RewriteUpsertTableTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala new file mode 100644 index 000000000000..da4c9b854df3 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowIdPushDownTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class RowIdPushDownTest extends RowIdPushDownTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala new file mode 100644 index 000000000000..9f96840a7788 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/RowTrackingTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +class RowTrackingTest extends RowTrackingTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala new file mode 100644 index 000000000000..6601dc2fca37 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/ShowColumnsTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class ShowColumnsTest extends PaimonShowColumnsTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala new file mode 100644 index 000000000000..21c4c8a495ed --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/SparkV2FilterConverterTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class SparkV2FilterConverterTest extends SparkV2FilterConverterTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala new file mode 100644 index 000000000000..92309d54167b --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/TagDdlTest.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +class TagDdlTest extends PaimonTagDdlTestBase {} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala new file mode 100644 index 000000000000..3a0f56cd4820 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/UpdateTableTest.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.SparkConf + +class UpdateTableTest extends UpdateTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "false") + } +} + +class V2UpdateTableTest extends UpdateTableTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.write.use-v2-write", "true") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala new file mode 100644 index 000000000000..94e9ac683f02 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VariantTest.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.sql + +import org.apache.spark.SparkConf + +class VariantTest extends VariantTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.variant.inferShreddingSchema", "false") + } +} + +class VariantInferShreddingTest extends VariantTestBase { + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.paimon.variant.inferShreddingSchema", "true") + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala new file mode 100644 index 000000000000..7ac3c5df0d00 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/VectorSearchPushDownTest.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.sql + +import org.apache.paimon.spark.PaimonScan + +/** Tests for vector search table-valued function with global vector index. 
*/ +class VectorSearchPushDownTest extends BaseVectorSearchPushDownTest { + test("vector search with global index") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY<FLOAT>) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + // Insert 100 rows with predictable vectors + val values = (0 until 100) + .map( + i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + // Create vector index + val output = spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + .head + assert(output.getBoolean(0)) + + // Test vector search with table-valued function syntax + val result = spark + .sql(""" + |SELECT * FROM vector_search('T', 'v', array(50.0f, 51.0f, 52.0f), 5) + |""".stripMargin) + .collect() + + // The result should contain 5 rows + assert(result.length == 5) + + // Vector (50, 51, 52) should be most similar to the row with id=50 + assert(result.map(_.getInt(0)).contains(50)) + } + } + + test("vector search pushdown is applied in plan") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY<FLOAT>) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + val values = (0 until 10) + .map( + i => s"($i, array(cast($i as float), cast(${i + 1} as float), cast(${i + 2} as float)))") + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + // Create vector index + spark + .sql("CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + .collect() + + // Check that vector search is pushed down with table 
function syntax + val df = spark.sql(""" + |SELECT * FROM vector_search('T', 'v', array(50.0f, 51.0f, 52.0f), 5) + |""".stripMargin) + + // Get the scan from the executed plan (physical plan) + val executedPlan = df.queryExecution.executedPlan + val batchScans = executedPlan.collect { + case scan: org.apache.spark.sql.execution.datasources.v2.BatchScanExec => scan + } + + assert(batchScans.nonEmpty, "Should have a BatchScanExec in executed plan") + val paimonScans = batchScans.filter(_.scan.isInstanceOf[PaimonScan]) + assert(paimonScans.nonEmpty, "Should have a PaimonScan in executed plan") + + val paimonScan = paimonScans.head.scan.asInstanceOf[PaimonScan] + assert(paimonScan.pushedVectorSearch.isDefined, "Vector search should be pushed down") + assert(paimonScan.pushedVectorSearch.get.fieldName() == "v", "Field name should be 'v'") + assert(paimonScan.pushedVectorSearch.get.limit() == 5, "Limit should be 5") + } + } + + test("vector search topk returns correct results") { + withTable("T") { + spark.sql(""" + |CREATE TABLE T (id INT, v ARRAY<FLOAT>) + |TBLPROPERTIES ( + | 'bucket' = '-1', + | 'global-index.row-count-per-shard' = '10000', + | 'row-tracking.enabled' = 'true', + | 'data-evolution.enabled' = 'true') + |""".stripMargin) + + // Insert rows with distinct vectors + val values = (1 to 100) + .map { + i => + val v = math.sqrt(3.0 * i * i) + val normalized = i.toFloat / v.toFloat + s"($i, array($normalized, $normalized, $normalized))" + } + .mkString(",") + spark.sql(s"INSERT INTO T VALUES $values") + + // Create vector index + spark.sql( + "CALL sys.create_global_index(table => 'test.T', index_column => 'v', index_type => 'lucene-vector-knn', options => 'vector.dim=3')") + + // Query for top 10 similar to (1, 1, 1) normalized + val result = spark + .sql(""" + |SELECT * FROM vector_search('T', 'v', array(0.577f, 0.577f, 0.577f), 10) + |""".stripMargin) + .collect() + + assert(result.length == 10) + } + } +} diff --git a/pom.xml b/pom.xml index 
0db5ac8d4560..222d81df790a 100644 --- a/pom.xml +++ b/pom.xml @@ -424,6 +424,7 @@ under the License. paimon-spark/paimon-spark4-common paimon-spark/paimon-spark-4.0 + paimon-spark/paimon-spark-4.1 17 From db3a420c1bcdefd9c82b1309e737e27ff0d6601e Mon Sep 17 00:00:00 2001 From: Muhammad Junaid Muzammil <4795269+junmuz@users.noreply.github.com> Date: Mon, 16 Mar 2026 08:30:40 -0700 Subject: [PATCH 2/3] Spark 4.1 test changes made --- .../paimon/spark/PaimonCDCSourceTest.scala | 252 ++++ .../apache/paimon/spark/PaimonSinkTest.scala | 365 +++++ .../procedure/AlterBranchProcedureTest.scala | 97 ++ .../spark/procedure/BranchProcedureTest.scala | 184 +++ .../procedure/CompactProcedureTestBase.scala | 1324 +++++++++++++++++ .../CreateAndDeleteTagProcedureTest.scala | 224 +++ .../CreateTagFromTimestampProcedureTest.scala | 180 +++ .../ExpirePartitionsProcedureTest.scala | 760 ++++++++++ .../ExpireSnapshotsProcedureTest.scala | 284 ++++ .../procedure/RollbackProcedureTest.scala | 228 +++ 10 files changed, 3898 insertions(+) create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala create mode 100644 
paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala new file mode 100644 index 000000000000..9b9393be7118 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonCDCSourceTest.scala @@ -0,0 +1,252 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class PaimonCDCSourceTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon CDC Source: batch write and streaming read change-log with default scan mode") { + withTempDir { + checkpointDir => + val tableName = "T" + spark.sql(s"DROP TABLE IF EXISTS $tableName") + spark.sql(s""" + |CREATE TABLE $tableName (a INT, b STRING) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='2', + | 'changelog-producer' = 'lookup') + |""".stripMargin) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2_new')") + + val table = loadTable(tableName) + val location = table.location().toString + + val readStream = spark.readStream + .format("paimon") + .option("read.changelog", "true") + .load(location) + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + readStream.processAllAvailable() + val expertResult1 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2_new") :: Nil + checkAnswer(currentResult(), expertResult1) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1_new'), (3, 'v_3')") + readStream.processAllAvailable() + val expertResult2 = + Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row( + "+I", + 2, + "v_2_new") :: Row("+I", 3, "v_3") :: Nil + checkAnswer(currentResult(), expertResult2) + } finally { + readStream.stop() + } + } + } + + test("Paimon CDC Source: batch write and streaming read change-log with scan.snapshot-id") { + withTempDir { + checkpointDir => + val tableName = "T" + spark.sql(s"DROP TABLE IF EXISTS 
$tableName") + spark.sql(s""" + |CREATE TABLE $tableName (a INT, b STRING) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='2', + | 'changelog-producer' = 'lookup') + |""".stripMargin) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2')") + spark.sql(s"INSERT INTO $tableName VALUES (2, 'v_2_new')") + + val table = loadTable(tableName) + val location = table.location().toString + + val readStream = spark.readStream + .format("paimon") + .option("read.changelog", "true") + .option("scan.mode", "from-snapshot") + .option("scan.snapshot-id", 1) + .load(location) + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + readStream.processAllAvailable() + val expertResult1 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Row( + "-U", + 2, + "v_2") :: Row("+U", 2, "v_2_new") :: Nil + checkAnswer(currentResult(), expertResult1) + + spark.sql(s"INSERT INTO $tableName VALUES (1, 'v_1_new'), (3, 'v_3')") + readStream.processAllAvailable() + val expertResult2 = + Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row( + "+I", + 2, + "v_2") :: Row("-U", 2, "v_2") :: Row("+U", 2, "v_2_new") :: Row("+I", 3, "v_3") :: Nil + checkAnswer(currentResult(), expertResult2) + } finally { + readStream.stop() + } + } + } + + test("Paimon CDC Source: streaming write and streaming read change-log") { + withTempDirs { + (checkpointDir1, checkpointDir2) => + val tableName = "T" + spark.sql(s"DROP TABLE IF EXISTS $tableName") + spark.sql(s""" + |CREATE TABLE $tableName (a INT, b STRING) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='2', + | 'changelog-producer' = 'lookup') + |""".stripMargin) + + val table = loadTable(tableName) + val location = table.location().toString + + // streaming write + val inputData = 
MemoryStream[(Int, String)] + val writeStream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir1.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + // streaming read + val readStream = spark.readStream + .format("paimon") + .option("read.changelog", "true") + .option("scan.mode", "from-snapshot") + .option("scan.snapshot-id", 1) + .load(location) + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir2.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + inputData.addData((1, "v_1")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult1 = Row("+I", 1, "v_1") :: Nil + checkAnswer(currentResult(), expertResult1) + + inputData.addData((2, "v_2")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult2 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Nil + checkAnswer(currentResult(), expertResult2) + + inputData.addData((2, "v_2_new")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult3 = Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Row( + "-U", + 2, + "v_2") :: Row("+U", 2, "v_2_new") :: Nil + checkAnswer(currentResult(), expertResult3) + + inputData.addData((1, "v_1_new"), (3, "v_3")) + writeStream.processAllAvailable() + readStream.processAllAvailable() + val expertResult4 = + Row("+I", 1, "v_1") :: Row("-U", 1, "v_1") :: Row("+U", 1, "v_1_new") :: Row( + "+I", + 2, + "v_2") :: Row("-U", 2, "v_2") :: Row("+U", 2, "v_2_new") :: Row("+I", 3, "v_3") :: Nil + checkAnswer(currentResult(), expertResult4) + } finally { + readStream.stop() + } + } + } + + test("Paimon CDC Source: streaming read change-log with audit_log system table") { + withTable("T") { + withTempDir { + checkpointDir => 
+ spark.sql( + s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a','bucket'='2', 'changelog-producer' = 'lookup') + |""".stripMargin) + + val readStream = spark.readStream + .format("paimon") + .table("`T$audit_log`") + .writeStream + .format("memory") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .queryName("mem_table") + .outputMode("append") + .start() + + val currentResult = () => spark.sql("SELECT * FROM mem_table") + try { + spark.sql(s"INSERT INTO T VALUES (1, 'v_1')") + readStream.processAllAvailable() + checkAnswer(currentResult(), Row("+I", 1, "v_1") :: Nil) + + spark.sql(s"INSERT INTO T VALUES (2, 'v_2')") + readStream.processAllAvailable() + checkAnswer(currentResult(), Row("+I", 1, "v_1") :: Row("+I", 2, "v_2") :: Nil) + } finally { + readStream.stop() + } + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala new file mode 100644 index 000000000000..9935288db9a7 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSinkTest.scala @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark + +import org.apache.paimon.Snapshot.CommitKind._ + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.functions.{col, mean, window} +import org.apache.spark.sql.streaming.StreamTest + +import java.sql.Date + +class PaimonSinkTest extends PaimonSparkTestBase with StreamTest { + + override protected def sparkConf: SparkConf = { + super.sparkConf.set("spark.sql.catalog.paimon.cache-enabled", "false") + } + + import testImplicits._ + + test("Paimon Sink: forEachBatch") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], id: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: append mode") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and sink into it in append mode + 
spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: complete mode") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define an append-only table and sink into it in complete mode + spark.sql(s""" + |CREATE TABLE T (city String, population Long) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData.toDS + .toDF("uid", "city") + .groupBy("city") + .count() + .toDF("city", "population") + .writeStream + .outputMode("complete") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY city") + + try { + inputData.addData((1, "HZ"), (2, "BJ"), (3, "BJ")) + stream.processAllAvailable() + checkAnswer(query(), Row("BJ", 2L) :: Row("HZ", 1L) :: Nil) + + inputData.addData((4, "SH"), (5, "BJ"), (6, "HZ")) + stream.processAllAvailable() + checkAnswer(query(), Row("BJ", 3L) :: Row("HZ", 2L) :: Row("SH", 1L) :: Nil) + + inputData.addData((7, "HZ"), (8, "SH")) + stream.processAllAvailable() + checkAnswer(query(), Row("BJ", 3L) :: Row("HZ", 3L) :: 
Row("SH", 2L) :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: update mode") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and sink into it in update mode + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + intercept[RuntimeException] { + inputData + .toDF() + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .outputMode("update") + .format("paimon") + .start(location) + } + } + } + } + + test("Paimon Sink: aggregation and watermark") { + withTempDir { + checkpointDir => + // define an append-only table and sink into it with aggregation and watermark in append mode + spark.sql(s""" + |CREATE TABLE T (start Timestamp, stockId INT, avg_price DOUBLE) + |TBLPROPERTIES ('bucket'='3', 'bucket-key'='stockId') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Long, Int, Double)] + val data = inputData.toDS + .toDF("time", "stockId", "price") + .selectExpr("CAST(time AS timestamp) AS timestamp", "stockId", "price") + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds"), col("stockId")) + .agg(mean("price").as("avg_price")) + .select("window.start", "stockId", "avg_price") + + val stream = + data.writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => + spark.sql( + "SELECT CAST(start as BIGINT) AS start, stockId, avg_price FROM T ORDER BY start, stockId") + + try { + inputData.addData((101L, 1, 1.0d), (102, 1, 2.0d), (104, 2, 20.0d)) + stream.processAllAvailable() + inputData.addData((105L, 2, 40.0d), (107, 2, 60.0d), (115, 3, 300.0d)) + stream.processAllAvailable() + inputData.addData((200L, 99, 99.9d)) + 
stream.processAllAvailable() + checkAnswer( + query(), + Row(100L, 1, 1.5d) :: Row(100L, 2, 20.0d) :: Row(105L, 2, 50.0d) :: Row( + 115L, + 3, + 300.0d) :: Nil) + } finally { + if (stream != null) { + stream.stop() + } + } + } + } + + test("Paimon Sink: enable schema evolution") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and sink into it with schema evolution in append mode + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val date = Date.valueOf("2023-08-10") + spark.sql("INSERT INTO T VALUES (1, '2023-08-09'), (2, '2023-08-09')") + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "2023-08-09") :: Row(2, "2023-08-09") :: Nil) + + val inputData = MemoryStream[(Long, Date, Int)] + val stream = inputData + .toDS() + .toDF("a", "b", "c") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .option("write.merge-schema", "true") + .option("write.merge-schema.explicit-cast", "true") + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1L, date, 123), (3L, date, 456)) + stream.processAllAvailable() + + checkAnswer( + query(), + Row(1L, date, 123) :: Row(2L, Date.valueOf("2023-08-09"), null) :: Row( + 3L, + date, + 456) :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Sink: set full-compaction.delta-commits with batch write") { + for (useV2Write <- Seq("true", "false")) { + withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) { + withTable("t") { + sql(""" + |CREATE TABLE t ( + | a INT, + | b INT + |) TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='1', + | 'full-compaction.delta-commits'='1' + |) + |""".stripMargin) + + sql("INSERT INTO t VALUES (1, 1)") + sql("INSERT INTO t VALUES (2, 2)") + checkAnswer(sql("SELECT * FROM t 
ORDER BY a"), Seq(Row(1, 1), Row(2, 2))) + assert(loadTable("t").snapshotManager().latestSnapshot().commitKind == COMPACT) + } + } + } + } + + test("Paimon Sink: set full-compaction.delta-commits with streaming write") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b INT) + |TBLPROPERTIES ( + | 'primary-key'='a', + | 'bucket'='1', + | 'full-compaction.delta-commits'='2' + |) + |""".stripMargin) + val table = loadTable("T") + val location = table.location().toString + + val inputData = MemoryStream[(Int, Int)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("paimon") + .start(location) + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + inputData.addData((1, 1)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1))) + assert(table.snapshotManager().latestSnapshot().commitKind == APPEND) + + inputData.addData((2, 1)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1), Row(2, 1))) + assert(table.snapshotManager().latestSnapshot().commitKind == COMPACT) + + inputData.addData((2, 2)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1), Row(2, 2))) + assert(table.snapshotManager().latestSnapshot().commitKind == APPEND) + + inputData.addData((3, 1)) + stream.processAllAvailable() + checkAnswer(query(), Seq(Row(1, 1), Row(2, 2), Row(3, 1))) + assert(table.snapshotManager().latestSnapshot().commitKind == COMPACT) + } finally { + stream.stop() + } + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala new file mode 100644 index 000000000000..df1df747897d --- /dev/null +++ 
b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/AlterBranchProcedureTest.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class AlterBranchProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + test("Paimon Procedure: alter schema structure and test $branch syntax.") { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => 
spark.sql("SELECT * FROM T ORDER BY a") + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + val table = loadTable("T") + val branchManager = table.branchManager() + + // create branch with tag + checkAnswer( + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 's_2', snapshot => 2)"), + Row(true) :: Nil) + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.T', branch => 'snapshot_branch', tag => 's_2')"), + Row(true) :: Nil) + assert(branchManager.branchExists("snapshot_branch")) + + spark.sql("INSERT INTO T VALUES (1, 'APPLE'), (2,'DOG'), (2, 'horse')") + spark.sql("ALTER TABLE `T$branch_snapshot_branch` ADD COLUMNS(c INT)") + spark.sql( + "INSERT INTO `T$branch_snapshot_branch` VALUES " + "(1,'cherry', 100), (2,'bird', 200), (3, 'wolf', 400)") + + checkAnswer( + spark.sql("SELECT * FROM T ORDER BY a, b"), + Row(1, "APPLE") :: Row(2, "horse") :: Nil) + checkAnswer( + spark.sql("SELECT * FROM `T$branch_snapshot_branch` ORDER BY a, b,c"), + Row(1, "cherry", 100) :: Row(2, "bird", 200) :: Row(3, "wolf", 400) :: Nil) + assert(branchManager.branchExists("snapshot_branch")) + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala new file mode 100644 index 000000000000..111e604b1ef0 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/BranchProcedureTest.scala @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class BranchProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + test("Paimon Procedure: create, query, write and delete branch") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + 
inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // create tags + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_tag', snapshot => 2)"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag") :: Nil) + + // create branch with tag + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.T', branch => 'test_branch', tag => 'test_tag')"), + Row(true) :: Nil) + val table = loadTable("T") + val branchManager = table.branchManager() + assert(branchManager.branchExists("test_branch")) + + // query from branch + checkAnswer( + spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a"), + Row(1, "a") :: Row(2, "b") :: Nil + ) + checkAnswer( + spark.read.format("paimon").option("branch", "test_branch").table("T").orderBy("a"), + Row(1, "a") :: Row(2, "b") :: Nil + ) + + // update branch + spark.sql("INSERT INTO `T$branch_test_branch` VALUES (3, 'c')") + checkAnswer( + spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a"), + Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil + ) + // create tags + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(table => 'test.`T$branch_test_branch`', tag => 'test_tag2', snapshot => 3)"), + Row(true) :: Nil) + + // create branch from another branch. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.`T$branch_test_branch`', branch => 'test_branch2', tag => 'test_tag2')"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT * FROM `T$branch_test_branch2` ORDER BY a"), + Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil + ) + + // create empty branch + checkAnswer( + spark.sql( + "CALL paimon.sys.create_branch(table => 'test.T', branch => 'empty_branch')"), + Row(true) :: Nil) + assert(branchManager.branchExists("empty_branch")) + checkAnswer( + spark.sql("SELECT * FROM `T$branch_empty_branch` ORDER BY a"), + Nil + ) + + // delete branch + checkAnswer( + spark.sql( + "CALL paimon.sys.delete_branch(table => 'test.T', branch => 'test_branch')"), + Row(true) :: Nil) + assert(!branchManager.branchExists("test_branch")) + intercept[Exception] { + spark.sql("SELECT * FROM `T$branch_test_branch` ORDER BY a") + } + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Branch: read with scan.fallback-branch") { + withTable("T") { + sql(""" + |CREATE TABLE T ( + | dt STRING NOT NULL, + | name STRING NOT NULL, + | amount BIGINT + |) PARTITIONED BY (dt) + |""".stripMargin) + + sql("ALTER TABLE T SET TBLPROPERTIES ('k1' = 'v1')") + sql("ALTER TABLE T SET TBLPROPERTIES ('k2' = 'v2')") + + sql("CALL sys.create_branch('test.T', 'test')") + sql("ALTER TABLE T SET TBLPROPERTIES ('scan.fallback-branch' = 'test')") + + sql( + "INSERT INTO `T$branch_test` VALUES ('20240725', 'apple', 4), ('20240725', 'peach', 10), ('20240726', 'cherry', 3), ('20240726', 'pear', 6)") + sql("INSERT INTO T VALUES ('20240725', 'apple', 5), ('20240725', 'banana', 7)") + + checkAnswer( + sql("SELECT * FROM T ORDER BY amount"), + Seq( + Row("20240726", "cherry", 3), + Row("20240725", "apple", 5), + Row("20240726", "pear", 6), + Row("20240725", "banana", 7)) + ) + + sql("ALTER TABLE T UNSET TBLPROPERTIES ('scan.fallback-branch')") + checkAnswer( + sql("SELECT * FROM T ORDER BY amount"), + Seq(Row("20240725", "apple", 5), 
Row("20240725", "banana", 7))) + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala new file mode 100644 index 000000000000..19f6bc25280e --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CompactProcedureTestBase.scala @@ -0,0 +1,1324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.Snapshot.CommitKind +import org.apache.paimon.fs.Path +import org.apache.paimon.spark.PaimonSparkTestBase +import org.apache.paimon.spark.utils.SparkProcedureUtils +import org.apache.paimon.table.FileStoreTable +import org.apache.paimon.table.source.DataSplit + +import org.apache.spark.scheduler.{SparkListener, SparkListenerStageSubmitted} +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.assertj.core.api.Assertions +import org.scalatest.time.Span + +import java.util + +import scala.collection.JavaConverters._ +import scala.util.Random + +/** Test compact procedure. See [[CompactProcedure]]. */ +abstract class CompactProcedureTestBase extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + // ----------------------- Minor Compact ----------------------- + + test("Paimon Procedure: compact aware bucket pk table with minor compact strategy") { + withTable("T") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.APPEND)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(2) + + spark.sql( + "CALL sys.compact(table => 'T', compact_strategy => 'minor'," + + "options => 'num-sorted-run.compaction-trigger=3')") + + // Due to the limitation of parameter 'num-sorted-run.compaction-trigger' = 3, so compact is not + // performed. 
+ Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.APPEND)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(2) + + // Make par-p1 has 3 datafile and par-p2 has 2 datafile, so par-p2 will not be picked out to + // compact. + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1')") + + spark.sql( + "CALL sys.compact(table => 'T', compact_strategy => 'minor'," + + "options => 'num-sorted-run.compaction-trigger=3')") + + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + + val splits = table.newSnapshotReader.read.dataSplits + splits.forEach( + split => { + Assertions + .assertThat(split.dataFiles.size) + .isEqualTo(if (split.partition().getString(0).toString == "p2") 2 else 1) + }) + } + } + + // ----------------------- Sort Compact ----------------------- + + test("Paimon Procedure: sort compact") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b INT) + |TBLPROPERTIES ('bucket'='-1') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // test zorder sort + inputData.addData((0, 0)) + inputData.addData((0, 1)) + inputData.addData((0, 2)) + inputData.addData((1, 0)) + inputData.addData((1, 1)) + inputData.addData((1, 2)) + inputData.addData((2, 0)) + inputData.addData((2, 1)) + inputData.addData((2, 2)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b)) + } + } + 
Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', order_strategy => 'zorder', order_by => 'a,b')"), + Row(true) :: Nil) + + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0)) + result2.add(1, Row(0, 1)) + result2.add(2, Row(1, 0)) + result2.add(3, Row(1, 1)) + result2.add(4, Row(0, 2)) + result2.add(5, Row(1, 2)) + result2.add(6, Row(2, 0)) + result2.add(7, Row(2, 1)) + result2.add(8, Row(2, 2)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + // test hilbert sort + val result3 = new util.ArrayList[Row]() + result3.add(0, Row(0, 0)) + result3.add(1, Row(0, 1)) + result3.add(2, Row(1, 1)) + result3.add(3, Row(1, 0)) + result3.add(4, Row(2, 0)) + result3.add(5, Row(2, 1)) + result3.add(6, Row(2, 2)) + result3.add(7, Row(1, 2)) + result3.add(8, Row(0, 2)) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', order_strategy => 'hilbert', order_by => 'a,b')"), + Row(true) :: Nil) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // test order sort + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', order_strategy => 'order', order_by => 'a,b')"), + Row(true) :: Nil) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: sort compact with partition") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (p INT, a INT, b INT) + |TBLPROPERTIES ('bucket'='-1') + |PARTITIONED BY (p) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, Int)] + val stream = inputData + .toDS() + .toDF("p", "a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + 
batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query0 = () => spark.sql("SELECT * FROM T WHERE p=0") + val query1 = () => spark.sql("SELECT * FROM T WHERE p=1") + + try { + // test zorder sort + inputData.addData((0, 0, 0)) + inputData.addData((0, 0, 1)) + inputData.addData((0, 0, 2)) + inputData.addData((0, 1, 0)) + inputData.addData((0, 1, 1)) + inputData.addData((0, 1, 2)) + inputData.addData((0, 2, 0)) + inputData.addData((0, 2, 1)) + inputData.addData((0, 2, 2)) + + inputData.addData((1, 0, 0)) + inputData.addData((1, 0, 1)) + inputData.addData((1, 0, 2)) + inputData.addData((1, 1, 0)) + inputData.addData((1, 1, 1)) + inputData.addData((1, 1, 2)) + inputData.addData((1, 2, 0)) + inputData.addData((1, 2, 1)) + inputData.addData((1, 2, 2)) + stream.processAllAvailable() + + val result0 = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result0.add(Row(0, a, b)) + } + } + val result1 = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result1.add(Row(1, a, b)) + } + } + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result0) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'zorder', order_by => 'a,b')"), + Row(true) :: Nil) + + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0, 0)) + result2.add(1, Row(0, 0, 1)) + result2.add(2, Row(0, 1, 0)) + result2.add(3, Row(0, 1, 1)) + result2.add(4, Row(0, 0, 2)) + result2.add(5, Row(0, 1, 2)) + result2.add(6, Row(0, 2, 0)) + result2.add(7, Row(0, 2, 1)) + result2.add(8, Row(0, 2, 2)) + + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result2) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + + // test hilbert sort + val result3 = new util.ArrayList[Row]() + result3.add(0, Row(0, 0, 0)) + result3.add(1, 
Row(0, 0, 1)) + result3.add(2, Row(0, 1, 1)) + result3.add(3, Row(0, 1, 0)) + result3.add(4, Row(0, 2, 0)) + result3.add(5, Row(0, 2, 1)) + result3.add(6, Row(0, 2, 2)) + result3.add(7, Row(0, 1, 2)) + result3.add(8, Row(0, 0, 2)) + + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'hilbert', order_by => 'a,b')"), + Row(true) :: Nil) + + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result3) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + + // test order sort + checkAnswer( + spark.sql( + "CALL paimon.sys.compact(table => 'T', partitions => 'p=0', order_strategy => 'order', order_by => 'a,b')"), + Row(true) :: Nil) + Assertions.assertThat(query0().collect()).containsExactlyElementsOf(result0) + Assertions.assertThat(query1().collect()).containsExactlyElementsOf(result1) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: sort compact with multi-partitions") { + Seq("order", "zorder").foreach { + orderStrategy => + { + withTable("T") { + spark.sql(s""" + |CREATE TABLE T (id INT, pt STRING) + |PARTITIONED BY (pt) + |""".stripMargin) + + spark.sql(s"""INSERT INTO T VALUES + |(1, 'p1'), (3, 'p1'), + |(1, 'p2'), (4, 'p2'), + |(3, 'p3'), (2, 'p3'), + |(1, 'p4'), (2, 'p4') + |""".stripMargin) + + spark.sql(s"""INSERT INTO T VALUES + |(4, 'p1'), (2, 'p1'), + |(2, 'p2'), (3, 'p2'), + |(1, 'p3'), (4, 'p3'), + |(3, 'p4'), (4, 'p4') + |""".stripMargin) + + checkAnswer( + spark.sql( + s"CALL sys.compact(table => 'T', order_strategy => '$orderStrategy', order_by => 'id')"), + Seq(true).toDF()) + + val result = List(Row(1), Row(2), Row(3), Row(4)).asJava + Seq("p1", "p2", "p3", "p4").foreach { + pt => + Assertions + .assertThat(spark.sql(s"SELECT id FROM T WHERE pt='$pt'").collect()) + .containsExactlyElementsOf(result) + } + } + } + } + } + + test("Paimon Procedure: sort compact with partition filter") { + withTable("t") { + sql("CREATE TABLE t 
(a INT, pt INT) PARTITIONED BY (pt)") + sql("INSERT INTO t VALUES (1, 1)") + sql("INSERT INTO t VALUES (2, 1)") + sql( + "CALL sys.compact(table => 't', order_strategy => 'order', where => 'pt = 1', order_by => 'a')") + val table = loadTable("t") + assert(table.latestSnapshot().get().commitKind.equals(CommitKind.OVERWRITE)) + checkAnswer(sql("SELECT * FROM t ORDER BY a"), Seq(Row(1, 1), Row(2, 1))) + } + } + + test("Paimon Procedure: compact for pk") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b INT) + |TBLPROPERTIES ('primary-key'='a,b', 'bucket'='1') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + inputData.addData((0, 0)) + inputData.addData((0, 1)) + inputData.addData((0, 2)) + inputData.addData((1, 0)) + inputData.addData((1, 1)) + inputData.addData((1, 2)) + inputData.addData((2, 0)) + inputData.addData((2, 1)) + inputData.addData((2, 2)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b)) + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: compact aware bucket pk table") { + Seq(1, -1).foreach( + bucket => { + withTable("T") { + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='id, pt', 
'bucket'='$bucket', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + + spark.sql("CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(3) + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + // compact condition no longer met + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + checkAnswer( + spark.sql(s"SELECT * FROM T ORDER BY id"), + Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") :: Nil) + } + }) + } + + test("Paimon Procedure: compact aware bucket pk table with many small files") { + Seq(3, -1).foreach( + bucket => { + withTable("T") { + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='id, pt', 'bucket'='$bucket', 'write-only'='true', + |'source.split.target-size'='128m','source.split.open-file-cost'='32m') -- simulate multiple splits in a single bucket + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + val count = 100 + for (i <- 0 until count) { + spark.sql(s"INSERT INTO T VALUES ($i, 'a', 'p${i % 2}')") + } + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + checkAnswer(spark.sql(s"SELECT COUNT(*) FROM T"), Row(count) :: Nil) + } + }) + } + + test("Paimon Procedure: compact unaware bucket append table") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true', 
'compaction.min.file-num'='2') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + spark.sql(s"INSERT INTO T VALUES (5, 'e', 'p1'), (6, 'f', 'p2')") + + spark.sql("CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + // compact condition no longer met + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + checkAnswer( + spark.sql(s"SELECT * FROM T ORDER BY id"), + Row(1, "a", "p1") :: Row(2, "b", "p2") :: Row(3, "c", "p1") :: Row(4, "d", "p2") :: Row( + 5, + "e", + "p1") :: Row(6, "f", "p2") :: Nil) + } + + test("Paimon Procedure: compact unaware bucket append table with many small files") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + val count = 100 + for (i <- 0 until count) { + spark.sql(s"INSERT INTO T VALUES ($i, 'a', 'p${i % 2}')") + } + + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + checkAnswer(spark.sql(s"SELECT COUNT(*) FROM T"), Row(count) :: Nil) + } + + test("Paimon Procedure: compact with wrong usage") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + assert(intercept[IllegalArgumentException] { + spark.sql( + "CALL sys.compact(table 
=> 'T', partitions => 'pt = \"p1\"', where => 'pt = \"p1\"')") + }.getMessage.contains("partitions and where cannot be used together")) + + assert(intercept[IllegalArgumentException] { + spark.sql("CALL sys.compact(table => 'T', partitions => 'id = 1')") + }.getMessage.contains("Only partition predicate is supported")) + + assert(intercept[IllegalArgumentException] { + spark.sql("CALL sys.compact(table => 'T', where => 'id > 1 AND pt = \"p1\"')") + }.getMessage.contains("Only partition predicate is supported")) + + assert(intercept[IllegalArgumentException] { + spark.sql("CALL sys.compact(table => 'T', order_strategy => 'sort', order_by => 'pt')") + }.getMessage.contains("order_by should not contain partition cols")) + + assert(intercept[IllegalArgumentException] { + spark.sql( + "CALL sys.compact(table => 'T', order_strategy => 'sort', order_by => 'id', partition_idle_time =>'5s')") + }.getMessage.contains("sort compact do not support 'partition_idle_time'")) + } + + test("Paimon Procedure: compact with where") { + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT) + |TBLPROPERTIES ('bucket'='1', 'bucket-key'='id', 'write-only'='true', 'compaction.min.file-num'='1') + |PARTITIONED BY (dt, hh) + |""".stripMargin) + + val table = loadTable("T") + val fileIO = table.fileIO() + + spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)") + spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)") + + spark.sql("CALL sys.compact(table => 'T', where => 'dt = \"2024-01-01\" and hh >= 1')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-01/hh=0/bucket-0")).length) + .isEqualTo(2) + 
Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-01/hh=1/bucket-0")).length) + .isEqualTo(3) + Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-02/hh=0/bucket-0")).length) + .isEqualTo(2) + Assertions + .assertThat( + fileIO.listStatus(new Path(table.location(), "dt=2024-01-02/hh=1/bucket-0")).length) + .isEqualTo(2) + } + + test("Paimon test: toWhere method in CompactProcedure") { + val conditions = "f0=0,f1=0,f2=0;f0=1,f1=1,f2=1;f0=1,f1=2,f2=2;f3=3" + + val where = SparkProcedureUtils.toWhere(conditions) + val whereExpected = + "(f0=0 AND f1=0 AND f2=0) OR (f0=1 AND f1=1 AND f2=1) OR (f0=1 AND f1=2 AND f2=2) OR (f3=3)" + + Assertions.assertThat(where).isEqualTo(whereExpected) + } + + test("Paimon Procedure: compact unaware bucket append table with option") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, pt STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |PARTITIONED BY (pt) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, 'a', 'p1'), (2, 'b', 'p2')") + spark.sql(s"INSERT INTO T VALUES (3, 'c', 'p1'), (4, 'd', 'p2')") + spark.sql(s"INSERT INTO T VALUES (5, 'e', 'p1'), (6, 'f', 'p2')") + + spark.sql( + "CALL sys.compact(table => 'T', partitions => 'pt=\"p1\"', options => 'compaction.min.file-num=2')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(4) + + spark.sql("CALL sys.compact(table => 'T', options => 'compaction.min.file-num=2')") + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + // compact condition no longer met + spark.sql(s"CALL sys.compact(table => 'T')") + Assertions.assertThat(lastSnapshotId(table)).isEqualTo(5) + + checkAnswer( + spark.sql(s"SELECT * FROM T ORDER BY id"), + Row(1, "a", "p1") :: Row(2, "b", "p2") :: 
Row(3, "c", "p1") :: Row(4, "d", "p2") :: + Row(5, "e", "p1") :: Row(6, "f", "p2") :: Nil) + } + + test("Paimon Procedure: compact with partition_idle_time for pk table") { + Seq(1, -1).foreach( + bucket => { + withTable("T") { + val dynamicBucketArgs = if (bucket == -1) " ,'dynamic-bucket.initial-buckets'='1'" else "" + spark.sql( + s""" + |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT) + |TBLPROPERTIES ('primary-key'='id, dt, hh', 'bucket'='$bucket', 'write-only'='true'$dynamicBucketArgs) + |PARTITIONED BY (dt, hh) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)") + spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)") + + Thread.sleep(10000); + spark.sql(s"INSERT INTO T VALUES (9, '9', '2024-01-01', 0), (10, '10', '2024-01-02', 0)") + + spark.sql("CALL sys.compact(table => 'T', partition_idle_time => '10s')") + val dataSplits = table.newSnapshotReader.read.dataSplits.asScala.toList + Assertions + .assertThat(dataSplits.size) + .isEqualTo(4) + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + for (dataSplit: DataSplit <- dataSplits) { + if (dataSplit.partition().getInt(1) == 0) { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(3) + } else { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(1) + } + } + } + }) + + } + + test("Paimon Procedure: compact with partition_idle_time for unaware bucket append table") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING, dt STRING, hh INT) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true', 'compaction.min.file-num'='2') + |PARTITIONED BY (dt, hh) + |""".stripMargin) + + val table = loadTable("T") + + spark.sql(s"INSERT INTO T 
VALUES (1, '1', '2024-01-01', 0), (2, '2', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (5, '5', '2024-01-02', 0), (6, '6', '2024-01-02', 1)") + spark.sql(s"INSERT INTO T VALUES (3, '3', '2024-01-01', 0), (4, '4', '2024-01-01', 1)") + spark.sql(s"INSERT INTO T VALUES (7, '7', '2024-01-02', 0), (8, '8', '2024-01-02', 1)") + + Thread.sleep(10000); + spark.sql(s"INSERT INTO T VALUES (9, '9', '2024-01-01', 0), (10, '10', '2024-01-02', 0)") + + spark.sql("CALL sys.compact(table => 'T', partition_idle_time => '10s')") + val dataSplits = table.newSnapshotReader.read.dataSplits.asScala.toList + Assertions + .assertThat(dataSplits.size) + .isEqualTo(4) + Assertions.assertThat(lastSnapshotCommand(table).equals(CommitKind.COMPACT)).isTrue + for (dataSplit: DataSplit <- dataSplits) { + if (dataSplit.partition().getInt(1) == 0) { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(3) + } else { + Assertions + .assertThat(dataSplit.dataFiles().size()) + .isEqualTo(1) + } + } + } + + test("Paimon Procedure: test aware-bucket compaction read parallelism") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING) + |TBLPROPERTIES ('primary-key'='id', 'bucket'='3', 'write-only'='true') + |""".stripMargin) + + val table = loadTable("T") + for (i <- 1 to 10) { + sql(s"INSERT INTO T VALUES ($i, '$i')") + } + assertResult(10)(table.snapshotManager().snapshotCount()) + + val buckets = table.newSnapshotReader().bucketEntries().asScala.map(_.bucket()).distinct.size + assertResult(3)(buckets) + + val taskBuffer = scala.collection.mutable.ListBuffer.empty[Int] + val listener = new SparkListener { + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + taskBuffer += stageSubmitted.stageInfo.numTasks + } + } + + try { + spark.sparkContext.addSparkListener(listener) + + // spark.default.parallelism cannot be change in spark session + // sparkParallelism is 2, bucket is 3, use 2 as the read parallelism + 
spark.conf.set("spark.sql.shuffle.partitions", 2) + spark.sql("CALL sys.compact(table => 'T')") + + // sparkParallelism is 5, bucket is 3, use 3 as the read parallelism + spark.conf.set("spark.sql.shuffle.partitions", 5) + spark.sql("CALL sys.compact(table => 'T')") + + assertResult(Seq(2, 3))(taskBuffer) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + + test("Paimon Procedure: test unaware-bucket compaction read parallelism") { + spark.sql(s""" + |CREATE TABLE T (id INT, value STRING) + |TBLPROPERTIES ('bucket'='-1', 'write-only'='true') + |""".stripMargin) + + val table = loadTable("T") + for (i <- 1 to 12) { + sql(s"INSERT INTO T VALUES ($i, '$i')") + } + assertResult(12)(table.snapshotManager().snapshotCount()) + + val buckets = table.newSnapshotReader().bucketEntries().asScala.map(_.bucket()).distinct.size + // only has bucket-0 + assertResult(1)(buckets) + + val taskBuffer = scala.collection.mutable.ListBuffer.empty[Int] + val listener = new SparkListener { + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + taskBuffer += stageSubmitted.stageInfo.numTasks + } + } + + try { + spark.sparkContext.addSparkListener(listener) + + // spark.default.parallelism cannot be change in spark session + // sparkParallelism is 2, task groups is 6, use 2 as the read parallelism + spark.conf.set("spark.sql.shuffle.partitions", 2) + spark.sql( + "CALL sys.compact(table => 'T', options => 'source.split.open-file-cost=3200M, compaction.min.file-num=2')") + + // sparkParallelism is 5, task groups is 1, use 1 as the read parallelism + spark.conf.set("spark.sql.shuffle.partitions", 5) + spark.sql( + "CALL sys.compact(table => 'T', options => 'source.split.open-file-cost=3200M, compaction.min.file-num=2')") + + assertResult(Seq(2, 3))(taskBuffer) + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + + test("Paimon Procedure: type cast in where") { + withTable("t") { + sql(""" + |CREATE TABLE t (id 
INT, value STRING, day_part LONG) + |TBLPROPERTIES ('compaction.min.file-num'='2') + |PARTITIONED BY (day_part) + |""".stripMargin) + sql("INSERT INTO t VALUES (1, 'a', 20250810)") + sql("INSERT INTO t VALUES (2, 'b', 20250810)") + sql("INSERT INTO t VALUES (3, 'c', 20250811)") + + sql("CALL sys.compact(table => 't', where => 'day_part < 20250811 and day_part > 20250809')") + val table = loadTable("t") + assert(table.snapshotManager().latestSnapshot().commitKind().equals(CommitKind.COMPACT)) + } + } + + test("Paimon Procedure: cluster for unpartitioned table") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b INT, c STRING) + |TBLPROPERTIES ('bucket'='-1','num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b", "c") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + val random = new Random() + val randomStr = random.nextString(40) + // first write + inputData.addData((0, 0, randomStr)) + inputData.addData((0, 1, randomStr)) + inputData.addData((0, 2, randomStr)) + inputData.addData((1, 0, randomStr)) + inputData.addData((1, 1, randomStr)) + inputData.addData((1, 2, randomStr)) + inputData.addData((2, 0, randomStr)) + inputData.addData((2, 1, randomStr)) + inputData.addData((2, 2, randomStr)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b, randomStr)) + } + } + 
Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + // first cluster, the outputLevel should be 5 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + + // first cluster result + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0, randomStr)) + result2.add(1, Row(0, 1, randomStr)) + result2.add(2, Row(1, 0, randomStr)) + result2.add(3, Row(1, 1, randomStr)) + result2.add(4, Row(0, 2, randomStr)) + result2.add(5, Row(1, 2, randomStr)) + result2.add(6, Row(2, 0, randomStr)) + result2.add(7, Row(2, 1, randomStr)) + result2.add(8, Row(2, 2, randomStr)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + var clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + + // second write + inputData.addData((0, 3, null), (1, 3, null), (2, 3, null)) + inputData.addData((3, 0, null), (3, 1, null), (3, 2, null), (3, 3, null)) + stream.processAllAvailable() + + val result3 = new util.ArrayList[Row]() + result3.addAll(result2) + for (a <- 0 until 3) { + result3.add(Row(a, 3, null)) + } + for (b <- 0 until 4) { + result3.add(Row(3, b, null)) + } + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // second cluster, the outputLevel should be 4 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + // second cluster result, level-5 and level-4 are individually ordered + val result4 = new util.ArrayList[Row]() + result4.addAll(result2) + result4.add(Row(0, 3, null)) + result4.add(Row(1, 3, null)) + result4.add(Row(3, 0, null)) + result4.add(Row(3, 1, null)) + result4.add(Row(2, 3, null)) + result4.add(Row(3, 2, null)) + 
result4.add(Row(3, 3, null)) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(2) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(1).level()).isEqualTo(4) + + // full cluster + checkAnswer( + spark.sql("CALL paimon.sys.compact(table => 'T', compact_strategy => 'full')"), + Row(true) :: Nil) + val result5 = new util.ArrayList[Row]() + result5.add(Row(0, 0, randomStr)) + result5.add(Row(0, 1, randomStr)) + result5.add(Row(1, 0, randomStr)) + result5.add(Row(1, 1, randomStr)) + result5.add(Row(0, 2, randomStr)) + result5.add(Row(0, 3, null)) + result5.add(Row(1, 2, randomStr)) + result5.add(Row(1, 3, null)) + result5.add(Row(2, 0, randomStr)) + result5.add(Row(2, 1, randomStr)) + result5.add(Row(3, 0, null)) + result5.add(Row(3, 1, null)) + result5.add(Row(2, 2, randomStr)) + result5.add(Row(2, 3, null)) + result5.add(Row(3, 2, null)) + result5.add(Row(3, 3, null)) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result5) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: cluster for partitioned table") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b INT, c STRING, pt INT) + |PARTITIONED BY (pt) + |TBLPROPERTIES ('bucket'='-1', 
'num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, String, Int)] + val stream = inputData + .toDS() + .toDF("a", "b", "c", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY pt") + + try { + val random = new Random() + val randomStr = random.nextString(50) + // first write + for (pt <- 0 until 2) { + val c = if (pt == 0) randomStr else null + inputData.addData((0, 0, c, pt)) + inputData.addData((0, 1, c, pt)) + inputData.addData((0, 2, c, pt)) + inputData.addData((1, 0, c, pt)) + inputData.addData((1, 1, c, pt)) + inputData.addData((1, 2, c, pt)) + inputData.addData((2, 0, c, pt)) + inputData.addData((2, 1, c, pt)) + inputData.addData((2, 2, c, pt)) + } + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (pt <- 0 until 2) { + for (a <- 0 until 3) { + for (b <- 0 until 3) { + val c = if (pt == 0) randomStr else null + result.add(Row(a, b, c, pt)) + } + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + // first cluster, the outputLevel should be 5 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + + // first cluster result + val result2 = new util.ArrayList[Row]() + for (pt <- 0 until 2) { + val c = if (pt == 0) randomStr else null + result2.add(Row(0, 0, c, pt)) + result2.add(Row(0, 1, c, pt)) + result2.add(Row(1, 0, c, pt)) + result2.add(Row(1, 1, c, pt)) + result2.add(Row(0, 2, c, pt)) + result2.add(Row(1, 2, c, pt)) + result2.add(Row(2, 0, c, pt)) + result2.add(Row(2, 1, c, pt)) + result2.add(Row(2, 2, c, pt)) + } + + 
Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + var clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(2) + dataSplits.forEach( + dataSplit => { + Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5) + }) + + // second write + for (pt <- 0 until 2) { + inputData.addData((0, 3, null, pt), (1, 3, null, pt), (2, 3, null, pt)) + inputData.addData( + (3, 0, null, pt), + (3, 1, null, pt), + (3, 2, null, pt), + (3, 3, null, pt)) + } + stream.processAllAvailable() + + val result3 = new util.ArrayList[Row]() + for (pt <- 0 until 2) { + val c = if (pt == 0) randomStr else null + result3.add(Row(0, 0, c, pt)) + result3.add(Row(0, 1, c, pt)) + result3.add(Row(1, 0, c, pt)) + result3.add(Row(1, 1, c, pt)) + result3.add(Row(0, 2, c, pt)) + result3.add(Row(1, 2, c, pt)) + result3.add(Row(2, 0, c, pt)) + result3.add(Row(2, 1, c, pt)) + result3.add(Row(2, 2, c, pt)) + for (a <- 0 until 3) { + result3.add(Row(a, 3, null, pt)) + } + for (b <- 0 until 4) { + result3.add(Row(3, b, null, pt)) + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // second cluster + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + val result4 = new util.ArrayList[Row]() + // for partition-0: only file in level-0 will be picked for clustering, outputLevel is 4 + result4.add(Row(0, 0, randomStr, 0)) + result4.add(Row(0, 1, randomStr, 0)) + result4.add(Row(1, 0, randomStr, 0)) + result4.add(Row(1, 1, randomStr, 0)) + result4.add(Row(0, 2, randomStr, 0)) + result4.add(Row(1, 2, randomStr, 0)) + result4.add(Row(2, 0, randomStr, 0)) + result4.add(Row(2, 1, randomStr, 0)) + result4.add(Row(2, 2, randomStr, 0)) + result4.add(Row(0, 3, null, 0)) + result4.add(Row(1, 3, null, 0)) + 
result4.add(Row(3, 0, null, 0)) + result4.add(Row(3, 1, null, 0)) + result4.add(Row(2, 3, null, 0)) + result4.add(Row(3, 2, null, 0)) + result4.add(Row(3, 3, null, 0)) + // for partition-1:all files will be picked for clustering, outputLevel is 5 + result4.add(Row(0, 0, null, 1)) + result4.add(Row(0, 1, null, 1)) + result4.add(Row(1, 0, null, 1)) + result4.add(Row(1, 1, null, 1)) + result4.add(Row(0, 2, null, 1)) + result4.add(Row(0, 3, null, 1)) + result4.add(Row(1, 2, null, 1)) + result4.add(Row(1, 3, null, 1)) + result4.add(Row(2, 0, null, 1)) + result4.add(Row(2, 1, null, 1)) + result4.add(Row(3, 0, null, 1)) + result4.add(Row(3, 1, null, 1)) + result4.add(Row(2, 2, null, 1)) + result4.add(Row(2, 3, null, 1)) + result4.add(Row(3, 2, null, 1)) + result4.add(Row(3, 3, null, 1)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(2) + dataSplits.forEach( + dataSplit => { + if (dataSplit.partition().getInt(0) == 1) { + // partition-1 + Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5) + } else { + // partition-0 + Assertions.assertThat(dataSplit.dataFiles().size()).isEqualTo(2) + Assertions.assertThat(dataSplit.dataFiles().get(0).level()).isEqualTo(5) + Assertions.assertThat(dataSplit.dataFiles().get(1).level()).isEqualTo(4) + } + }) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: cluster for partitioned table with partition filter") { + sql( + """ + |CREATE TABLE T (a INT, b INT, pt INT) + |PARTITIONED BY (pt) + |TBLPROPERTIES ( + | 'bucket'='-1', 'num-levels'='6', 'num-sorted-run.compaction-trigger'='2', + | 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true' + |) + |""".stripMargin) + + 
sql("INSERT INTO T VALUES (0, 0, 0), (0, 0, 1)") + sql("INSERT INTO T VALUES (0, 1, 0), (0, 1, 1)") + sql("INSERT INTO T VALUES (0, 2, 0), (0, 2, 1)") + sql("INSERT INTO T VALUES (1, 0, 0), (1, 0, 1)") + sql("INSERT INTO T VALUES (1, 1, 0), (1, 1, 1)") + sql("INSERT INTO T VALUES (1, 2, 0), (1, 2, 1)") + sql("INSERT INTO T VALUES (2, 0, 0), (2, 0, 1)") + sql("INSERT INTO T VALUES (2, 1, 0), (2, 1, 1)") + sql("INSERT INTO T VALUES (2, 2, 0), (2, 2, 1)") + + sql("CALL sys.compact(table => 'T', where => 'pt = 0')") + checkAnswer( + sql("select distinct partition, level from `T$files` order by partition"), + Seq(Row("{0}", 5), Row("{1}", 0)) + ) + + sql("CALL sys.compact(table => 'T', where => 'pt = 1')") + checkAnswer( + sql("select distinct partition, level from `T$files` order by partition"), + Seq(Row("{0}", 5), Row("{1}", 5)) + ) + } + + test("Paimon Procedure: cluster with deletion vectors") { + failAfter(Span(5, org.scalatest.time.Minutes)) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (a INT, b INT, c STRING) + |TBLPROPERTIES ('bucket'='-1', 'deletion-vectors.enabled'='true','num-levels'='6', 'num-sorted-run.compaction-trigger'='2', 'clustering.columns'='a,b', 'clustering.strategy'='zorder', 'clustering.incremental' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b", "c") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + val random = new Random() + val randomStr = random.nextString(40) + // first write + inputData.addData((0, 0, randomStr)) + inputData.addData((0, 1, randomStr)) + inputData.addData((0, 2, randomStr)) + inputData.addData((1, 0, randomStr)) + inputData.addData((1, 1, 
randomStr)) + inputData.addData((1, 2, randomStr)) + inputData.addData((2, 0, randomStr)) + inputData.addData((2, 1, randomStr)) + inputData.addData((2, 2, randomStr)) + stream.processAllAvailable() + + val result = new util.ArrayList[Row]() + for (a <- 0 until 3) { + for (b <- 0 until 3) { + result.add(Row(a, b, randomStr)) + } + } + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result) + + // first cluster, the outputLevel should be 5 + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + + // first cluster result + val result2 = new util.ArrayList[Row]() + result2.add(0, Row(0, 0, randomStr)) + result2.add(1, Row(0, 1, randomStr)) + result2.add(2, Row(1, 0, randomStr)) + result2.add(3, Row(1, 1, randomStr)) + result2.add(4, Row(0, 2, randomStr)) + result2.add(5, Row(1, 2, randomStr)) + result2.add(6, Row(2, 0, randomStr)) + result2.add(7, Row(2, 1, randomStr)) + result2.add(8, Row(2, 2, randomStr)) + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result2) + + var clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + var dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + + // second write + inputData.addData((0, 3, null), (1, 3, null), (2, 3, null)) + inputData.addData((3, 0, null), (3, 1, null), (3, 2, null), (3, 3, null)) + stream.processAllAvailable() + + // delete (0,0), which is in level-5 file + spark.sql("DELETE FROM T WHERE a=0 and b=0;").collect() + // delete (0,3), which is in level-0 file + spark.sql("DELETE FROM T WHERE a=0 and b=3;").collect() + + val result3 = new util.ArrayList[Row]() + result3.addAll(result2.subList(1, result2.size())) + for (a <- 1 until 3) { + result3.add(Row(a, 3, null)) + } + for (b <- 0 until 4) { + 
result3.add(Row(3, b, null)) + } + + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result3) + + // second cluster, the outputLevel should be 4. dv index for level-0 will be updated + // and dv index for level-5 will be retained + checkAnswer(spark.sql("CALL paimon.sys.compact(table => 'T')"), Row(true) :: Nil) + // second cluster result, level-5 and level-4 are individually ordered + val result4 = new util.ArrayList[Row]() + result4.addAll(result2.subList(1, result2.size())) + result4.add(Row(1, 3, null)) + result4.add(Row(3, 0, null)) + result4.add(Row(3, 1, null)) + result4.add(Row(2, 3, null)) + result4.add(Row(3, 2, null)) + result4.add(Row(3, 3, null)) + Assertions.assertThat(query().collect()).containsExactlyElementsOf(result4) + + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(2) + Assertions.assertThat(dataSplits.get(0).dataFiles().get(0).level()).isEqualTo(5) + Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(0)).isNotNull + Assertions.assertThat(dataSplits.get(0).dataFiles().get(1).level()).isEqualTo(4) + Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(1)).isNull() + + // full cluster + checkAnswer( + spark.sql("CALL paimon.sys.compact(table => 'T', compact_strategy => 'full')"), + Row(true) :: Nil) + clusteredTable = loadTable("T") + checkSnapshot(clusteredTable) + dataSplits = clusteredTable.newSnapshotReader().read().dataSplits() + Assertions.assertThat(dataSplits.size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).dataFiles().size()).isEqualTo(1) + Assertions.assertThat(dataSplits.get(0).deletionFiles().get().get(0)).isNull() + + } finally { + stream.stop() + } + } + } + } + + def checkSnapshot(table: FileStoreTable): Unit = { + Assertions + 
.assertThat(table.latestSnapshot().get().commitKind().toString) + .isEqualTo(CommitKind.COMPACT.toString) + } + + def lastSnapshotCommand(table: FileStoreTable): CommitKind = { + table.snapshotManager().latestSnapshot().commitKind() + } + + def lastSnapshotId(table: FileStoreTable): Long = { + table.snapshotManager().latestSnapshotId() + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala new file mode 100644 index 000000000000..605f80e27ad3 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateAndDeleteTagProcedureTest.scala @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class CreateAndDeleteTagProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: create and delete tag") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', time_retained => '5 d', snapshot => 2)"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag") :: Nil) + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag')"), + Row(true) :: Nil) + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + checkAnswer( + 
spark.sql( + "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_latestSnapshot_tag')"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_latestSnapshot_tag") :: Nil) + checkAnswer( + spark.sql( + "CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_latestSnapshot_tag')"), + Row(true) :: Nil) + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + + // create test_tag_1 and test_tag_2 + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag_1', snapshot => 1)"), + Row(true) :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag_2', snapshot => 2)"), + Row(true) :: Nil) + + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag_1") :: Row("test_tag_2") :: Nil) + + // test rename_tag + checkAnswer( + spark.sql( + "CALL paimon.sys.rename_tag(table => 'test.T', tag => 'test_tag_1', target_tag => 'test_tag_3')"), + Row(true) :: Nil + ) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + Row("test_tag_2") :: Row("test_tag_3") :: Nil) + + // delete test_tag_2 and test_tag_3 + checkAnswer( + spark.sql( + "CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag_2,test_tag_3')"), + Row(true) :: Nil) + + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: create same tag with same snapshot") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + 
.option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', snapshot => 1)"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT count(*) FROM paimon.test.`T$tags` where tag_name = 'test_tag'"), + Row(1) :: Nil) + + // throw exception "Tag test_tag already exists" + assertThrows[IllegalArgumentException] { + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', time_retained => '5 d', snapshot => 1)") + } + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: delete tag not failed if tag not exists") { + spark.sql("CREATE TABLE T (id STRING, name STRING) USING PAIMON") + + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'test_tag')"), + Row(true) :: Nil) + } + + test("Paimon Procedure: delete multiple tags") { + spark.sql("CREATE TABLE T (id INT, name STRING) USING PAIMON") + spark.sql("insert into T values (1, 'a')") + + // create four tags + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-1')") + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-2')") + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-3')") + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => 'tag-4')") + checkAnswer(spark.sql("SELECT count(*) FROM paimon.test.`T$tags`"), Row(4) :: Nil) + + // multiple tags with no space + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'tag-1,tag-2')"), + Row(true) :: Nil) + checkAnswer( + spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), + 
Row("tag-3") :: Row("tag-4") :: Nil) + + // multiple tags with space + checkAnswer( + spark.sql("CALL paimon.sys.delete_tag(table => 'test.T', tag => 'tag-3, tag-4')"), + Row(true) :: Nil) + checkAnswer(spark.sql("SELECT tag_name FROM paimon.test.`T$tags`"), Nil) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala new file mode 100644 index 000000000000..b4f7d63086ae --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/CreateTagFromTimestampProcedureTest.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase +import org.apache.paimon.utils.SnapshotNotExistException + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class CreateTagFromTimestampProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: Create tags from snapshots commit-time ") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + try { + + for (i <- 1 to 4) { + inputData.addData((i, "a")) + stream.processAllAvailable() + Thread.sleep(500L) + } + + val table = loadTable("T") + val earliestCommitTime = table.snapshotManager.earliestSnapshot.timeMillis + val commitTime3 = table.snapshotManager.snapshot(3).timeMillis + val commitTime4 = table.snapshotManager.snapshot(4).timeMillis + + // create tag from timestamp that earlier than the earliest snapshot commit time. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + | tag => 'test_tag', + | timestamp => ${earliestCommitTime - 1})""".stripMargin), + Row("test_tag", 1, earliestCommitTime, "null") :: Nil + ) + + // create tag from timestamp that equals to snapshot-3 commit time. 
+ checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + | tag => 'test_tag2', + | timestamp => $commitTime3)""".stripMargin), + Row("test_tag2", 3, commitTime3, "null") :: Nil + ) + + // create tag from timestamp that later than snapshot-3 commit time. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + |tag => 'test_tag3', + |timestamp => ${commitTime3 + 1})""".stripMargin), + Row("test_tag3", 4, commitTime4, "null") :: Nil + ) + + // create tag from timestamp that later than the latest snapshot commit time and throw SnapshotNotExistException. + assertThrows[SnapshotNotExistException] { + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + |tag => 'test_tag3', + |timestamp => ${Long.MaxValue})""".stripMargin) + } + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: Create tags from tags commit-time") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + try { + for (i <- 1 to 2) { + inputData.addData((i, "a")) + stream.processAllAvailable() + Thread.sleep(500L) + } + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(" + + "table => 'test.T', tag => 'test_tag', snapshot => 1)"), + Row(true) :: Nil) + + val table = loadTable("T") + val latestCommitTime = table.snapshotManager.latestSnapshot().timeMillis + val tagsCommitTime = table.tagManager().getOrThrow("test_tag").timeMillis + assert(latestCommitTime > tagsCommitTime) + + 
// make snapshot 1 expire. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 1, retain_min => 1)"), + Row(1) :: Nil) + + // create tag from timestamp that earlier than the expired snapshot 1. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + | tag => 'test_tag1', + | timestamp => ${tagsCommitTime - 1})""".stripMargin), + Row("test_tag1", 1, tagsCommitTime, "null") :: Nil + ) + + // create tag from timestamp that later than the expired snapshot 1. + checkAnswer( + spark.sql(s"""CALL paimon.sys.create_tag_from_timestamp( + |table => 'test.T', + |tag => 'test_tag2', + |timestamp => ${tagsCommitTime + 1})""".stripMargin), + Row("test_tag2", 2, latestCommitTime, "null") :: Nil + ) + + } finally { + stream.stop() + } + } + } + } + +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala new file mode 100644 index 000000000000..c7cdc0f517a7 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpirePartitionsProcedureTest.scala @@ -0,0 +1,760 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.assertj.core.api.Assertions.assertThatThrownBy + +/** IT Case for [[ExpirePartitionsProcedure]]. */ +class ExpirePartitionsProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: expire partitions") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // This partition never expires. + inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer(query(), Row("a", "2024-06-01") :: Row("Never-expire", "9999-09-09") :: Nil) + // call expire_partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("pt=2024-06-01") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon procedure : expire partitions show a list of expired partitions.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (pt,hm) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // Show results : There are no expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("No expired partitions.") :: Nil + ) + + // snapshot-1 + inputData.addData(("a", "2024-06-01", "01:00")) + stream.processAllAvailable() + // snapshot-2 + inputData.addData(("b", "2024-06-02", "02:00")) + stream.processAllAvailable() + // snapshot-3, never expires. + inputData.addData(("Never-expire", "9999-09-09", "99:99")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01", "01:00") :: Row("b", "2024-06-02", "02:00") :: Row( + "Never-expire", + "9999-09-09", + "99:99") :: Nil) + + // Show a list of expired partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'" + + ", expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=02:00") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09", "99:99") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with values-time strategy.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("HXH", "2024-06-01")) + stream.processAllAvailable() + + // Never expire. 
+ inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("HXH", "2024-06-01") :: Row("Never-expire", "9999-09-09") :: Nil) + // expire + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd'" + + ",expire_strategy => 'values-time')"), + Row("pt=2024-06-01") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with update-time strategy.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // This partition will expire. + inputData.addData(("HXH", "9999-09-09")) + stream.processAllAvailable() + // Waiting for partition 'pt=9999-09-09' to expire. + Thread.sleep(2500L) + // snapshot-2 + inputData.addData(("HXH", "2024-06-01")) + stream.processAllAvailable() + + // Partitions that are updated within 2 second would be retained. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(" + + "table => 'test.T'," + + " expiration_time => '2 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=9999-09-09") :: Nil + ) + + checkAnswer(query(), Row("HXH", "2024-06-01") :: Nil) + + // Waiting for all partitions to expire. + Thread.sleep(1500) + // All partition will expire. 
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(" +
+ "table => 'test.T'," +
+ " expiration_time => '1 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("pt=2024-06-01") :: Nil
+ )
+
+ checkAnswer(query(), Nil)
+
+ } finally {
+ stream.stop()
+ }
+ }
+ }
+ }
+
+ test("Paimon Procedure: expire partitions with update-time strategy in same partition.") {
+ failAfter(streamingTimeout) {
+ withTempDir {
+ checkpointDir =>
+ spark.sql(s"""
+ |CREATE TABLE T (k STRING, pt STRING, hm STRING)
+ |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1')
+ | PARTITIONED BY (pt,hm)
+ |""".stripMargin)
+ val location = loadTable("T").location().toString
+
+ val inputData = MemoryStream[(String, String, String)]
+ val stream = inputData
+ .toDS()
+ .toDF("k", "pt", "hm")
+ .writeStream
+ .option("checkpointLocation", checkpointDir.getCanonicalPath)
+ .foreachBatch {
+ (batch: Dataset[Row], _: Long) =>
+ batch.write.format("paimon").mode("append").save(location)
+ }
+ .start()
+
+ val query = () => spark.sql("SELECT * FROM T")
+
+ try {
+ // This partition will not expire.
+ inputData.addData(("HXH", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+ // Waiting for partition 'pt=2024-06-01, hm=01:00' to become old enough to expire.
+ Thread.sleep(2500L)
+ // Updating the same partition data will update partition last update time, then this partition will not expire.
+ inputData.addData(("HXH", "2024-06-01", "01:00"))
+ stream.processAllAvailable()
+
+ // The last update time of the 'pt=2024-06-01, hm=01:00' partition is updated so the partition would not expire.
+ checkAnswer(
+ spark.sql(
+ "CALL paimon.sys.expire_partitions(table => 'test.T'," +
+ " expiration_time => '2 s'" +
+ ",expire_strategy => 'update-time')"),
+ Row("No expired partitions.") :: Nil
+ )
+
+ checkAnswer(query(), Row("HXH", "2024-06-01", "01:00") :: Nil)
+ // Waiting for all partitions to expire.
+ Thread.sleep(1500)
+
+ // The partition 'pt=2024-06-01, hm=01:00' will expire.
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '1 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=2024-06-01, hm=01:00") :: Nil + ) + + checkAnswer(query(), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with non-date format partition.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1') + | PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // This partition will expire. + inputData.addData(("HXH", "pt-1")) + stream.processAllAvailable() + Thread.sleep(2500L) + // snapshot-2 + inputData.addData(("HXH", "pt-2")) + stream.processAllAvailable() + + // Only update-time strategy support non date format partition to expire. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '2 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=pt-1") :: Nil + ) + + checkAnswer(query(), Row("HXH", "pt-2") :: Nil) + + // Waiting for all partitions to expire. + Thread.sleep(1500) + // call expire_partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'," + + " expiration_time => '1 s'" + + ",expire_strategy => 'update-time')"), + Row("pt=pt-2") :: Nil + ) + + checkAnswer(query(), Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon procedure : expire partitions with specified time-pattern partitions.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (hm, pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // Show results : There are no expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd', timestamp_pattern => '$pt')"), + Row("No expired partitions.") :: Nil + ) + + // snapshot-1 + inputData.addData(("a", "2024-06-01", "01:00")) + stream.processAllAvailable() + // snapshot-2 + inputData.addData(("b", "2024-06-02", "02:00")) + stream.processAllAvailable() + // snapshot-3, never expires. + inputData.addData(("Never-expire", "9999-09-09", "99:99")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01", "01:00") :: Row("b", "2024-06-02", "02:00") :: Row( + "Never-expire", + "9999-09-09", + "99:99") :: Nil) + + // Show a list of expired partitions. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'" + + ", expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd HH:mm'" + + ", timestamp_pattern => '$pt $hm')"), + Row("hm=01:00, pt=2024-06-01") :: Row("hm=02:00, pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("Never-expire", "9999-09-09", "99:99") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon procedure : sorted the expired partitions with max_expires.") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING, hm STRING) + |TBLPROPERTIES ('primary-key'='k,pt,hm', 'bucket'='1') + | PARTITIONED BY (pt,hm) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt", "hm") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // Show results : There are no expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("No expired partitions.") :: Nil + ) + + inputData.addData(("a", "2024-06-02", "02:00")) + stream.processAllAvailable() + inputData.addData(("b", "2024-06-02", "01:00")) + stream.processAllAvailable() + inputData.addData(("d", "2024-06-03", "01:00")) + stream.processAllAvailable() + inputData.addData(("c", "2024-06-01", "01:00")) + stream.processAllAvailable() + // this snapshot never expires. 
+ inputData.addData(("Never-expire", "9999-09-09", "99:99")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-02", "02:00") :: Row("b", "2024-06-02", "01:00") :: Row( + "d", + "2024-06-03", + "01:00") :: Row("c", "2024-06-01", "01:00") :: Row( + "Never-expire", + "9999-09-09", + "99:99") :: Nil + ) + + // sorted result of limited expired partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T'" + + ", expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd', max_expires => 3)"), + Row("pt=2024-06-01, hm=01:00") :: Row("pt=2024-06-02, hm=01:00") :: Row( + "pt=2024-06-02, hm=02:00") :: Nil + ) + + checkAnswer( + query(), + Row("d", "2024-06-03", "01:00") :: Row("Never-expire", "9999-09-09", "99:99") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions with default num") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql( + s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ('primary-key'='k,pt', 'bucket'='1', 'partition.expiration-max-num'='2') + |PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // snapshot-2 + inputData.addData(("b", "2024-06-02")) + stream.processAllAvailable() + + // snapshot-3 + inputData.addData(("c", "2024-06-03")) + stream.processAllAvailable() + + // This partition never expires. 
+ inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row( + "Never-expire", + "9999-09-09") :: Nil) + // call expire_partitions. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d'" + + ", timestamp_formatter => 'yyyy-MM-dd')"), + Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions load table property first") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ( + | 'primary-key' = 'k,pt', + | 'bucket' = '1', + | 'write-only' = 'true', + | 'partition.timestamp-formatter' = 'yyyy-MM-dd', + | 'partition.expiration-max-num'='2') + |PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // snapshot-2 + inputData.addData(("b", "2024-06-02")) + stream.processAllAvailable() + + // snapshot-3 + inputData.addData(("c", "2024-06-03")) + stream.processAllAvailable() + + // This partition never expires. 
+ inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row( + "Never-expire", + "9999-09-09") :: Nil) + + // 'partition.timestamp-formatter' value using table property. + // 'partition.expiration-time' value using procedure parameter. + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', expiration_time => '1 d')"), + Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire partitions add options parameter") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (k STRING, pt STRING) + |TBLPROPERTIES ( + | 'primary-key' = 'k,pt', + | 'bucket' = '1') + |PARTITIONED BY (pt) + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(String, String)] + val stream = inputData + .toDS() + .toDF("k", "pt") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T") + + try { + // snapshot-1 + inputData.addData(("a", "2024-06-01")) + stream.processAllAvailable() + + // snapshot-2 + inputData.addData(("b", "2024-06-02")) + stream.processAllAvailable() + + // snapshot-3 + inputData.addData(("c", "2024-06-03")) + stream.processAllAvailable() + + // This partition never expires. + inputData.addData(("Never-expire", "9999-09-09")) + stream.processAllAvailable() + + checkAnswer( + query(), + Row("a", "2024-06-01") :: Row("b", "2024-06-02") :: Row("c", "2024-06-03") :: Row( + "Never-expire", + "9999-09-09") :: Nil) + + // set conf in options. 
+ checkAnswer( + spark.sql( + "CALL paimon.sys.expire_partitions(table => 'test.T', " + + "options => 'partition.expiration-time = 1d," + + " partition.expiration-max-num = 2," + + " partition.expiration-batch-size = 2," + + " partition.timestamp-formatter = yyyy-MM-dd')"), + Row("pt=2024-06-01") :: Row("pt=2024-06-02") :: Nil + ) + + checkAnswer(query(), Row("c", "2024-06-03") :: Row("Never-expire", "9999-09-09") :: Nil) + + } finally { + stream.stop() + } + } + } + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala new file mode 100644 index 000000000000..bbaf88568e2d --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/ExpireSnapshotsProcedureTest.scala @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase +import org.apache.paimon.utils.SnapshotManager + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.assertj.core.api.Assertions.{assertThat, assertThatIllegalArgumentException} + +import java.sql.Timestamp + +class ExpireSnapshotsProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: expire snapshots") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', + |'write-only' = 'true', 'snapshot.num-retained.min' = '1') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // expire + checkAnswer( + spark.sql("CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 2)"), + Row(1) :: Nil) + + checkAnswer( + spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"), + Row(2L) :: Row(3L) :: Nil) + } finally { + 
stream.stop() + } + } + } + + test("Paimon Procedure: expire snapshots retainMax retainMin value check") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // expire assert throw exception + assertThrows[IllegalArgumentException] { + spark.sql( + "CALL paimon.sys.expire_snapshots(table => 'test.T', retain_max => 2, retain_min => 3)") + } + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: test parameter older_than with string type") { + sql( + "CREATE TABLE T (a INT, b STRING) " + + "TBLPROPERTIES ( 'num-sorted-run.compaction-trigger' = '999'," + + "'write-only' = 'true', 'snapshot.num-retained.min' = '1')") + val table = loadTable("T") + val snapshotManager = table.snapshotManager + + // generate 5 snapshot + for (i <- 1 to 5) { + sql(s"INSERT INTO T VALUES ($i, '$i')") + } + checkSnapshots(snapshotManager, 1, 5) + + val timestamp = new Timestamp(snapshotManager.latestSnapshot().timeMillis) + spark.sql( + s"CALL 
paimon.sys.expire_snapshots(table => 'test.T', older_than => '${timestamp.toString}', max_deletes => 2)") + checkSnapshots(snapshotManager, 3, 5) + } + + test("Paimon Procedure: expire snapshots load table property first") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', + |'snapshot.num-retained.max' = '2', + |'snapshot.num-retained.min' = '1', + |'write-only' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // expire + checkAnswer( + spark.sql("CALL paimon.sys.expire_snapshots(table => 'test.T')"), + Row(1) :: Nil) + + checkAnswer( + spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"), + Row(2L) :: Row(3L) :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: expire snapshots add options parameter") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', 'write-only' = 'true') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, 
String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.expire_snapshots(table => 'test.T', options => 'snapshot.num-retained.max=2, snapshot.num-retained.min=1')"), + Row(1L) :: Nil + ) + + checkAnswer( + spark.sql("SELECT snapshot_id FROM paimon.test.`T$snapshots`"), + Row(2L) :: Row(3L) :: Nil) + } finally { + stream.stop() + } + } + } + } + + def checkSnapshots(sm: SnapshotManager, earliest: Int, latest: Int): Unit = { + assertThat(sm.snapshotCount).isEqualTo(latest - earliest + 1) + assertThat(sm.earliestSnapshotId).isEqualTo(earliest) + assertThat(sm.latestSnapshotId).isEqualTo(latest) + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala new file mode 100644 index 000000000000..078823c3ef37 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/procedure/RollbackProcedureTest.scala @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.procedure + +import org.apache.paimon.spark.PaimonSparkTestBase + +import org.apache.spark.sql.{Dataset, Row} +import org.apache.spark.sql.execution.streaming.runtime.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class RollbackProcedureTest extends PaimonSparkTestBase with StreamTest { + + import testImplicits._ + + test("Paimon Procedure: rollback to snapshot and tag") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val table = loadTable("T") + val location = table.location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + checkAnswer( + spark.sql( + "CALL paimon.sys.create_tag(table => 'test.T', tag => 'test_tag', snapshot => 1)"), + Row(true) :: Nil) + + // snapshot-2 
+ inputData.addData((2, "b")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T_exception', version => '2')") + } + // rollback to snapshot + checkAnswer( + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '2')"), + Row(table.latestSnapshot().get().id, 2) :: Nil) + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // rollback to tag + val taggedSnapshotId = table.tagManager().getOrThrow("test_tag").trimToSnapshot().id + checkAnswer( + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => 'test_tag')"), + Row(table.latestSnapshot().get().id, taggedSnapshotId) :: Nil) + checkAnswer(query(), Row(1, "a") :: Nil) + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: rollback to tag check test") { + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3', 'file.format'='orc') + |""".stripMargin) + + val table = loadTable("T") + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + // snapshot-1 + spark.sql("insert into T select 1, 'a'") + checkAnswer(query(), Row(1, "a") :: Nil) + + checkAnswer( + spark.sql("CALL paimon.sys.create_tag(table => 'test.T', tag => '20250122', snapshot => 1)"), + Row(true) :: Nil) + + // snapshot-2 + spark.sql("insert into T select 2, 'b'") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // snapshot-3 + spark.sql("insert into T select 3, 'c'") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil) + + // snapshot-4 + spark.sql("insert into T select 4, 'd'") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Row(4, "d") :: Nil) + + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 
'test.T_exception', version => '4')") + } + // rollback to snapshot + checkAnswer( + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '3')"), + Row(table.latestSnapshot().get().id, 3) :: Nil) + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil) + + // version/snapshot/tag can only set one of them + assertThrows[RuntimeException] { + spark.sql( + "CALL paimon.sys.rollback(table => 'test.T', version => '20250122', tag => '20250122')") + } + + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T', version => '20250122', snapshot => 1)") + } + + assertThrows[RuntimeException] { + spark.sql("CALL paimon.sys.rollback(table => 'test.T', tag => '20250122', snapshot => 1)") + } + + // rollback to snapshot + spark.sql("CALL paimon.sys.rollback(table => 'test.T', snapshot => 2)") + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + // rollback to tag + spark.sql("CALL paimon.sys.rollback(table => 'test.T', tag => '20250122')") + checkAnswer(query(), Row(1, "a") :: Nil) + } + + test("Paimon Procedure: rollback to timestamp") { + failAfter(streamingTimeout) { + withTempDir { + checkpointDir => + // define a change-log table and test `forEachBatch` api + spark.sql(s""" + |CREATE TABLE T (a INT, b STRING) + |TBLPROPERTIES ('primary-key'='a', 'bucket'='3') + |""".stripMargin) + val location = loadTable("T").location().toString + + val inputData = MemoryStream[(Int, String)] + val stream = inputData + .toDS() + .toDF("a", "b") + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .foreachBatch { + (batch: Dataset[Row], _: Long) => + batch.write.format("paimon").mode("append").save(location) + } + .start() + + val table = loadTable("T") + + val query = () => spark.sql("SELECT * FROM T ORDER BY a") + + try { + // snapshot-1 + inputData.addData((1, "a")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Nil) + + // snapshot-2 + inputData.addData((2, "b")) + 
stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + val timestamp = System.currentTimeMillis() + + // snapshot-3 + inputData.addData((2, "b2")) + stream.processAllAvailable() + checkAnswer(query(), Row(1, "a") :: Row(2, "b2") :: Nil) + + // rollback to timestamp + checkAnswer( + spark.sql( + s"CALL paimon.sys.rollback_to_timestamp(table => 'test.T', timestamp => $timestamp)"), + Row(table.latestSnapshot().get().id, 2) :: Nil) + checkAnswer(query(), Row(1, "a") :: Row(2, "b") :: Nil) + + } finally { + stream.stop() + } + } + } + } + + test("Paimon Procedure: rollback with cache") { + sql("CREATE TABLE T (id INT)") + sql("INSERT INTO T VALUES (1), (2), (3), (4)") + sql("DELETE FROM T WHERE id = 1") + sql("CALL sys.rollback(table => 'T', version => '1')") + sql("DELETE FROM T WHERE id = 1") + checkAnswer(sql("SELECT * FROM T ORDER BY id"), Seq(Row(2), Row(3), Row(4))) + } +} From 0f5ba6608feda025e5c44f423354245d1fdec127 Mon Sep 17 00:00:00 2001 From: Muhammad Junaid Muzammil <4795269+junmuz@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:46:08 -0700 Subject: [PATCH 3/3] Updating Scala lib version and test signature fixes --- paimon-spark/paimon-spark-4.1/pom.xml | 6 + .../sql/execution/PaimonStrategyHelper.scala | 41 + .../PaimonCreateTableAsSelectStrategy.scala | 90 +++ .../paimon/spark/PaimonSparkTestBase.scala | 197 +++++ .../spark/sql/DataFrameWriteTestBase.scala | 701 ++++++++++++++++++ pom.xml | 2 +- 6 files changed, 1036 insertions(+), 1 deletion(-) create mode 100644 paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala create mode 100644 paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala create mode 100644 
paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala diff --git a/paimon-spark/paimon-spark-4.1/pom.xml b/paimon-spark/paimon-spark-4.1/pom.xml index 91aa2c76eac4..74a30570cc5b 100644 --- a/paimon-spark/paimon-spark-4.1/pom.xml +++ b/paimon-spark/paimon-spark-4.1/pom.xml @@ -45,6 +45,12 @@ under the License. org.apache.paimon paimon-spark4-common_${scala.binary.version} ${project.version} + + + org.apache.spark + spark-sql-api_${scala.binary.version} + + diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala new file mode 100644 index 000000000000..9fb3a7b54a25 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/PaimonStrategyHelper.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.CatalogUtils +import org.apache.spark.sql.catalyst.plans.logical.TableSpec +import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH + +trait PaimonStrategyHelper { + + def spark: SparkSession + + protected def makeQualifiedDBObjectPath(location: String): String = { + CatalogUtils.makeQualifiedDBObjectPath( + spark.sharedState.conf.get(WAREHOUSE_PATH), + location, + spark.sharedState.hadoopConf) + } + + protected def qualifyLocInTableSpec(tableSpec: TableSpec): TableSpec = { + tableSpec.copy(location = tableSpec.location.map(makeQualifiedDBObjectPath(_))) + } + +} diff --git a/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala new file mode 100644 index 000000000000..61e25b7c16a9 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/main/scala/org/apache/spark/sql/execution/shim/PaimonCreateTableAsSelectStrategy.scala @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.shim + +import org.apache.paimon.CoreOptions +import org.apache.paimon.iceberg.IcebergOptions +import org.apache.paimon.spark.SparkCatalog +import org.apache.paimon.spark.catalog.FormatTableCatalog + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.ResolvedIdentifier +import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, LogicalPlan, TableSpec} +import org.apache.spark.sql.connector.catalog.StagingTableCatalog +import org.apache.spark.sql.execution.{PaimonStrategyHelper, SparkPlan, SparkStrategy} +import org.apache.spark.sql.execution.datasources.v2.CreateTableAsSelectExec + +import scala.collection.JavaConverters._ + +case class PaimonCreateTableAsSelectStrategy(spark: SparkSession) + extends SparkStrategy + with PaimonStrategyHelper { + + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case CreateTableAsSelect( + ResolvedIdentifier(catalog: SparkCatalog, ident), + parts, + query, + tableSpec: TableSpec, + options, + ifNotExists, + true) => + catalog match { + case _: StagingTableCatalog => + throw new RuntimeException("Paimon can't extend StagingTableCatalog for now.") + case _ => + val coreOptionKeys = CoreOptions.getOptions.asScala.map(_.key()).toSeq + + // Include Iceberg compatibility options in table properties (fix for DataFrame writer options) + val icebergOptionKeys = IcebergOptions.getOptions.asScala.map(_.key()).toSeq + + val allTableOptionKeys = coreOptionKeys ++ icebergOptionKeys + + val (tableOptions, writeOptions) = options.partition { + case (key, _) => allTableOptionKeys.contains(key) + } + val newTableSpec = tableSpec.copy(properties = tableSpec.properties ++ tableOptions) + + val isPartitionedFormatTable = { + catalog match { + case catalog: FormatTableCatalog => + catalog.isFormatTable(newTableSpec.provider.orNull) && parts.nonEmpty + case _ => 
false + } + } + + if (isPartitionedFormatTable) { + throw new UnsupportedOperationException( + "Using CTAS with partitioned format table is not supported yet.") + } + + CreateTableAsSelectExec( + catalog.asTableCatalog, + ident, + parts, + query, + qualifyLocInTableSpec(newTableSpec), + writeOptions, + ifNotExists) :: Nil + } + case _ => Nil + } +} diff --git a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala new file mode 100644 index 000000000000..3208609835f1 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/PaimonSparkTestBase.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.spark + +import org.apache.paimon.catalog.{Catalog, Identifier} +import org.apache.paimon.data.GenericRow +import org.apache.paimon.fs.FileIO +import org.apache.paimon.fs.local.LocalFileIO +import org.apache.paimon.spark.catalog.WithPaimonCatalog +import org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions +import org.apache.paimon.spark.sql.{SparkVersionSupport, WithTableOptions} +import org.apache.paimon.table.FileStoreTable + +import org.apache.spark.SparkConf +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.connector.catalog.{Identifier => SparkIdentifier} +import org.apache.spark.sql.connector.read.Scan +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.paimon.Utils +import org.apache.spark.sql.test.SharedSparkSession + +import java.io.File +import java.util.{TimeZone, UUID} + +import scala.util.Random + +class PaimonSparkTestBase + extends QueryTest + with SharedSparkSession + with WithTableOptions + with SparkVersionSupport { + + protected lazy val commitUser: String = UUID.randomUUID.toString + + protected lazy val fileIO: FileIO = LocalFileIO.create + + protected lazy val tempDBDir: File = Utils.createTempDir + + protected def paimonCatalog: Catalog = { + spark.sessionState.catalogManager.currentCatalog.asInstanceOf[WithPaimonCatalog].paimonCatalog() + } + + protected val dbName0: String = "test" + + protected val tableName0: String = "T" + + /** Add paimon ([[SparkCatalog]] in fileSystem) catalog */ + override protected def sparkConf: SparkConf = { + val serializer = if (Random.nextBoolean()) { + "org.apache.spark.serializer.KryoSerializer" + } else { + "org.apache.spark.serializer.JavaSerializer" + } + super.sparkConf + .set("spark.sql.warehouse.dir", tempDBDir.getCanonicalPath) + .set("spark.sql.catalog.paimon", classOf[SparkCatalog].getName) + 
.set("spark.sql.catalog.paimon.warehouse", tempDBDir.getCanonicalPath) + .set("spark.sql.extensions", classOf[PaimonSparkSessionExtensions].getName) + .set("spark.serializer", serializer) + } + + override protected def beforeAll(): Unit = { + super.beforeAll() + spark.sql(s"USE paimon") + spark.sql(s"CREATE DATABASE IF NOT EXISTS paimon.$dbName0") + spark.sql(s"USE paimon.$dbName0") + } + + override protected def afterAll(): Unit = { + try { + spark.sql(s"USE paimon") + spark.sql(s"DROP TABLE IF EXISTS $dbName0.$tableName0") + spark.sql("USE default") + spark.sql(s"DROP DATABASE paimon.$dbName0 CASCADE") + } finally { + super.afterAll() + } + } + + /** Default is paimon catalog */ + override protected def beforeEach(): Unit = { + super.beforeAll() + spark.sql(s"USE paimon") + spark.sql(s"USE paimon.$dbName0") + spark.sql(s"DROP TABLE IF EXISTS $tableName0") + } + + protected def withTempDirs(f: (File, File) => Unit): Unit = { + withTempDir(file1 => withTempDir(file2 => f(file1, file2))) + } + + protected def withTimeZone(timeZone: String)(f: => Unit): Unit = { + withSparkSQLConf("spark.sql.session.timeZone" -> timeZone) { + val originTimeZone = TimeZone.getDefault + try { + TimeZone.setDefault(TimeZone.getTimeZone(timeZone)) + f + } finally { + TimeZone.setDefault(originTimeZone) + } + } + } + + // Since SPARK-46227 has changed the definition of withSQLConf that resulted in + // incompatibility between the Spark3.x and Spark4.x, So Paimon declare a separate method + // to provide the same function. 
+ protected def withSparkSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + withSparkSQLConf0(pairs: _*)(f) + } + + private def withSparkSQLConf0(pairs: (String, String)*)(f: => Unit): Unit = { + val conf = SQLConf.get + val (keys, values) = pairs.unzip + val currentValues = keys.map { + key => + if (conf.contains(key)) { + Some(conf.getConfString(key)) + } else { + None + } + } + (keys, values).zipped.foreach { + (k, v) => + if (SQLConf.isStaticConfigKey(k)) { + throw new RuntimeException(s"Cannot modify the value of a static config: $k") + } + conf.setConfString(k, v) + } + try f + finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => conf.setConfString(key, value) + case (key, None) => conf.unsetConf(key) + } + } + } + + def loadTable(tableName: String): FileStoreTable = { + loadTable(dbName0, tableName) + } + + def loadTable(dbName: String, tableName: String): FileStoreTable = { + paimonCatalog.getTable(Identifier.create(dbName, tableName)).asInstanceOf[FileStoreTable] + } + + protected def createRelationV2(tableName: String): DataSourceV2Relation = { + val sparkTable = SparkTable(loadTable(tableName)) + DataSourceV2Relation.create( + sparkTable, + Some(spark.sessionState.catalogManager.currentCatalog), + Some(SparkIdentifier.of(Array(this.dbName0), tableName)) + ) + } + + def getScan(sqlText: String): Scan = { + sql(sqlText).queryExecution.optimizedPlan + .collectFirst { case relation: DataSourceV2ScanRelation => relation } + .get + .scan + } + + protected def getPaimonScan(sqlText: String): PaimonScan = { + getScan(sqlText).asInstanceOf[PaimonScan] + } + + protected def getFormatTableScan(sqlText: String): PaimonFormatTableScan = { + getScan(sqlText).asInstanceOf[PaimonFormatTableScan] + } + + object GenericRow { + def of(values: Any*): GenericRow = { + val row = new GenericRow(values.length) + values.zipWithIndex.foreach { + case (value, index) => + row.setField(index, value) + } + row + } + } +} diff --git 
a/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala new file mode 100644 index 000000000000..b25e41a3fb42 --- /dev/null +++ b/paimon-spark/paimon-spark-4.1/src/test/scala/org/apache/paimon/spark/sql/DataFrameWriteTestBase.scala @@ -0,0 +1,701 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */

package org.apache.paimon.spark.sql

import org.apache.paimon.spark.PaimonSparkTestBase

import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DecimalType
import org.junit.jupiter.api.Assertions

import java.sql.{Date, Timestamp}

/**
 * DataFrame-API write tests for Paimon tables: insertInto/saveAsTable on partitioned
 * tables, by-name column resolution, direct path writes, schema evolution
 * (write.merge-schema), and v2 overwrite-by-filter.
 *
 * `fileFormats`, `withPk`, `bucketModes`, `gteqSpark3_4`, `gteqSpark4_0`, `withTable`,
 * `withSparkSQLConf`, `loadTable` and `checkAnswer` come from [[PaimonSparkTestBase]].
 */
abstract class DataFrameWriteTestBase extends PaimonSparkTestBase {

  // Catalog caching is disabled so each read observes the latest committed snapshot.
  override protected def sparkConf: SparkConf = {
    super.sparkConf.set("spark.sql.catalog.paimon.cache-enabled", "false")
  }

  import testImplicits._

  // insertInto on a partitioned PK table: append, dynamic overwrite (only touched
  // partitions replaced) and static overwrite (whole table replaced).
  test("Paimon dataframe: insert into partitioned table") {
    for (useV2Write <- Seq("true", "false")) {
      withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) {
        withTable("t") {
          // create table
          Seq((1, "x1", "p1"), (2, "x2", "p2"))
            .toDF("a", "b", "pt")
            .write
            .format("paimon")
            .option("primary-key", "a,pt")
            .partitionBy("pt")
            .saveAsTable("t")

          // insert into
          Seq((3, "x3", "p3"))
            .toDF("a", "b", "pt")
            .write
            .format("paimon")
            .mode("append")
            .insertInto("t")
          checkAnswer(
            spark.read.format("paimon").table("t").orderBy("a"),
            Seq(Row(1, "x1", "p1"), Row(2, "x2", "p2"), Row(3, "x3", "p3"))
          )
          checkAnswer(
            sql("SHOW PARTITIONS t"),
            Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3"))
          )

          // dynamic insert overwrite: only partition pt=p1 is rewritten,
          // pt=p2 / pt=p3 keep their data.
          withSparkSQLConf("spark.sql.sources.partitionOverwriteMode" -> "dynamic") {
            Seq((4, "x4", "p1"))
              .toDF("a", "b", "pt")
              .write
              .format("paimon")
              .mode("overwrite")
              .insertInto("t")
          }
          checkAnswer(
            spark.read.format("paimon").table("t").orderBy("a"),
            Seq(Row(2, "x2", "p2"), Row(3, "x3", "p3"), Row(4, "x4", "p1"))
          )
          checkAnswer(
            sql("SHOW PARTITIONS t"),
            Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3"))
          )

          // insert overwrite (static mode): the whole table is replaced,
          // so only the newly written partition remains.
          Seq((5, "x5", "p1"))
            .toDF("a", "b", "pt")
            .write
            .format("paimon")
            .mode("overwrite")
            .insertInto("t")
          checkAnswer(
            spark.read.format("paimon").table("t").orderBy("a"),
            Seq(Row(5, "x5", "p1"))
          )
          checkAnswer(
            sql("SHOW PARTITIONS t"),
            Seq(Row("pt=p1"))
          )
        }
      }
    }
  }

  // saveAsTable on a partitioned PK table: append reuses existing table metadata;
  // overwrite replaces the table, so props/partitioning must be supplied again.
  test("Paimon dataframe: save as partitioned table") {
    for (useV2Write <- Seq("true", "false")) {
      withSparkSQLConf("spark.paimon.write.use-v2-write" -> useV2Write) {
        withTable("t") {
          // create table
          Seq((1, "x1", "p1"), (2, "x2", "p2"))
            .toDF("a", "b", "pt")
            .write
            .format("paimon")
            .mode("append")
            .option("primary-key", "a,pt")
            .partitionBy("pt")
            .saveAsTable("t")

          // saveAsTable with append mode
          Seq((3, "x3", "p3"))
            .toDF("a", "b", "pt")
            .write
            .format("paimon")
            .mode("append")
            .saveAsTable("t")
          checkAnswer(
            spark.read.format("paimon").table("t").orderBy("a"),
            Seq(Row(1, "x1", "p1"), Row(2, "x2", "p2"), Row(3, "x3", "p3"))
          )
          checkAnswer(
            sql("SHOW PARTITIONS t"),
            Seq(Row("pt=p1"), Row("pt=p2"), Row("pt=p3"))
          )

          // saveAsTable with overwrite mode will call replace table internal,
          // so here we set the props and partitions again.
          Seq((5, "x5", "p1"))
            .toDF("a", "b", "pt")
            .write
            .format("paimon")
            .option("primary-key", "a,pt")
            .partitionBy("pt")
            .mode("overwrite")
            .saveAsTable("t")
          checkAnswer(
            spark.read.format("paimon").table("t").orderBy("a"),
            Seq(Row(5, "x5", "p1"))
          )
          checkAnswer(
            sql("SHOW PARTITIONS t"),
            Seq(Row("pt=p1"))
          )
        }
      }
    }
  }

  // CTAS via saveAsTable: core options (bucket, target-file-size) are persisted on
  // the table; write-only options (write.merge-schema*) must NOT be persisted.
  test("Paimon: DataFrameWrite.saveAsTable") {
    withTable("test_ctas") {
      Seq((1L, "x1"), (2L, "x2"))
        .toDF("a", "b")
        .write
        .format("paimon")
        .mode("append")
        .option("primary-key", "a")
        .option("bucket", "-1")
        .option("target-file-size", "256MB")
        .option("write.merge-schema", "true")
        .option("write.merge-schema.explicit-cast", "true")
        .saveAsTable("test_ctas")

      val paimonTable = loadTable("test_ctas")
      Assertions.assertEquals(1, paimonTable.primaryKeys().size())
      Assertions.assertEquals("a", paimonTable.primaryKeys().get(0))

      // check all the core options
      Assertions.assertEquals("-1", paimonTable.options().get("bucket"))
      Assertions.assertEquals("256MB", paimonTable.options().get("target-file-size"))

      // non-core options should not be here.
      Assertions.assertFalse(paimonTable.options().containsKey("write.merge-schema"))
      Assertions.assertFalse(paimonTable.options().containsKey("write.merge-schema.explicit-cast"))
    }
  }

  // Writing by table location (path-based save) into a partitioned, bucketed table;
  // the DataFrame column names differ from the table's (positional write).
  test("Paimon: DataFrameWrite partition table") {
    withTable("t") {
      spark.sql(s"""
                   |CREATE TABLE t (a INT, b STRING, dt STRING) PARTITIONED BY(dt)
                   |TBLPROPERTIES ('file.format' = 'avro', 'bucket' = 2, 'bucket-key' = 'b')
                   |""".stripMargin)

      val table = loadTable("t")
      val location = table.location().toString

      Seq((1, "x1", "a"), (2, "x2", "b"))
        .toDF("a", "b", "c")
        .write
        .format("paimon")
        .mode("append")
        .save(location)
      checkAnswer(sql("SELECT * FROM t"), Row(1, "x1", "a") :: Row(2, "x2", "b") :: Nil)
    }
  }

  // By-name column resolution: t2 declares the same columns as t1 in a different
  // order; saveAsTable must match columns by name, not position.
  fileFormats.foreach {
    fileFormat =>
      test(s"Paimon: DataFrameWrite.saveAsTable in ByName mode, file.format: $fileFormat") {
        withTable("t1", "t2") {
          spark.sql(s"""
                       |CREATE TABLE t1 (col1 STRING, col2 INT, col3 DOUBLE)
                       |TBLPROPERTIES ('file.format' = '$fileFormat')
                       |""".stripMargin)

          spark.sql(s"""
                       |CREATE TABLE t2 (col2 INT, col3 DOUBLE, col1 STRING)
                       |TBLPROPERTIES ('file.format' = '$fileFormat')
                       |""".stripMargin)

          sql(s"""
                 |INSERT INTO TABLE t1 VALUES
                 |("Hello", 1, 1.1),
                 |("World", 2, 2.2),
                 |("Paimon", 3, 3.3);
                 |""".stripMargin)

          spark.table("t1").write.format("paimon").mode("append").saveAsTable("t2")
          checkAnswer(
            sql("SELECT * FROM t2 ORDER BY col2"),
            Row(1, 1.1d, "Hello") :: Row(2, 2.2d, "World") :: Row(3, 3.3d, "Paimon") :: Nil)
        }
      }
  }

  // By-name resolution for nested types: struct fields and array<struct<...>> fields
  // are declared in a different order in t2 and must also be matched by name.
  fileFormats.foreach {
    fileFormat =>
      test(
        s"Paimon: DataFrameWrite.saveAsTable with complex data type in ByName mode, file.format: $fileFormat") {
        withTable("t1", "t2") {
          // NOTE(review): the generic type parameters of STRUCT/ARRAY/MAP below were
          // lost to markup-stripping in the reviewed patch; they are reconstructed
          // from the INSERT values and expected rows — confirm against upstream.
          spark.sql(
            s"""
               |CREATE TABLE t1 (a STRING, b INT, c STRUCT<c1: DOUBLE, c2: BIGINT>, d ARRAY<STRUCT<d1: TIMESTAMP, d2: MAP<STRING, STRING>>>, e ARRAY<INT>)
               |TBLPROPERTIES ('file.format' = '$fileFormat')
               |""".stripMargin)

          // NOTE(review): reconstructed likewise — same fields as t1 with struct
          // members and top-level columns reordered; confirm against upstream.
          spark.sql(
            s"""
               |CREATE TABLE t2 (b INT, c STRUCT<c2: BIGINT, c1: DOUBLE>, d ARRAY<STRUCT<d2: MAP<STRING, STRING>, d1: TIMESTAMP>>, e ARRAY<INT>, a STRING)
               |TBLPROPERTIES ('file.format' = '$fileFormat')
               |""".stripMargin)

          sql(s"""
                 |INSERT INTO TABLE t1 VALUES
                 |("Hello", 1, struct(1.1, 1000), array(struct(timestamp'2024-01-01 00:00:00', map("k1", "v1")), struct(timestamp'2024-08-01 00:00:00', map("k1", "v11"))), array(123, 345)),
                 |("World", 2, struct(2.2, 2000), array(struct(timestamp'2024-02-01 00:00:00', map("k2", "v2"))), array(234, 456)),
                 |("Paimon", 3, struct(3.3, 3000), null, array(345, 567));
                 |""".stripMargin)

          spark.table("t1").write.format("paimon").mode("append").saveAsTable("t2")
          checkAnswer(
            sql("SELECT * FROM t2 ORDER BY b"),
            Row(
              1,
              Row(1000L, 1.1d),
              Array(
                Row(Map("k1" -> "v1"), Timestamp.valueOf("2024-01-01 00:00:00")),
                Row(Map("k1" -> "v11"), Timestamp.valueOf("2024-08-01 00:00:00"))),
              Array(123, 345),
              "Hello"
            )
              :: Row(
                2,
                Row(2000L, 2.2d),
                Array(Row(Map("k2" -> "v2"), Timestamp.valueOf("2024-02-01 00:00:00"))),
                Array(234, 456),
                "World")
              :: Row(3, Row(3000L, 3.3d), null, Array(345, 567), "Paimon") :: Nil
          )
        }
      }
  }

  // Path-based append/overwrite across PK/bucket configurations: PK tables dedupe
  // on key, append-only tables keep every row, overwrite replaces the table.
  withPk.foreach {
    hasPk =>
      bucketModes.foreach {
        bucket =>
          test(s"Write data into Paimon directly: has-pk: $hasPk, bucket: $bucket") {

            val prop = if (hasPk) {
              s"'primary-key'='a', 'bucket' = '$bucket' "
            } else if (bucket != -1) {
              s"'bucket-key'='a', 'bucket' = '$bucket' "
            } else {
              "'write-only'='true'"
            }

            spark.sql(s"""
                         |CREATE TABLE T (a INT, b STRING)
                         |TBLPROPERTIES ($prop)
                         |""".stripMargin)

            val paimonTable = loadTable("T")
            val location = paimonTable.location().toString

            val df1 = Seq((1, "a"), (2, "b")).toDF("a", "b")
            df1.write.format("paimon").mode("append").save(location)
            checkAnswer(
              spark.sql("SELECT * FROM T ORDER BY a, b"),
              Row(1, "a") :: Row(2, "b") :: Nil)

            val df2 = Seq((1, "a2"), (3, "c")).toDF("a", "b")
            df2.write.format("paimon").mode("append").save(location)
            // With a primary key, (1, "a2") upserts over (1, "a"); without,
            // both rows are kept.
            val expected = if (hasPk) {
              Row(1, "a2") :: Row(2, "b") :: Row(3, "c") :: Nil
            } else {
              Row(1, "a") :: Row(1, "a2") :: Row(2, "b") :: Row(3, "c") :: Nil
            }
            checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected)

            val df3 = Seq((4, "d"), (5, "e")).toDF("a", "b")
            df3.write.format("paimon").mode("overwrite").save(location)
            checkAnswer(
              spark.sql("SELECT * FROM T ORDER BY a, b"),
              Row(4, "d") :: Row(5, "e") :: Nil)
          }
      }
  }

  // Schema evolution with write.merge-schema: new columns are added with nulls for
  // old rows; compatible type widenings (Int->Long, Long->Decimal, Decimal precision
  // growth) are applied to the table schema.
  fileFormats.foreach {
    format =>
      withPk.foreach {
        hasPk =>
          bucketModes.foreach {
            bucket =>
              test(
                s"Schema evolution: write data into Paimon: $hasPk, bucket: $bucket, format: $format") {
                val _spark = spark
                import _spark.implicits._

                val prop = if (hasPk) {
                  s"'primary-key'='a', 'bucket' = '$bucket', 'file.format' = '$format'"
                } else if (bucket != -1) {
                  s"'bucket-key'='a', 'bucket' = '$bucket', 'file.format' = '$format'"
                } else {
                  s"'write-only'='true', 'file.format' = '$format'"
                }

                spark.sql(s"""
                             |CREATE TABLE T (a INT, b STRING)
                             |TBLPROPERTIES ($prop)
                             |""".stripMargin)

                val paimonTable = loadTable("T")
                val location = paimonTable.location().toString

                val df1 = Seq((1, "a"), (2, "b")).toDF("a", "b")
                df1.write.format("paimon").mode("append").save(location)
                checkAnswer(
                  spark.sql("SELECT * FROM T ORDER BY a, b"),
                  Row(1, "a") :: Row(2, "b") :: Nil)

                // Case 1: two additional fields
                val df2 = Seq((1, "a2", 123L, Map("k" -> 11.1)), (3, "c", 345L, Map("k" -> 33.3)))
                  .toDF("a", "b", "c", "d")
                df2.write
                  .format("paimon")
                  .mode("append")
                  .option("write.merge-schema", "true")
                  .save(location)
                val expected2 = if (hasPk) {
                  Row(1, "a2", 123L, Map("k" -> 11.1)) ::
                    Row(2, "b", null, null) :: Row(3, "c", 345L, Map("k" -> 33.3)) :: Nil
                } else {
                  Row(1, "a", null, null) :: Row(1, "a2", 123L, Map("k" -> 11.1)) :: Row(
                    2,
                    "b",
                    null,
                    null) :: Row(3, "c", 345L, Map("k" -> 33.3)) :: Nil
                }
                checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected2)

                // Case 2: two fields with the evolved types: Int -> Long, Long -> Decimal
                val df3 = Seq(
                  (2L, "b2", BigDecimal.decimal(234), Map("k" -> 22.2)),
                  (4L, "d", BigDecimal.decimal(456), Map("k" -> 44.4))).toDF("a", "b", "c", "d")
                df3.write
                  .format("paimon")
                  .mode("append")
                  .option("write.merge-schema", "true")
                  .save(location)
                val expected3 = if (hasPk) {
                  Row(1L, "a2", BigDecimal.decimal(123), Map("k" -> 11.1)) :: Row(
                    2L,
                    "b2",
                    BigDecimal.decimal(234),
                    Map("k" -> 22.2)) :: Row(
                    3L,
                    "c",
                    BigDecimal.decimal(345),
                    Map("k" -> 33.3)) :: Row(
                    4L,
                    "d",
                    BigDecimal.decimal(456),
                    Map("k" -> 44.4)) :: Nil
                } else {
                  Row(1L, "a", null, null) :: Row(
                    1L,
                    "a2",
                    BigDecimal.decimal(123),
                    Map("k" -> 11.1)) :: Row(2L, "b", null, null) :: Row(
                    2L,
                    "b2",
                    BigDecimal.decimal(234),
                    Map("k" -> 22.2)) :: Row(
                    3L,
                    "c",
                    BigDecimal.decimal(345),
                    Map("k" -> 33.3)) :: Row(
                    4L,
                    "d",
                    BigDecimal.decimal(456),
                    Map("k" -> 44.4)) :: Nil
                }
                checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected3)

                // Case 3: insert Decimal(20,18) to Decimal(38,18)
                val df4 = Seq((99L, "df4", BigDecimal.decimal(4.0), Map("4" -> 4.1)))
                  .toDF("a", "b", "c", "d")
                  .selectExpr("a", "b", "cast(c as decimal(20,18)) as c", "d")
                df4.write
                  .format("paimon")
                  .mode("append")
                  .option("write.merge-schema", "true")
                  .save(location)
                val expected4 =
                  expected3 ++ Seq(Row(99L, "df4", BigDecimal.decimal(4.0), Map("4" -> 4.1)))
                checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected4)
                // The decimal column must have been widened to the max precision.
                val decimalType =
                  spark.table("T").schema.apply(2).dataType.asInstanceOf[DecimalType]
                assert(decimalType.precision == 38)
                assert(decimalType.scale == 18)
              }
          }
      }
  }

  // Schema evolution with incompatible (explicit-cast) type changes: rejected unless
  // write.merge-schema.explicit-cast is enabled.
  withPk.foreach {
    hasPk =>
      bucketModes.foreach {
        bucket =>
          test(
            s"Schema evolution: write data into Paimon with allowExplicitCast = true: $hasPk, bucket: $bucket") {

            val prop = if (hasPk) {
              s"'primary-key'='a', 'bucket' = '$bucket' "
            } else if (bucket != -1) {
              s"'bucket-key'='a', 'bucket' = '$bucket' "
            } else {
              "'write-only'='true'"
            }

            spark.sql(s"""
                         |CREATE TABLE T (a INT, b STRING)
                         |TBLPROPERTIES ($prop)
                         |""".stripMargin)

            val paimonTable = loadTable("T")
            val location = paimonTable.location().toString

            val df1 = Seq((1, "2023-08-01"), (2, "2023-08-02")).toDF("a", "b")
            df1.write.format("paimon").mode("append").save(location)
            checkAnswer(
              spark.sql("SELECT * FROM T ORDER BY a, b"),
              Row(1, "2023-08-01") :: Row(2, "2023-08-02") :: Nil)

            // Case 1: two additional fields: DoubleType and TimestampType
            val ts = java.sql.Timestamp.valueOf("2023-08-01 10:00:00.0")
            val df2 = Seq((1, "2023-08-01", 12.3d, ts), (3, "2023-08-03", 34.5d, ts))
              .toDF("a", "b", "c", "d")
            df2.write
              .format("paimon")
              .mode("append")
              .option("write.merge-schema", "true")
              .save(location)
            val expected2 = if (hasPk) {
              Row(1, "2023-08-01", 12.3d, ts) ::
                Row(2, "2023-08-02", null, null) :: Row(3, "2023-08-03", 34.5d, ts) :: Nil
            } else {
              Row(1, "2023-08-01", null, null) :: Row(1, "2023-08-01", 12.3d, ts) :: Row(
                2,
                "2023-08-02",
                null,
                null) :: Row(3, "2023-08-03", 34.5d, ts) :: Nil
            }
            checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected2)

            // Case 2: a: Int -> Long, b: String -> Date, c: Long -> Int, d: Map -> String
            val date = java.sql.Date.valueOf("2023-07-31")
            val df3 = Seq((2L, date, 234, null), (4L, date, 456, "2023-08-01 11:00:00.0")).toDF(
              "a",
              "b",
              "c",
              "d")

            // throw UnsupportedOperationException if write.merge-schema.explicit-cast = false
            assertThrows[UnsupportedOperationException] {
              df3.write
                .format("paimon")
                .mode("append")
                .option("write.merge-schema", "true")
                .save(location)
            }
            // merge schema and write data when write.merge-schema.explicit-cast = true
            df3.write
              .format("paimon")
              .mode("append")
              .option("write.merge-schema", "true")
              .option("write.merge-schema.explicit-cast", "true")
              .save(location)
            // Existing rows are re-read through the casted schema: strings become
            // dates, doubles are truncated to ints, timestamps become strings.
            val expected3 = if (hasPk) {
              Row(1L, Date.valueOf("2023-08-01"), 12, ts.toString) :: Row(
                2L,
                date,
                234,
                null) :: Row(3L, Date.valueOf("2023-08-03"), 34, ts.toString) :: Row(
                4L,
                date,
                456,
                "2023-08-01 11:00:00.0") :: Nil
            } else {
              Row(1L, Date.valueOf("2023-08-01"), null, null) :: Row(
                1L,
                Date.valueOf("2023-08-01"),
                12,
                ts.toString) :: Row(2L, date, 234, null) :: Row(
                2L,
                Date.valueOf("2023-08-02"),
                null,
                null) :: Row(3L, Date.valueOf("2023-08-03"), 34, ts.toString) :: Row(
                4L,
                date,
                456,
                "2023-08-01 11:00:00.0") :: Nil
            }
            checkAnswer(
              spark.sql("SELECT a, b, c, substring(d, 0, 21) FROM T ORDER BY a, b"),
              expected3)

          }
      }
  }

  // DataFrameWriterV2 overwrite-by-filter: valid Equal/EqualNullSafe partition
  // filters replace only matching partitions; invalid filters must be rejected
  // with specific error messages.
  withPk.foreach {
    hasPk =>
      test(s"Support v2 write with overwrite, hasPk: $hasPk") {
        withTable("t") {
          val prop = if (hasPk) {
            "'primary-key'='c1'"
          } else {
            "'write-only'='true'"
          }
          spark.sql(s"""
                       |CREATE TABLE t (c1 INT, c2 STRING) PARTITIONED BY(p1 String, p2 string)
                       |TBLPROPERTIES ($prop)
                       |""".stripMargin)

          spark
            .range(3)
            .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
            .writeTo("t")
            .overwrite($"p1" === "a")
          checkAnswer(
            spark.sql("SELECT * FROM t ORDER BY c1"),
            Row(0, "0", "a", "0") :: Row(1, "1", "a", "1") :: Row(2, "2", "a", "2") :: Nil
          )

          spark
            .range(7, 10)
            .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
            .writeTo("t")
            .overwrite($"p1" === "a")
          checkAnswer(
            spark.sql("SELECT * FROM t ORDER BY c1"),
            Row(7, "7", "a", "7") :: Row(8, "8", "a", "8") :: Row(9, "9", "a", "9") :: Nil
          )

          spark
            .range(2)
            .selectExpr("id as c1", "id as c2", "'a' as p1", "9 as p2")
            .writeTo("t")
            .overwrite(($"p1" <=> "a").and($"p2" === "9"))
          checkAnswer(
            spark.sql("SELECT * FROM t ORDER BY c1"),
            Row(0, "0", "a", "9") :: Row(1, "1", "a", "9") :: Row(7, "7", "a", "7") ::
              Row(8, "8", "a", "8") :: Nil
          )

          // bad case
          val msg1 = intercept[Exception] {
            spark
              .range(2)
              .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
              .writeTo("t")
              .overwrite($"p1" =!= "a")
          }.getMessage
          assert(msg1.contains("Only support Overwrite filters with Equal and EqualNullSafe"))

          val msg2 = intercept[Exception] {
            spark
              .range(2)
              .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
              .writeTo("t")
              .overwrite($"p1" === $"c2")
          }.getMessage
          // Error wording differs across Spark versions.
          if (gteqSpark3_4) {
            assert(msg2.contains("Table does not support overwrite by expression"))
          } else {
            assert(msg2.contains("cannot translate expression to source filter"))
          }

          val msg3 = intercept[Exception] {
            spark
              .range(2)
              .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
              .writeTo("t")
              .overwrite($"c1" === ($"c2" + 1))
          }.getMessage
          if (gteqSpark4_0) {
            assert(msg3.contains("Table does not support overwrite by expression"))
          } else {
            assert(msg3.contains("cannot translate expression to source filter"))
          }

          val msg4 = intercept[Exception] {
            spark
              .range(2)
              .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
              .writeTo("t")
              .overwrite(($"p1" === "a").and($"p1" === "b"))
          }.getMessage
          assert(msg4.contains("Only support Overwrite with one filter for each partition column"))

          // Overwrite a partition which is not the specified
          val msg5 = intercept[Exception] {
            spark
              .range(2)
              .selectExpr("id as c1", "id as c2", "'a' as p1", "id as p2")
              .writeTo("t")
              .overwrite($"p1" === "b")
          }.getMessage
          assert(msg5.contains("does not belong to this partition"))
        }
      }
  }

  // Schema merge when incoming data is MISSING some table columns: absent columns
  // are filled with null for the new rows.
  test("Paimon Schema Evolution: some columns is absent in the coming data") {

    spark.sql(s"""
                 |CREATE TABLE T (a INT, b STRING)
                 |""".stripMargin)

    val paimonTable = loadTable("T")
    val location = paimonTable.location().toString

    val df1 = Seq((1, "2023-08-01"), (2, "2023-08-02")).toDF("a", "b")
    df1.write.format("paimon").mode("append").save(location)
    checkAnswer(
      spark.sql("SELECT * FROM T ORDER BY a, b"),
      Row(1, "2023-08-01") :: Row(2, "2023-08-02") :: Nil)

    // Case 1: two additional fields: DoubleType and TimestampType
    val ts = java.sql.Timestamp.valueOf("2023-08-01 10:00:00.0")
    val df2 = Seq((1, "2023-08-01", 12.3d, ts), (3, "2023-08-03", 34.5d, ts))
      .toDF("a", "b", "c", "d")
    df2.write
      .format("paimon")
      .mode("append")
      .option("write.merge-schema", "true")
      .save(location)

    // Case 2: colum b and d are absent in the coming data
    val df3 = Seq((4, 45.6d), (5, 56.7d))
      .toDF("a", "c")
    df3.write
      .format("paimon")
      .mode("append")
      .option("write.merge-schema", "true")
      .save(location)
    val expected3 =
      Row(1, "2023-08-01", null, null) :: Row(1, "2023-08-01", 12.3d, ts) :: Row(
        2,
        "2023-08-02",
        null,
        null) :: Row(3, "2023-08-03", 34.5d, ts) :: Row(4, null, 45.6d, null) :: Row(
        5,
        null,
        56.7d,
        null) :: Nil
    checkAnswer(spark.sql("SELECT * FROM T ORDER BY a, b"), expected3)
  }
}
diff --git a/pom.xml b/pom.xml
index 222d81df790a..eb5844e7b559 100644
--- a/pom.xml
+++ b/pom.xml
@@ -89,7 +89,7 @@ under the License.
         1.20.1
         2.12
         2.12.18
-        2.13.16
+        2.13.17
         ${scala212.version}
         ${scala212.version}
         1.1.10.8