From a30703b8fa1f4bfee2f5d2d0a6a41d43747410c2 Mon Sep 17 00:00:00 2001 From: guihuawen Date: Mon, 9 Feb 2026 22:28:54 +0800 Subject: [PATCH 1/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- dev/mvn-build-helper/assembly/pom.xml | 8 +- spark-extension-shims-spark/pom.xml | 43 +++ ...pache.spark.sql.auron.AuronConvertProvider | 18 ++ .../auron/plan/HiveConvertProvider.scala | 73 +++++ .../plan/NativeParquetHiveTableScanExec.scala | 272 ++++++++++++++++++ .../org/apache/auron/BaseAuronSQLSuite.scala | 8 + .../hive/execution/BaseAuronHiveSuite.scala | 85 ++++++ .../HiveParquetTableScanExecSuite.scala | 53 ++++ .../spark/sql/auron/AuronConverters.scala | 6 +- .../auron/AuronSparkSessionExtension.scala | 2 +- .../apache/spark/sql/auron/NativeHelper.scala | 2 +- .../sql/auron/util/TaskContextHelper.scala | 2 +- .../auron/plan/NativeHiveTableScanBase.scala | 3 +- .../uniffle/AuronUniffleShuffleReader.scala | 2 +- 14 files changed, 565 insertions(+), 12 deletions(-) create mode 100644 spark-extension-shims-spark/src/main/resources/META-INF/services/org.apache.spark.sql.auron.AuronConvertProvider create mode 100644 spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/HiveConvertProvider.scala create mode 100644 spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala create mode 100644 spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala create mode 100644 spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala diff --git a/dev/mvn-build-helper/assembly/pom.xml b/dev/mvn-build-helper/assembly/pom.xml index 0b8ab272e..6aeada7ee 100644 --- a/dev/mvn-build-helper/assembly/pom.xml +++ b/dev/mvn-build-helper/assembly/pom.xml @@ -120,10 +120,10 @@ org.apache.arrow.c.** - - io.netty - ${auron.shade.packageName}.io.netty - + + + + javax.annotation ${auron.shade.packageName}.javax.annotation diff --git a/spark-extension-shims-spark/pom.xml b/spark-extension-shims-spark/pom.xml index 4c75845bf..f3e0d7ce3 100644 --- a/spark-extension-shims-spark/pom.xml +++ b/spark-extension-shims-spark/pom.xml @@ -55,11 +55,27 @@ spark-core_${scalaVersion} provided + + org.apache.spark + spark-catalyst_${scalaVersion} + provided + + + org.apache.arrow + arrow-memory-netty + + + org.apache.spark spark-hive_${scalaVersion} provided + + org.apache.hadoop + hadoop-client-api + ${hadoopVersion} + org.apache.spark spark-sql_${scalaVersion} @@ -103,15 +119,42 @@ spark-core_${scalaVersion} test-jar + + org.apache.arrow + arrow-memory-core + ${arrowVersion} + org.apache.spark spark-catalyst_${scalaVersion} test-jar + + + org.apache.arrow + arrow-memory-netty + + + org.apache.spark spark-sql_${scalaVersion} test-jar + + org.apache.spark + spark-hive_${scalaVersion} + test-jar + + + + + + + + + + + diff --git a/spark-extension-shims-spark/src/main/resources/META-INF/services/org.apache.spark.sql.auron.AuronConvertProvider b/spark-extension-shims-spark/src/main/resources/META-INF/services/org.apache.spark.sql.auron.AuronConvertProvider new file mode 100644 index 000000000..8e5cc72ed --- /dev/null +++ b/spark-extension-shims-spark/src/main/resources/META-INF/services/org.apache.spark.sql.auron.AuronConvertProvider @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +org.apache.spark.sql.hive.execution.auron.plan.HiveConvertProvider diff --git a/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/HiveConvertProvider.scala b/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/HiveConvertProvider.scala new file mode 100644 index 000000000..99f821b93 --- /dev/null +++ b/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/HiveConvertProvider.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.hive.execution.auron.plan + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.auron.{AuronConverters, AuronConvertProvider} +import org.apache.spark.sql.auron.AuronConverters.getBooleanConf +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.hive.client.HiveClientImpl +import org.apache.spark.sql.hive.execution.HiveTableScanExec + +class HiveConvertProvider extends AuronConvertProvider with Logging { + override def isEnabled: Boolean = + getBooleanConf("spark.auron.enable.hiveTable", defaultValue = true) + + private def enableHiveTableScanExec: Boolean = + getBooleanConf("spark.auron.enable.parquetHiveTableScanExec", defaultValue = false) + + override def isSupported(exec: SparkPlan): Boolean = + exec match { + case e: HiveTableScanExec + if enableHiveTableScanExec && + e.relation.tableMeta.provider.isDefined && + e.relation.tableMeta.provider.get.equals("hive") => + true + case _ => false + } + + override def convert(exec: SparkPlan): SparkPlan = { + exec match { + case hiveExec: HiveTableScanExec + if enableHiveTableScanExec + && HiveTableUtil.isParquetTable(hiveExec) => + convertParquetHiveTableScanExec(hiveExec) + case _ => exec + } + } + + private def convertParquetHiveTableScanExec(hiveExec: HiveTableScanExec): SparkPlan = { + AuronConverters.addRenameColumnsExec(NativeParquetHiveTableScanExec(hiveExec)) + } +} + +object HiveTableUtil { + private val parquetFormat = "MapredParquetInputFormat" + + def isParquetTable(basedHiveScan: HiveTableScanExec): Boolean = { + if (HiveClientImpl + .toHiveTable(basedHiveScan.relation.tableMeta) + .getInputFormatClass + .getSimpleName + .equalsIgnoreCase(parquetFormat)) { + true + } else { + false + } + } + +} diff --git a/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala b/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala new file mode 100644 index 000000000..669c14749 --- /dev/null +++ b/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.hive.execution.auron.plan + +import java.util.UUID + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +import org.apache.hadoop.conf.Configurable +import org.apache.hadoop.hive.ql.exec.Utilities +import org.apache.hadoop.hive.ql.metadata.{Table => HiveTable} +import org.apache.hadoop.hive.ql.plan.TableDesc +import org.apache.hadoop.hive.serde.serdeConstants +import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspectorUtils, StructObjectInspector} +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils +import org.apache.hadoop.io.Writable +import org.apache.hadoop.mapred.{FileSplit, InputFormat, JobConf} +import org.apache.hadoop.mapreduce.{InputFormat => newInputClass} +import org.apache.hadoop.util.ReflectionUtils +import org.apache.spark.{Partition, TaskContext} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.HADOOP_RDD_IGNORE_EMPTY_SPLITS +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.auron.{NativeRDD, Shims} +import org.apache.spark.sql.catalyst.expressions.{AttributeMap, GenericInternalRow} +import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} +import org.apache.spark.sql.hive.{HadoopTableReader, HiveShim} +import org.apache.spark.sql.hive.client.HiveClientImpl +import org.apache.spark.sql.hive.execution.HiveTableScanExec + +import org.apache.auron.{protobuf => pb} +import org.apache.auron.metric.SparkMetricNode + +case class NativeParquetHiveTableScanExec(basedHiveScan: HiveTableScanExec) + extends NativeHiveTableScanBase(basedHiveScan) + with Logging { + + @transient private lazy val nativeTable: HiveTable = + HiveClientImpl.toHiveTable(relation.tableMeta) + @transient private lazy val fileFormat = + ParquetHiveTableUtil.getFileFormat(nativeTable.getInputFormatClass) + @transient private lazy val nativeTableDesc = new TableDesc( + nativeTable.getInputFormatClass, + nativeTable.getOutputFormatClass, + nativeTable.getMetadata) + + @transient private lazy val nativeHadoopConf = { + val hiveConf = SparkSession.getActiveSession.get.sessionState.newHadoopConf() + // append columns ids and names before broadcast + val columnOrdinals = AttributeMap(relation.dataCols.zipWithIndex) + val neededColumnIDs = output.flatMap(columnOrdinals.get).map(o => o: Integer) + val neededColumnNames = output.filter(columnOrdinals.contains).map(_.name) + + HiveShim.appendReadColumns(hiveConf, neededColumnIDs, neededColumnNames) + + val deserializer = nativeTableDesc.getDeserializerClass.getConstructor().newInstance() + deserializer.initialize(hiveConf, nativeTableDesc.getProperties) + + // Specifies types and object inspectors of columns to be scanned. + val structOI = ObjectInspectorUtils + .getStandardObjectInspector(deserializer.getObjectInspector, ObjectInspectorCopyOption.JAVA) + .asInstanceOf[StructObjectInspector] + + val columnTypeNames = structOI.getAllStructFieldRefs.asScala + .map(_.getFieldObjectInspector) + .map(TypeInfoUtils.getTypeInfoFromObjectInspector(_).getTypeName) + .mkString(",") + + hiveConf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypeNames) + hiveConf.set(serdeConstants.LIST_COLUMNS, relation.dataCols.map(_.name).mkString(",")) + hiveConf + } + + private val minPartitions = if (SparkSession.getActiveSession.get.sparkContext.isLocal) { + 0 // will splitted based on block by default. + } else { + math.max( + nativeHadoopConf.getInt("mapreduce.job.maps", 1), + SparkSession.getActiveSession.get.sparkContext.defaultMinPartitions) + } + + private val ignoreEmptySplits = + SparkSession.getActiveSession.get.sparkContext.conf.get(HADOOP_RDD_IGNORE_EMPTY_SPLITS) + + override val nodeName: String = + s"NativeHiveTableScan $tableName" + + override def doExecuteNative(): NativeRDD = { + val nativeMetrics = SparkMetricNode( + metrics, + Nil, + Some({ + case ("bytes_scanned", v) => + val inputMetric = TaskContext.get.taskMetrics().inputMetrics + inputMetric.incBytesRead(v) + case ("output_rows", v) => + val inputMetric = TaskContext.get.taskMetrics().inputMetrics + inputMetric.incRecordsRead(v) + case _ => + })) + val nativeFileSchema = this.nativeFileSchema + val nativeFileGroups = this.nativeFileGroups + val nativePartitionSchema = this.nativePartitionSchema + + val projection = schema.map(field => relation.schema.fieldIndex(field.name)) + val broadcastedHadoopConf = this.broadcastedHadoopConf + val numPartitions = partitions.length + + new NativeRDD( + sparkContext, + nativeMetrics, + partitions.asInstanceOf[Array[Partition]], + None, + Nil, + rddShuffleReadFull = true, + (partition, _) => { + val resourceId = s"NativeParquetHiveTableScan:${UUID.randomUUID().toString}" + putJniBridgeResource(resourceId, broadcastedHadoopConf) + + val nativeFileGroup = nativeFileGroups(partition.asInstanceOf[FilePartition]) + val nativeFileScanConf = pb.FileScanExecConf + .newBuilder() + .setNumPartitions(numPartitions) + .setPartitionIndex(partition.index) + .setStatistics(pb.Statistics.getDefaultInstance) + .setSchema(nativeFileSchema) + .setFileGroup(nativeFileGroup) + .addAllProjection(projection.map(Integer.valueOf).asJava) + .setPartitionSchema(nativePartitionSchema) + .build() + fileFormat match { + case "parquet" => + val nativeParquetScanExecBuilder = pb.ParquetScanExecNode + .newBuilder() + .setBaseConf(nativeFileScanConf) + .setFsResourceId(resourceId) + .addAllPruningPredicates(new java.util.ArrayList()) // not support this filter + + pb.PhysicalPlanNode + .newBuilder() + .setParquetScan(nativeParquetScanExecBuilder.build()) + .build() + case "other" => + throw new Exception("HiveTableExec only support parquet") + } + }, + friendlyName = "NativeRDD.ParquetHiveTableScan") + } + + override def getFilePartitions(): Array[FilePartition] = { + val newJobConf = new JobConf(nativeHadoopConf) + val arrayFilePartition = ArrayBuffer[FilePartition]() + val partitionedFiles = if (relation.isPartitioned) { + val partitions = basedHiveScan.prunedPartitions + val arrayPartitionedFile = ArrayBuffer[PartitionedFile]() + partitions.foreach { partition => + val partDesc = Utilities.getPartitionDescFromTableDesc(nativeTableDesc, partition, true) + val partPath = partition.getDataLocation + HadoopTableReader.initializeLocalJobConfFunc(partPath.toString, nativeTableDesc)( + newJobConf) + val partitionValues = partition.getTPartition.getValues + + val partitionInternalRow = new GenericInternalRow(partitionValues.size()) + for (partitionIndex <- 0 until partitionValues.size) { + partitionInternalRow.update(partitionIndex, partitionValues.get(partitionIndex)) + } + + val inputFormatClass = partDesc.getInputFileFormatClass + .asInstanceOf[Class[newInputClass[Writable, Writable]]] + newJobConf.set("mapred.input.dir", partPath.toString) + arrayPartitionedFile ++= getArrayPartitionedFile( + newJobConf, + inputFormatClass, + partitionInternalRow) + } + arrayPartitionedFile + .sortBy(_.length)(implicitly[Ordering[Long]].reverse) + .toArray + } else { + newJobConf.set("mapred.input.dir", nativeTableDesc.getProperties().getProperty("location")) + val inputFormatClass = + nativeTable.getInputFormatClass.asInstanceOf[Class[newInputClass[Writable, Writable]]] + getArrayPartitionedFile(newJobConf, inputFormatClass, new GenericInternalRow(0)) + .sortBy(_.length)(implicitly[Ordering[Long]].reverse) + .toArray + } + arrayFilePartition ++= FilePartition.getFilePartitions( + SparkSession.getActiveSession.get, + partitionedFiles, + getMaxSplitBytes(SparkSession.getActiveSession.get)) + + arrayFilePartition.toArray + } + + private def getMaxSplitBytes(sparkSession: SparkSession): Long = { + val defaultMaxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes + val openCostInBytes = sparkSession.sessionState.conf.filesOpenCostInBytes + Math.min(defaultMaxSplitBytes, openCostInBytes) + } + + private def getArrayPartitionedFile( + newJobConf: JobConf, + inputFormatClass: Class[newInputClass[Writable, Writable]], + partitionInternalRow: GenericInternalRow): ArrayBuffer[PartitionedFile] = { + val allInputSplits = + getInputFormat(newJobConf, inputFormatClass).getSplits(newJobConf, minPartitions) + val inputSplits = if (ignoreEmptySplits) { + allInputSplits.filter(_.getLength > 0) + } else { + allInputSplits + } + + val arrayFilePartition = ArrayBuffer[PartitionedFile]() + for (i <- 0 until inputSplits.size) { + val inputSplit = inputSplits(i) + if (inputSplit.isInstanceOf[FileSplit]) { + val orcInputSplit = inputSplit.asInstanceOf[FileSplit] + arrayFilePartition += + Shims.get.getPartitionedFile( + partitionInternalRow, + orcInputSplit.getPath.toString, + orcInputSplit.getStart, + orcInputSplit.getLength) + } + } + arrayFilePartition + } + + private def getInputFormat( + conf: JobConf, + inputFormatClass: Class[newInputClass[Writable, Writable]]) + : InputFormat[Writable, Writable] = { + val newInputFormat = ReflectionUtils + .newInstance(inputFormatClass.asInstanceOf[Class[_]], conf) + .asInstanceOf[InputFormat[Writable, Writable]] + newInputFormat match { + case c: Configurable => c.setConf(conf) + case _ => + } + newInputFormat + } + +} + +object ParquetHiveTableUtil { + private val parquetFormat = "MapredParquetInputFormat" + + def getFileFormat(inputFormatClass: Class[_ <: InputFormat[_, _]]): String = { + if (inputFormatClass.getSimpleName.equalsIgnoreCase(parquetFormat)) { + "parquet" + } else { + "other" + } + } + +} diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/auron/BaseAuronSQLSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/auron/BaseAuronSQLSuite.scala index cd3ce9759..0129739db 100644 --- a/spark-extension-shims-spark/src/test/scala/org/apache/auron/BaseAuronSQLSuite.scala +++ b/spark-extension-shims-spark/src/test/scala/org/apache/auron/BaseAuronSQLSuite.scala @@ -20,6 +20,8 @@ import java.io.File import org.apache.commons.io.FileUtils import org.apache.spark.SparkConf +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.auron.util.SparkVersionUtil @@ -61,6 +63,12 @@ trait BaseAuronSQLSuite extends SharedSparkSession { .set("spark.ui.enabled", "false") .set("spark.sql.warehouse.dir", warehouseDir) .set("spark.auron.udf.singleChildFallback.enabled", "false") + .set(SQLConf.CODEGEN_FALLBACK.key, "false") + .set(SQLConf.CODEGEN_FACTORY_MODE.key, CodegenObjectFactoryMode.CODEGEN_ONLY.toString) + .set( + "spark.sql.hive.metastore.barrierPrefixes", + "org.apache.spark.sql.hive.execution.PairSerDe") + .set("spark.hadoop.hive.metastore.disallow.incompatible.col.type.changes", "false") if (SparkVersionUtil.isSparkV40OrGreater) { // Spark 4.0+: Disable session artifact isolation, align with Spark 3.x behavior diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala new file mode 100644 index 000000000..815b12629 --- /dev/null +++ b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.hive.execution + +import org.apache.auron.sparkver + +import java.io.File +import org.apache.commons.io.FileUtils +import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.sql.hive.test.TestHiveContext +import org.scalatest.BeforeAndAfterAll + +trait BaseAuronHiveSuite extends SparkFunSuite with BeforeAndAfterAll { + + + lazy val spark = getAuronSparkSession() + + protected val suiteWorkspace: String = classOf[BaseAuronHiveSuite]. + getResource("/").getPath + "auron-tests-workdir" + protected val warehouseDir: String = suiteWorkspace + "/spark-warehouse" + protected val metastoreDir: String = suiteWorkspace + "/meta" + + protected def resetSuiteWorkspace(): Unit = { + val workdir = new File(suiteWorkspace) + if (workdir.exists()) { + FileUtils.forceDelete(workdir) + } + FileUtils.forceMkdir(workdir) + FileUtils.forceMkdir(new File(warehouseDir)) + FileUtils.forceMkdir(new File(metastoreDir)) + } + + @sparkver("3.0 / 3.1 / 3.2 / 3.3/ 3.4/ 3.5") + def getAuronSparkSession(): org.apache.spark.sql.SparkSession = { + TestAuronHive.sparkSession + } + + @sparkver("4.0 / 4.1") + def getAuronSparkSession(): org.apache.spark.sql.classic.SparkSession = { + TestAuronHive.sparkSession + } + + override def beforeAll(): Unit = { + // Prepare a clean workspace before SparkSession initialization + resetSuiteWorkspace() + super.beforeAll() + spark.sparkContext.setLogLevel("WARN") + } + +} + +object TestAuronHive + extends TestHiveContext( + new SparkContext( + System.getProperty("spark.sql.test.master", "local[1]"), + "TestSQLContext", + new SparkConf() + .set("spark.sql.test", "") + .set("spark.sql.extensions", "org.apache.spark.sql.auron.AuronSparkSessionExtension") + .set( + "spark.shuffle.manager", + "org.apache.spark.sql.execution.auron.shuffle.AuronShuffleManager") + .set("spark.memory.offHeap.enabled", "false") + .set("spark.auron.enable", "true") + .set("spark.ui.enabled", "false") + .set( + "spark.sql.warehouse.dir", + classOf[BaseAuronHiveSuite].getResource("/").getPath + "auron-tests-workdir/spark-warehouse") + .set("spark.auron.udf.singleChildFallback.enabled", "false") + .set("spark.auron.enable.parquetHiveTableScanExec", "true") + .set("spark.sql.hive.convertMetastoreParquet", "false"))) {} diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala new file mode 100644 index 000000000..4c8d48f7c --- /dev/null +++ b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.hive.execution + +import org.apache.spark.sql.AuronQueryTest +import org.apache.spark.sql.hive.execution.auron.plan.NativeParquetHiveTableScanExec + +class HiveParquetTableScanExecSuite extends AuronQueryTest with BaseAuronHiveSuite { + + test("test hive parquet table without partition to native") { + withTempView("hive_table_without_partition") { + spark.sql("create table hive_table_without_partition (a string) stored as parquet") + spark.sql("insert into hive_table_without_partition values(1)") + val df = spark.sql("select * from hive_table_without_partition") + assert(df.collect().toList.head.get(0) == "1") + val plan = df.queryExecution.executedPlan + assert(collect(plan) { case e: NativeParquetHiveTableScanExec => + e + }.size == 1) + } + } + + test("test hive parquet table partition to native") { + withTempView("hive_table_with_partition") { + spark.sql("create table hive_table_with_partition (a string) stored as parquet partitioned by(pt string)") + spark.sql("insert into hive_table_with_partition partition(pt='2026-03-10') values('1')") + spark.sql("insert into hive_table_with_partition partition(pt='2026-03-11') values('1')") + val df = spark.sql("select * from hive_table_with_partition where pt = '2026-03-10'") + df.show() + assert(df.collect().toList.head.get(0) == "1") + assert(df.collect().toList.head.get(1) == "2026-03-10") + val plan = df.queryExecution.executedPlan + assert(collect(plan) { case e: NativeParquetHiveTableScanExec => + e + }.size == 1) + } + } + +} diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala index d01c82710..824dd6759 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala @@ -424,7 +424,7 @@ object AuronConverters extends Logging { assert( !exec.requiredSchema.exists(e => existTimestampType(e.dataType)), s"Parquet scan with timestamp type is not supported for table: ${tableIdentifier - .getOrElse("unknown")}. " + + .getOrElse("unknown")}. " + "Set spark.auron.enable.scan.parquet.timestamp=true to enable timestamp support " + "or remove timestamp columns from the query.") } @@ -435,7 +435,7 @@ object AuronConverters extends Logging { assert( !exec.requiredSchema.exists(e => existTimestampType(e.dataType)), s"ORC scan with timestamp type is not supported for tableIdentifier: ${tableIdentifier - .getOrElse("unknown")}. " + + .getOrElse("unknown")}. " + "Set spark.auron.enable.scan.orc.timestamp=true to enable timestamp support " + "or remove timestamp columns from the query.") } @@ -443,7 +443,7 @@ object AuronConverters extends Logging { case p => throw new NotImplementedError( s"Cannot convert FileSourceScanExec tableIdentifier: ${tableIdentifier.getOrElse( - "unknown")}, class: ${p.getClass.getName}") + "unknown")}, class: ${p.getClass.getName}") } } diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala index b68b04954..47492aa3d 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala @@ -91,7 +91,7 @@ case class AuronColumnarOverrides(sparkSession: SparkSession) extends ColumnarRu dumpSimpleSparkPlanTreeNode(sparkPlanTransformed) logInfo(s"Transformed spark plan after preColumnarTransitions:\n${sparkPlanTransformed - .treeString(verbose = true, addSuffix = true)}") + .treeString(verbose = true, addSuffix = true)}") // post-transform Shims.get.postTransform(sparkPlanTransformed, sparkSession.sparkContext) diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala index e16656471..7a1e34724 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala @@ -74,7 +74,7 @@ object NativeHelper extends Logging { val heapMemory = Runtime.getRuntime.maxMemory() val offheapMemory = totalMemory - heapMemory logWarning(s"memory total: ${Utils.bytesToString(totalMemory)}, onheap: ${Utils.bytesToString( - heapMemory)}, offheap: ${Utils.bytesToString(offheapMemory)}") + heapMemory)}, offheap: ${Utils.bytesToString(offheapMemory)}") offheapMemory } diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala index 8e5d7353f..eecec5ef4 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala @@ -47,7 +47,7 @@ object TaskContextHelper extends Logging { val thread = Thread.currentThread() val threadName = if (context != null) { s"auron native task ${context.partitionId()}.${context.attemptNumber()} in stage ${context - .stageId()}.${context.stageAttemptNumber()} (TID ${context.taskAttemptId()})" + .stageId()}.${context.stageAttemptNumber()} (TID ${context.taskAttemptId()})" } else { "auron native task " + thread.getName } diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeHiveTableScanBase.scala b/spark-extension/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeHiveTableScanBase.scala index 6dfc8be79..951c3b7fb 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeHiveTableScanBase.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeHiveTableScanBase.scala @@ -85,7 +85,8 @@ abstract class NativeHiveTableScanBase(basedHiveScan: HiveTableScanExec) val nativePartitionedFile = (file: PartitionedFile) => { val nativePartitionValues = partitionSchema.zipWithIndex.map { case (field, index) => NativeConverters - .convertExpr(Literal(file.partitionValues.get(index, field.dataType), field.dataType)) + .convertExpr( + Literal.create(file.partitionValues.get(index, field.dataType), field.dataType)) .getLiteral } pb.PartitionedFile diff --git a/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala b/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala index 94e139ac4..48938a706 100644 --- a/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala +++ b/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala @@ -172,7 +172,7 @@ class AuronUniffleShuffleReader[K, C]( } if (!emptyPartitionIds.isEmpty) { logDebug(s"Found ${emptyPartitionIds - .size()} empty shuffle partitions: ${emptyPartitionIds.asScala.mkString(",")}") + .size()} empty shuffle partitions: ${emptyPartitionIds.asScala.mkString(",")}") } iterators = shuffleDataIterList.iterator() if (iterators.hasNext) { From 47b4e2282eb6fb509061e22be14608f3189425ce Mon Sep 17 00:00:00 2001 From: guihuawen Date: Thu, 30 Apr 2026 14:24:11 +0800 Subject: [PATCH 2/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- dev/mvn-build-helper/assembly/pom.xml | 8 ++++---- .../plan/NativeParquetHiveTableScanExec.scala | 14 ++++++++++++-- .../sql/hive/execution/BaseAuronHiveSuite.scala | 16 +++++++++------- .../HiveParquetTableScanExecSuite.scala | 3 ++- .../apache/spark/sql/auron/NativeHelper.scala | 2 +- .../spark/sql/auron/util/TaskContextHelper.scala | 2 +- 6 files changed, 29 insertions(+), 16 deletions(-) diff --git a/dev/mvn-build-helper/assembly/pom.xml b/dev/mvn-build-helper/assembly/pom.xml index 6aeada7ee..2d6279f36 100644 --- a/dev/mvn-build-helper/assembly/pom.xml +++ b/dev/mvn-build-helper/assembly/pom.xml @@ -120,10 +120,10 @@ org.apache.arrow.c.** - - - - + + + + javax.annotation ${auron.shade.packageName}.javax.annotation diff --git a/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala b/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala index 669c14749..f8d6a04dc 100644 --- a/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala +++ b/spark-extension-shims-spark/src/main/scala/org/apache/spark/sql/hive/execution/auron/plan/NativeParquetHiveTableScanExec.scala @@ -44,7 +44,7 @@ import org.apache.spark.sql.hive.{HadoopTableReader, HiveShim} import org.apache.spark.sql.hive.client.HiveClientImpl import org.apache.spark.sql.hive.execution.HiveTableScanExec -import org.apache.auron.{protobuf => pb} +import org.apache.auron.{protobuf => pb, sparkver} import org.apache.auron.metric.SparkMetricNode case class NativeParquetHiveTableScanExec(basedHiveScan: HiveTableScanExec) @@ -163,11 +163,21 @@ case class NativeParquetHiveTableScanExec(basedHiveScan: HiveTableScanExec) friendlyName = "NativeRDD.ParquetHiveTableScan") } + @sparkver("3.1 / 3.2 / 3.3 / 3.4 / 3.5 / 4.0 / 4.1") + private def getPrunedPartitions(): Seq[org.apache.hadoop.hive.ql.metadata.Partition] = { + basedHiveScan.prunedPartitions + } + + @sparkver("3.0") + private def getPrunedPartitions(): Seq[org.apache.hadoop.hive.ql.metadata.Partition] = { + basedHiveScan.rawPartitions + } + override def getFilePartitions(): Array[FilePartition] = { val newJobConf = new JobConf(nativeHadoopConf) val arrayFilePartition = ArrayBuffer[FilePartition]() val partitionedFiles = if (relation.isPartitioned) { - val partitions = basedHiveScan.prunedPartitions + val partitions = getPrunedPartitions val arrayPartitionedFile = ArrayBuffer[PartitionedFile]() partitions.foreach { partition => val partDesc = Utilities.getPartitionDescFromTableDesc(nativeTableDesc, partition, true) diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala index 815b12629..5fda90f5f 100644 --- a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala +++ b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/BaseAuronHiveSuite.scala @@ -16,21 +16,21 @@ */ package org.apache.spark.sql.hive.execution -import org.apache.auron.sparkver - import java.io.File + import org.apache.commons.io.FileUtils import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.sql.hive.test.TestHiveContext import org.scalatest.BeforeAndAfterAll -trait BaseAuronHiveSuite extends SparkFunSuite with BeforeAndAfterAll { +import org.apache.auron.sparkver +trait BaseAuronHiveSuite extends SparkFunSuite with BeforeAndAfterAll { lazy val spark = getAuronSparkSession() - protected val suiteWorkspace: String = classOf[BaseAuronHiveSuite]. - getResource("/").getPath + "auron-tests-workdir" + protected val suiteWorkspace: String = + classOf[BaseAuronHiveSuite].getResource("/").getPath + "auron-tests-workdir" protected val warehouseDir: String = suiteWorkspace + "/spark-warehouse" protected val metastoreDir: String = suiteWorkspace + "/meta" @@ -45,7 +45,7 @@ trait BaseAuronHiveSuite extends SparkFunSuite with BeforeAndAfterAll { } @sparkver("3.0 / 3.1 / 3.2 / 3.3/ 3.4/ 3.5") - def getAuronSparkSession(): org.apache.spark.sql.SparkSession = { + def getAuronSparkSession(): org.apache.spark.sql.SparkSession = { TestAuronHive.sparkSession } @@ -79,7 +79,9 @@ object TestAuronHive .set("spark.ui.enabled", "false") .set( "spark.sql.warehouse.dir", - classOf[BaseAuronHiveSuite].getResource("/").getPath + "auron-tests-workdir/spark-warehouse") + classOf[BaseAuronHiveSuite] + .getResource("/") + .getPath + "auron-tests-workdir/spark-warehouse") .set("spark.auron.udf.singleChildFallback.enabled", "false") .set("spark.auron.enable.parquetHiveTableScanExec", "true") .set("spark.sql.hive.convertMetastoreParquet", "false"))) {} diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala index 4c8d48f7c..d79a125b3 100644 --- a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala +++ b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala @@ -36,7 +36,8 @@ class HiveParquetTableScanExecSuite extends AuronQueryTest with BaseAuronHiveSui test("test hive parquet table partition to native") { withTempView("hive_table_with_partition") { - spark.sql("create table hive_table_with_partition (a string) stored as parquet partitioned by(pt string)") + spark.sql( + "create table hive_table_with_partition (a string) stored as parquet partitioned by(pt string)") spark.sql("insert into hive_table_with_partition partition(pt='2026-03-10') values('1')") spark.sql("insert into hive_table_with_partition partition(pt='2026-03-11') values('1')") val df = spark.sql("select * from hive_table_with_partition where pt = '2026-03-10'") diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala index 7a1e34724..e16656471 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/NativeHelper.scala @@ -74,7 +74,7 @@ object NativeHelper extends Logging { val heapMemory = Runtime.getRuntime.maxMemory() val offheapMemory = totalMemory - heapMemory logWarning(s"memory total: ${Utils.bytesToString(totalMemory)}, onheap: ${Utils.bytesToString( - heapMemory)}, offheap: ${Utils.bytesToString(offheapMemory)}") + heapMemory)}, offheap: ${Utils.bytesToString(offheapMemory)}") offheapMemory } diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala index eecec5ef4..8e5d7353f 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/util/TaskContextHelper.scala @@ -47,7 +47,7 @@ object TaskContextHelper extends Logging { val thread = Thread.currentThread() val threadName = if (context != null) { s"auron native task ${context.partitionId()}.${context.attemptNumber()} in stage ${context - .stageId()}.${context.stageAttemptNumber()} (TID ${context.taskAttemptId()})" + .stageId()}.${context.stageAttemptNumber()} (TID ${context.taskAttemptId()})" } else { "auron native task " + thread.getName } From 7c666483f6594d020608ef956c2f5d16bfae30f6 Mon Sep 17 00:00:00 2001 From: guihuawen Date: Thu, 30 Apr 2026 15:46:49 +0800 Subject: [PATCH 3/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- .../scala/org/apache/spark/sql/auron/AuronConverters.scala | 6 +++--- .../apache/spark/sql/auron/AuronSparkSessionExtension.scala | 2 +- .../auron/shuffle/uniffle/AuronUniffleShuffleReader.scala | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala index 824dd6759..d01c82710 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronConverters.scala @@ -424,7 +424,7 @@ object AuronConverters extends Logging { assert( !exec.requiredSchema.exists(e => existTimestampType(e.dataType)), s"Parquet scan with timestamp type is not supported for table: ${tableIdentifier - .getOrElse("unknown")}. " + + .getOrElse("unknown")}. " + "Set spark.auron.enable.scan.parquet.timestamp=true to enable timestamp support " + "or remove timestamp columns from the query.") } @@ -435,7 +435,7 @@ object AuronConverters extends Logging { assert( !exec.requiredSchema.exists(e => existTimestampType(e.dataType)), s"ORC scan with timestamp type is not supported for tableIdentifier: ${tableIdentifier - .getOrElse("unknown")}. " + + .getOrElse("unknown")}. " + "Set spark.auron.enable.scan.orc.timestamp=true to enable timestamp support " + "or remove timestamp columns from the query.") } @@ -443,7 +443,7 @@ object AuronConverters extends Logging { case p => throw new NotImplementedError( s"Cannot convert FileSourceScanExec tableIdentifier: ${tableIdentifier.getOrElse( - "unknown")}, class: ${p.getClass.getName}") + "unknown")}, class: ${p.getClass.getName}") } } diff --git a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala index 47492aa3d..b68b04954 100644 --- a/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala +++ b/spark-extension/src/main/scala/org/apache/spark/sql/auron/AuronSparkSessionExtension.scala @@ -91,7 +91,7 @@ case class AuronColumnarOverrides(sparkSession: SparkSession) extends ColumnarRu dumpSimpleSparkPlanTreeNode(sparkPlanTransformed) logInfo(s"Transformed spark plan after preColumnarTransitions:\n${sparkPlanTransformed - .treeString(verbose = true, addSuffix = true)}") + .treeString(verbose = true, addSuffix = true)}") // post-transform Shims.get.postTransform(sparkPlanTransformed, sparkSession.sparkContext) diff --git a/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala b/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala index 48938a706..94e139ac4 100644 --- a/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala +++ b/thirdparty/auron-uniffle/src/main/scala/org/apache/spark/sql/execution/auron/shuffle/uniffle/AuronUniffleShuffleReader.scala @@ -172,7 +172,7 @@ class AuronUniffleShuffleReader[K, C]( } if (!emptyPartitionIds.isEmpty) { logDebug(s"Found ${emptyPartitionIds - .size()} empty shuffle partitions: ${emptyPartitionIds.asScala.mkString(",")}") + .size()} empty shuffle partitions: ${emptyPartitionIds.asScala.mkString(",")}") } iterators = shuffleDataIterList.iterator() if (iterators.hasNext) { From 225522e254d28c775389defac7c6e89c77af1c78 Mon Sep 17 00:00:00 2001 From: guihuawen Date: Wed, 6 May 2026 21:32:28 +0800 Subject: [PATCH 4/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- .../HiveParquetTableScanExecSuite.scala | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala index d79a125b3..632f473f5 100644 --- a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala +++ b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala @@ -22,10 +22,10 @@ import org.apache.spark.sql.hive.execution.auron.plan.NativeParquetHiveTableScan class HiveParquetTableScanExecSuite extends AuronQueryTest with BaseAuronHiveSuite { test("test hive parquet table without partition to native") { - withTempView("hive_table_without_partition") { - spark.sql("create table hive_table_without_partition (a string) stored as parquet") - spark.sql("insert into hive_table_without_partition values(1)") - val df = spark.sql("select * from hive_table_without_partition") + withTempView("hive_without_partition") { + spark.sql("create table hive_without_partition (a string) stored as parquet") + spark.sql("insert into hive_without_partition values(1)") + val df = spark.sql("select * from hive_without_partition") assert(df.collect().toList.head.get(0) == "1") val plan = df.queryExecution.executedPlan assert(collect(plan) { case e: NativeParquetHiveTableScanExec => @@ -35,12 +35,12 @@ class HiveParquetTableScanExecSuite extends AuronQueryTest with BaseAuronHiveSui } test("test hive parquet table partition to native") { - withTempView("hive_table_with_partition") { + withTempView("hive_with_partition") { spark.sql( - "create table hive_table_with_partition (a string) stored as parquet partitioned by(pt string)") - spark.sql("insert into hive_table_with_partition partition(pt='2026-03-10') values('1')") - spark.sql("insert into hive_table_with_partition partition(pt='2026-03-11') values('1')") - val df = spark.sql("select * from hive_table_with_partition where pt = '2026-03-10'") + "create table hive_with_partition (a string) stored as parquet partitioned by(pt string)") + spark.sql("insert into hive_with_partition partition(pt='2026-03-10') values('1')") + spark.sql("insert into hive_with_partition partition(pt='2026-03-11') values('1')") + val df = spark.sql("select * from hive_with_partition where pt = '2026-03-10'") df.show() assert(df.collect().toList.head.get(0) == "1") assert(df.collect().toList.head.get(1) == "2026-03-10") From 31eb74416c5b39c3ec3d45aa5ab2ffd579e92899 Mon Sep 17 00:00:00 2001 From: guihuawen Date: Thu, 7 May 2026 00:45:32 +0800 Subject: [PATCH 5/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- .../sql/hive/execution/HiveParquetTableScanExecSuite.scala | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala index 632f473f5..740b2c150 100644 --- a/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala +++ b/spark-extension-shims-spark/src/test/scala/org/apache/spark/sql/hive/execution/HiveParquetTableScanExecSuite.scala @@ -39,11 +39,9 @@ class HiveParquetTableScanExecSuite extends AuronQueryTest with BaseAuronHiveSui spark.sql( "create table hive_with_partition (a string) stored as parquet partitioned by(pt string)") spark.sql("insert into hive_with_partition partition(pt='2026-03-10') values('1')") - spark.sql("insert into hive_with_partition partition(pt='2026-03-11') values('1')") - val df = spark.sql("select * from hive_with_partition where pt = '2026-03-10'") - df.show() + spark.sql("insert into hive_with_partition partition(pt='2026-03-11') values('2')") + val df = spark.sql("select a from hive_with_partition where pt = '2026-03-10'") assert(df.collect().toList.head.get(0) == "1") - assert(df.collect().toList.head.get(1) == "2026-03-10") val plan = df.queryExecution.executedPlan assert(collect(plan) { case e: NativeParquetHiveTableScanExec => e From 9756d28826406f103bbb1e43c30bb154e876ac5c Mon Sep 17 00:00:00 2001 From: guihuawen Date: Thu, 7 May 2026 03:09:01 +0800 Subject: [PATCH 6/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- dev/mvn-build-helper/assembly/pom.xml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dev/mvn-build-helper/assembly/pom.xml b/dev/mvn-build-helper/assembly/pom.xml index 2d6279f36..3026bbb6a 100644 --- a/dev/mvn-build-helper/assembly/pom.xml +++ b/dev/mvn-build-helper/assembly/pom.xml @@ -120,10 +120,6 @@ org.apache.arrow.c.** - - - - javax.annotation ${auron.shade.packageName}.javax.annotation From efc4399b4c254eafd9a4f9e5e044aca2fa77d85b Mon Sep 17 00:00:00 2001 From: guihuawen Date: Fri, 8 May 2026 00:06:03 +0800 Subject: [PATCH 7/7] [AURON #2080] Support Hive Parquet table to NativeParquetHiveTableScanExec --- dev/mvn-build-helper/assembly/pom.xml | 4 ++++ spark-extension-shims-spark/pom.xml | 10 ---------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/dev/mvn-build-helper/assembly/pom.xml b/dev/mvn-build-helper/assembly/pom.xml index 3026bbb6a..0b8ab272e 100644 --- a/dev/mvn-build-helper/assembly/pom.xml +++ b/dev/mvn-build-helper/assembly/pom.xml @@ -120,6 +120,10 @@ org.apache.arrow.c.** + + io.netty + ${auron.shade.packageName}.io.netty + javax.annotation ${auron.shade.packageName}.javax.annotation diff --git a/spark-extension-shims-spark/pom.xml b/spark-extension-shims-spark/pom.xml index f3e0d7ce3..801b4f99a 100644 --- a/spark-extension-shims-spark/pom.xml +++ b/spark-extension-shims-spark/pom.xml @@ -145,16 +145,6 @@ org.apache.spark spark-hive_${scalaVersion} test-jar - - - - - - - - - -