Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ import org.apache.spark.sql.types.{BinaryType, DataType, DecimalType, StringType
import org.apache.auron.{protobuf => pb}

// fileSchema is read from the data files. partitionSchema carries supported metadata columns
// (for example _file) that are materialized as per-file constant values in the native scan.
// (for example _file and _spec_id) that are materialized as per-file constant values in
// the native scan.
final case class IcebergScanPlan(
fileTasks: Seq[FileScanTask],
fileFormat: FileFormat,
Expand All @@ -58,7 +59,8 @@ object IcebergScanSupport extends Logging {

val readSchema = scan.readSchema
val unsupportedMetadataColumns = collectUnsupportedMetadataColumns(readSchema)
// Native scan can project file-level metadata columns such as _file via partition values.
// Native scan can project file-level metadata columns such as _file and _spec_id
// via partition values.
// Metadata columns that require per-row materialization (for example _pos) still fallback.
if (unsupportedMetadataColumns.nonEmpty) {
return None
Expand Down Expand Up @@ -136,7 +138,8 @@ object IcebergScanSupport extends Logging {
}

// True when the Iceberg metadata column can be materialized by the native scan
// as a per-file constant (partition value); per-row columns such as _pos are not supported.
private def isSupportedMetadataColumn(field: org.apache.spark.sql.types.StructField): Boolean = {
  val fileConstantColumns =
    Set(MetadataColumns.FILE_PATH.name(), MetadataColumns.SPEC_ID.name())
  fileConstantColumns.contains(field.name)
}

private def inputPartitions(exec: BatchScanExec): Seq[InputPartition] = {
// Prefer DataSource V2 batch API; if not available, fallback to exec methods via reflection.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ case class NativeIcebergTableScanExec(basedScan: BatchScanExec, plan: IcebergSca

private lazy val partitions: Array[FilePartition] = buildFilePartitions()
private lazy val fileSizes: Map[String, Long] = buildFileSizes()
private lazy val fileSpecIds: Map[String, Int] = buildFileSpecIds()

private lazy val nativeFileSchema: pb.Schema = NativeConverters.convertSchema(fileSchema)
private lazy val nativePartitionSchema: pb.Schema =
Expand Down Expand Up @@ -125,6 +126,10 @@ case class NativeIcebergTableScanExec(basedScan: BatchScanExec, plan: IcebergSca
field.name match {
case name if name == MetadataColumns.FILE_PATH.name() =>
NativeConverters.convertExpr(Literal.create(filePath, StringType)).getLiteral
case name if name == MetadataColumns.SPEC_ID.name() =>
NativeConverters
.convertExpr(Literal.create(fileSpecIds(filePath), field.dataType))
.getLiteral
Comment on lines +129 to +132
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fileSpecIds(filePath) uses Map.apply, which will throw a NoSuchElementException with little context if the key isn’t present (e.g., due to path normalization differences between Iceberg DataFile.location() and Spark PartitionedFile.filePath.toString). Consider using getOrElse and throwing an IllegalStateException with a clear message, and/or normalizing the key the same way in both buildFileSpecIds and metadataPartitionValues to guarantee consistent lookups.

Copilot uses AI. Check for mistakes.
case name =>
throw new IllegalStateException(
s"unsupported Iceberg metadata column in native scan: $name")
Expand Down Expand Up @@ -221,6 +226,25 @@ case class NativeIcebergTableScanExec(basedScan: BatchScanExec, plan: IcebergSca
.toMap
}

// Build the file-path -> Iceberg partition spec id mapping used to project the
// _spec_id metadata column as a per-file constant. A single data file may be
// split across multiple scan tasks; every task must report the same spec id,
// otherwise the plan is internally inconsistent and we fail fast.
private def buildFileSpecIds(): Map[String, Int] = {
  fileTasks.foldLeft(Map.empty[String, Int]) { (acc, task) =>
    val filePath = task.file().location()
    val specId = task.file().specId()
    acc.get(filePath) match {
      case Some(existingSpecId) if existingSpecId != specId =>
        throw new IllegalStateException(
          s"Inconsistent Iceberg partition spec id for file $filePath: " +
            s"$existingSpecId != $specId")
      case _ =>
        // First sighting, or a repeat with the same spec id — (re)binding is a no-op.
        acc + (filePath -> specId)
    }
  }
}

private def buildFilePartitions(): Array[FilePartition] = {
// Convert Iceberg file tasks into Spark FilePartition groups for execution.
if (fileTasks.isEmpty) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,20 @@ class AuronIcebergIntegrationSuite
}
}

test("iceberg native scan supports _spec_id metadata column") {
  withTable("local.db.t4_spec_id") {
    // Single-row CTAS table; _spec_id must be served by the native scan operator.
    val createTableSql =
      "create table local.db.t4_spec_id using iceberg as select 1 as id, 'a' as v"
    sql(createTableSql)
    checkSparkAnswerAndOperator("select _spec_id from local.db.t4_spec_id")
  }
}

test("iceberg native scan supports data columns with _file and _spec_id metadata columns") {
  withTable("local.db.t4_metadata_mixed") {
    // Mix a data column with both supported metadata columns in one projection.
    val createTableSql =
      "create table local.db.t4_metadata_mixed using iceberg as select 1 as id, 'a' as v"
    sql(createTableSql)
    val projection = "select id, _file, _spec_id from local.db.t4_metadata_mixed"
    checkSparkAnswerAndOperator(projection)
  }
}

test("iceberg native scan supports data columns with _file metadata column") {
withTable("local.db.t4_mixed") {
sql("create table local.db.t4_mixed using iceberg as select 1 as id, 'a' as v")
Expand Down
Loading